From 64d5779bb1ef1d16e70601ffc64553b43b87f32f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 09:55:14 +0900 Subject: start to test by xed --- test/target/misc.txt | 657 +++++++++++++++++++++++++++++++++++++++++++++++++++ test/test_by_xed.cpp | 3 +- test/test_by_xed.py | 184 +++++++++++---- test/test_by_xed.sh | 4 +- 4 files changed, 801 insertions(+), 47 deletions(-) create mode 100644 test/target/misc.txt diff --git a/test/target/misc.txt b/test/target/misc.txt new file mode 100644 index 0000000..9e4f097 --- /dev/null +++ b/test/target/misc.txt @@ -0,0 +1,657 @@ +v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); +v4fmaddss(xmm15, xmm8, ptr [rax + 64]); +v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); +v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); +vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); +vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); +vaesdec(xmm20, xmm30, ptr [rcx + 64]); +vaesdec(ymm1, ymm2, ptr [rcx + 64]); +vaesdec(zmm1, zmm2, ptr [rcx + 64]); +vaesdeclast(xmm20, xmm30, ptr [rax + 64]); +vaesdeclast(ymm20, ymm30, ptr [rax + 64]); +vaesdeclast(zmm20, zmm30, ptr [rax + 64]); +vaesenc(xmm20, xmm30, ptr [rcx + 64]); +vaesenc(ymm1, ymm2, ptr [rcx + 64]); +vaesenc(zmm1, zmm2, ptr [rcx + 64]); +vaesenclast(xmm20, xmm30, ptr [rax + 64]); +vaesenclast(ymm20, ymm30, ptr [rax + 64]); +vaesenclast(zmm20, zmm30, ptr [rax + 64]); +vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); +vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); +vpcompressb(ptr[rax + 64], xmm1); +vpcompressb(xmm30 | k5, xmm1); +vpcompressb(ptr[rax + 64], ymm1); +vpcompressb(ymm30 | k3 |T_z, ymm1); +vpcompressb(ptr[rax + 64], zmm1); +vpcompressb(zmm30 | k2 |T_z, zmm1); +vpcompressw(ptr[rax + 64], xmm1); +vpcompressw(xmm30 | k5, xmm1); +vpcompressw(ptr[rax + 64], ymm1); +vpcompressw(ymm30 | k3 |T_z, ymm1); +vpcompressw(ptr[rax + 64], zmm1); +vpcompressw(zmm30 | k2 |T_z, zmm1); +vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 
+vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpexpandb(xmm5|k3|T_z, xmm30); +vpexpandb(ymm5|k3|T_z, ymm30); +vpexpandb(zmm5|k3|T_z, zmm30); +vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(xmm5|k3|T_z, xmm30); +vpexpandw(ymm5|k3|T_z, ymm30); +vpexpandw(zmm5|k3|T_z, zmm30); +vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); +gf2p8affineinvqb(xmm1, xmm2, 3); +gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); 
+vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8affineqb(xmm1, xmm2, 3); +gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8mulb(xmm1, xmm2); +gf2p8mulb(xmm1, ptr [rax + 0x40]); +vgf2p8mulb(xmm1, xmm5, xmm2); +vgf2p8mulb(ymm1, ymm5, ymm2); +vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(xmm30, xmm31, xmm4); +vgf2p8mulb(ymm30, ymm31, ymm4); +vgf2p8mulb(zmm30, zmm31, zmm4); +vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); +vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); +vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); +vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); +vcvtneps2bf16(xmm0, xword [rax + 64]); +vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); +vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); +vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); +vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); +ldtilecfg(ptr[rax + rcx * 4 + 64]); +sttilecfg(ptr[rsp + rax * 8 + 128]); +tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); +tileloaddt1(tmm4, ptr[r8 + r9 + 32]); +tilerelease(); +tilestored(ptr[r10 + r11 * 2 + 32], tmm2); +tilezero(tmm7); +tdpbssd(tmm1, tmm2, tmm3); +tdpbsud(tmm2, tmm3, tmm4); +tdpbusd(tmm3, tmm4, tmm5); +tdpbuud(tmm4, tmm5, tmm6); +tdpbf16ps(tmm5, tmm6, tmm7); +tileloadd(tmm1, ptr[r8+r8]); +tileloadd(tmm1, ptr[rax+rcx*4]); +tileloadd(tmm1, ptr[r8+r9*1+0x40]); +vaddph(zmm0, zmm1, ptr[rax+64]); +vaddph(ymm0, ymm1, ptr[rax+64]); +vaddph(xmm0, xmm1, ptr[rax+64]); +vaddph(zmm0, zmm1, ptr_b[rax+64]); +vaddph(ymm0, ymm1, ptr_b[rax+64]); +vaddph(xmm0, xmm1, ptr_b[rax+64]); +vaddsh(xmm0, xmm15, ptr[rax+64]); +vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); +vcmpph(k1, xm15, ptr[rax+64], 1); +vcmpph(k2, ym15, ptr[rax+64], 2); +vcmpph(k3, zm15, ptr[rax+64], 3); +vcmpph(k1, xm15, ptr_b[rax+64], 1); +vcmpph(k2, ym15, ptr_b[rax+64], 2); +vcmpph(k3, zm15, ptr_b[rax+64], 3); +vcmpsh(k1, xm15, ptr[rax+64], 1); +vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); +vcomish(xmm1, ptr[rax+64]); +vcomish(xmm1|T_sae, xmm15); +vucomish(xmm1, ptr [rax+0x40]); +vucomish(xmm1|T_sae, xmm15); +vfmaddsub213ph(xmm1, xmm2, ptr 
[rax+0x40]); +vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(xmm1|k3, xmm2, xmm5); +vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); +vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); +vfmaddsub213ph(ymm1|k3, ymm2, ymm5); +vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); +vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); +vfmaddcph(xm1, xm2, ptr[rax+0x40]); +vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); +vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); +vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vrcpph(xmm1, ptr [rax+0x40]); +vrcpph(xmm1, ptr_b [rax+0x40]); +vrcpph(ymm1, ptr [rax+0x40]); +vrcpph(ymm1, ptr_b [rax+0x40]); +vrcpph(zmm1, ptr [rax+0x40]); +vrcpph(zmm1, ptr_b [rax+0x40]); +vrcpsh(xmm1, xmm3, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr_b [rax+0x40]); +vrsqrtph(ymm2, ptr [rax+0x40]); +vrsqrtph(ymm2, ptr_b [rax+0x40]); +vrsqrtph(zmm2, ptr [rax+0x40]); +vrsqrtph(zmm2, ptr_b [rax+0x40]); +vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); +vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); +vscalefph(xmm1, xmm5, ptr [rax+0x40]); +vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); +vscalefph(ymm1, ymm5, ptr [rax+0x40]); +vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); +vscalefph(zmm1, zmm5, ptr [rax+0x40]); +vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); 
+vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); +vscalefsh(xmm1, xmm5, ptr [rax+0x40]); +vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); +vreduceph(xmm1, ptr [rax+0x40], 0x1); +vreduceph(xmm1, ptr_b [rax+0x40], 0x2); +vreduceph(ymm1, ptr [rax+0x40], 0x3); +vreduceph(ymm1, ptr_b [rax+0x40], 0x4); +vreduceph(zmm1, ptr [rax+0x40], 0x5); +vreduceph(zmm1, ptr_b [rax+0x40], 0x6); +vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vrndscaleph(xmm1, ptr [rax+0x40], 0x1); +vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); +vrndscaleph(ymm1, ptr [rax+0x40], 0x3); +vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); +vrndscaleph(zmm1, ptr [rax+0x40], 0x5); +vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); +vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vfpclassph(k1, xword [rax+0x40], 0x1); +vfpclassph(k1, xword_b[rax+0x40], 0x2); +vfpclassph(k1, yword [rax+0x40], 0x3); +vfpclassph(k1, yword_b[rax+0x40], 0x4); +vfpclassph(k1, zword [rax+0x40], 0x5); +vfpclassph(k1, zword_b[rax+0x40], 0x6); +vfpclasssh(k1|k2, xmm3, 0x5); +vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); +vgetexpph(xmm1, ptr [rax+0x40]); +vgetexpph(ymm1, ptr_b [rax+0x40]); +vgetexpph(zmm1, ptr [rax+0x40]); +vgetexpph(zmm1|k1|T_z|T_sae, zmm5); +vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); +vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); +vgetmantph(xmm1, ptr [rax+0x40], 0x1); +vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); +vgetmantph(zmm1, ptr [rax+0x40], 0x3); +vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); +vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); +vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); +vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); +vmovsh(ptr [rax+0x40]|k1, xmm1); +vmovsh(xmm1|k2|T_z, xmm3, xmm5); +vmovw(xmm1, r13d); +vmovw(xmm3, ptr [rax+0x40]); +vmovw(r9d, xmm1); +vmovw(ptr [rax+0x40], xmm7); +vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); +vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2si(edx|T_rd_sae, xmm1); +vcvtsh2si(edx, ptr [rax+0x40]); +vcvtsh2si(rdx|T_rd_sae, xmm1); +vcvtsh2si(r8, ptr [rax+0x40]); +vcvtph2dq(xmm1, xmm5); +vcvtph2dq(xmm1, ptr [rax+0x40]); +vcvtph2dq(xmm1, ptr_b [rax+0x40]); +vcvtph2dq(ymm1|k2|T_z, xmm5); +vcvtph2dq(ymm1, ptr [rax+0x40]); +vcvtph2dq(ymm1, ptr_b [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2psx(xmm1, xmm5); +vcvtph2psx(xmm1, ptr [rax+0x40]); +vcvtph2psx(xmm1, ptr_b [rax+0x40]); +vcvtph2psx(ymm1|k2|T_z, xmm5); +vcvtph2psx(ymm1, ptr [rax+0x40]); +vcvtph2psx(ymm1, ptr_b [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); +vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2udq(xmm1, xmm5); +vcvtph2udq(xmm1, ptr [rax+0x40]); +vcvtph2udq(xmm1, ptr_b [rax+0x40]); +vcvtph2udq(ymm1|k2|T_z, xmm5); +vcvtph2udq(ymm1, ptr [rax+0x40]); +vcvtph2udq(ymm1, ptr_b [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2dq(xmm1, xmm5); +vcvttph2dq(xmm1, ptr [rax+0x40]); +vcvttph2dq(xmm1, ptr_b [rax+0x40]); +vcvttph2dq(ymm1|k2|T_z, xmm5); +vcvttph2dq(ymm1, ptr [rax+0x40]); +vcvttph2dq(ymm1, ptr_b [rax+0x40]); 
+vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2udq(xmm1, xmm5); +vcvttph2udq(xmm1, ptr [rax+0x40]); +vcvttph2udq(xmm1, ptr_b [rax+0x40]); +vcvttph2udq(ymm1|k2|T_z, xmm5); +vcvttph2udq(ymm1, ptr [rax+0x40]); +vcvttph2udq(ymm1, ptr_b [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2pd(xmm1, xmm5); +vcvtph2pd(xmm1, ptr [rax+0x40]); +vcvtph2pd(xmm1, ptr_b [rax+0x40]); +vcvtph2pd(ymm1|k2|T_z, xmm5); +vcvtph2pd(ymm1, ptr [rax+0x40]); +vcvtph2pd(ymm1, ptr_b [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); +vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2qq(xmm1, xmm5); +vcvtph2qq(xmm1, ptr [rax+0x40]); +vcvtph2qq(xmm1, ptr_b [rax+0x40]); +vcvtph2qq(ymm1|k2|T_z, xmm5); +vcvtph2qq(ymm1, ptr [rax+0x40]); +vcvtph2qq(ymm1, ptr_b [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2uqq(xmm1, xmm5); +vcvtph2uqq(xmm1, ptr [rax+0x40]); +vcvtph2uqq(xmm1, ptr_b [rax+0x40]); +vcvtph2uqq(ymm1|k2|T_z, xmm5); +vcvtph2uqq(ymm1, ptr [rax+0x40]); +vcvtph2uqq(ymm1, ptr_b [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2uqq(xmm1, xmm5); +vcvttph2uqq(xmm1, ptr [rax+0x40]); +vcvttph2uqq(xmm1, ptr_b [rax+0x40]); +vcvttph2uqq(ymm1|k2|T_z, xmm5); +vcvttph2uqq(ymm1, ptr [rax+0x40]); +vcvttph2uqq(ymm1, ptr_b [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtdq2ph(xmm1, xmm5); +vcvtdq2ph(xmm1, xword [rax+0x40]); +vcvtdq2ph(xmm1, xword_b [rax+0x40]); +vcvtdq2ph(xmm1, yword [rax+0x40]); +vcvtdq2ph(xmm1, yword_b [rax+0x40]); +vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtdq2ph(ymm1, ptr [rax+0x40]); +vcvtdq2ph(ymm1, ptr_b [rax+0x40]); +vcvtps2phx(xmm1, xmm5); +vcvtps2phx(xmm1, xword [rax+0x40]); +vcvtps2phx(xmm1, xword_b [rax+0x40]); +vcvtps2phx(xmm1, yword [rax+0x40]); +vcvtps2phx(xmm1, yword_b [rax+0x40]); +vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtps2phx(ymm1, ptr [rax+0x40]); +vcvtps2phx(ymm1, ptr_b [rax+0x40]); +vcvtudq2ph(xmm1, xmm5); +vcvtudq2ph(xmm1, xword [rax+0x40]); +vcvtudq2ph(xmm1, xword_b [rax+0x40]); +vcvtudq2ph(xmm1, yword [rax+0x40]); +vcvtudq2ph(xmm1, yword_b [rax+0x40]); +vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtudq2ph(ymm1, ptr [rax+0x40]); +vcvtudq2ph(ymm1, ptr_b [rax+0x40]); +vcvtpd2ph(xmm1, xmm5); +vcvtpd2ph(xmm1, ymm5); +vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtpd2ph(xmm1, xword [rax+0x40]); +vcvtpd2ph(xmm1, xword_b [rax+0x40]); +vcvtpd2ph(xmm1, yword [rax+0x40]); +vcvtpd2ph(xmm1, yword_b [rax+0x40]); +vcvtpd2ph(xmm1, zword [rax+0x40]); +vcvtpd2ph(xmm1, zword_b [rax+0x40]); +vcvtqq2ph(xmm1, xmm5); +vcvtqq2ph(xmm1, ymm5); +vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtqq2ph(xmm1, xword [rax+0x40]); +vcvtqq2ph(xmm1, xword_b [rax+0x40]); +vcvtqq2ph(xmm1, yword [rax+0x40]); +vcvtqq2ph(xmm1, yword_b [rax+0x40]); +vcvtqq2ph(xmm1, zword [rax+0x40]); +vcvtqq2ph(xmm1, zword_b [rax+0x40]); +vcvtuqq2ph(xmm1, xmm5); +vcvtuqq2ph(xmm1, ymm5); +vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuqq2ph(xmm1, xword [rax+0x40]); +vcvtuqq2ph(xmm1, xword_b [rax+0x40]); +vcvtuqq2ph(xmm1, yword [rax+0x40]); +vcvtuqq2ph(xmm1, yword_b [rax+0x40]); +vcvtuqq2ph(xmm1, zword [rax+0x40]); +vcvtuqq2ph(xmm1, zword_b 
[rax+0x40]); +vcvtph2uw(xmm1, xmm5); +vcvtph2uw(xmm1, ptr [rax+0x40]); +vcvtph2uw(xmm1, ptr_b [rax+0x40]); +vcvtph2uw(ymm1, ptr [rax+0x40]); +vcvtph2uw(ymm1, ptr_b [rax+0x40]); +vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2uw(zmm1, ptr [rax+0x40]); +vcvtph2uw(zmm1, ptr_b [rax+0x40]); +vcvtph2w(xmm1, xmm5); +vcvtph2w(xmm1, ptr [rax+0x40]); +vcvtph2w(xmm1, ptr_b [rax+0x40]); +vcvtph2w(ymm1, ptr [rax+0x40]); +vcvtph2w(ymm1, ptr_b [rax+0x40]); +vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2w(zmm1, ptr [rax+0x40]); +vcvtph2w(zmm1, ptr_b [rax+0x40]); +vcvttph2uw(xmm1, xmm5); +vcvttph2uw(xmm1, ptr [rax+0x40]); +vcvttph2uw(xmm1, ptr_b [rax+0x40]); +vcvttph2uw(ymm1, ptr [rax+0x40]); +vcvttph2uw(ymm1, ptr_b [rax+0x40]); +vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2uw(zmm1, ptr [rax+0x40]); +vcvttph2uw(zmm1, ptr_b [rax+0x40]); +vcvttph2w(xmm1, xmm5); +vcvttph2w(xmm1, ptr [rax+0x40]); +vcvttph2w(xmm1, ptr_b [rax+0x40]); +vcvttph2w(ymm1, ptr [rax+0x40]); +vcvttph2w(ymm1, ptr_b [rax+0x40]); +vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2w(zmm1, ptr [rax+0x40]); +vcvttph2w(zmm1, ptr_b [rax+0x40]); +vcvtuw2ph(xmm1, xmm5); +vcvtuw2ph(xmm1, ptr [rax+0x40]); +vcvtuw2ph(xmm1, ptr_b [rax+0x40]); +vcvtuw2ph(ymm1, ptr [rax+0x40]); +vcvtuw2ph(ymm1, ptr_b [rax+0x40]); +vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuw2ph(zmm1, ptr [rax+0x40]); +vcvtuw2ph(zmm1, ptr_b [rax+0x40]); +vcvtw2ph(xmm1, xmm5); +vcvtw2ph(xmm1, ptr [rax+0x40]); +vcvtw2ph(xmm1, ptr_b [rax+0x40]); +vcvtw2ph(ymm1, ptr [rax+0x40]); +vcvtw2ph(ymm1, ptr_b [rax+0x40]); +vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtw2ph(zmm1, ptr [rax+0x40]); +vcvtw2ph(zmm1, ptr_b [rax+0x40]); +vcvtps2ph(xmm1, xmm2, 0x1); +vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); +vcvtps2ph(xmm1, ymm2, 0x3); +vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); +vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); +vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); +vcvtps2ph(xmm1|k2, ymm4, 0x7); +vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); +vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); +vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); +vcvtsh2usi(ecx|T_rd_sae, xmm1); +vcvtsh2usi(eax, ptr [rax+0x40]); +vcvtsh2usi(r9|T_rd_sae, xmm1); +vcvtsh2usi(r13, ptr [rax+0x40]); +vcvttsh2si(ecx|T_sae, xmm1); +vcvttsh2si(eax, ptr [rax+0x40]); +vcvttsh2si(r9|T_sae, xmm1); +vcvttsh2si(r13, ptr [rax+0x40]); +vcvttsh2usi(ecx|T_sae, xmm1); +vcvttsh2usi(eax, ptr [rax+0x40]); +vcvttsh2usi(r9|T_sae, xmm1); +vcvttsh2usi(r13, ptr [rax+0x40]); +vcvttph2qq(xmm1, xmm5); +vcvttph2qq(xmm1, ptr [rax+0x40]); +vcvttph2qq(xmm1, ptr_b [rax+0x40]); +vcvttph2qq(ymm1|k2|T_z, xmm5); +vcvttph2qq(ymm1, ptr [rax+0x40]); +vcvttph2qq(ymm1, ptr_b [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); +aadd(ptr[rax], ecx); +aadd(ptr[eax], ecx); +aadd(ptr[rax], r10); +aand(ptr[rax], ecx); +aand(ptr[eax], ecx); +aand(ptr[rax], r10); +aor(ptr[rax], ecx); +aor(ptr[eax], ecx); +aor(ptr[rax], r10); +axor(ptr[rax], ecx); +axor(ptr[eax], ecx); +axor(ptr[rax], r10); +cmpbexadd(ptr[rax+r10*4], rcx, rdx); +cmpbxadd(ptr[rax+r10*4], rcx, rdx); +cmplexadd(ptr[rax+r10*4], rcx, rdx); +cmplxadd(ptr[rax+r10*4], rcx, rdx); +cmpnbexadd(ptr[rax+r10*4], rcx, rdx); +cmpnbxadd(ptr[rax+r10*4], rcx, rdx); 
+cmpnlexadd(ptr[rax+r10*4], rcx, rdx); +cmpnlxadd(ptr[rax+r10*4], rcx, rdx); +cmpnoxadd(ptr[rax+r10*4], rcx, rdx); +cmpnpxadd(ptr[rax+r10*4], rcx, rdx); +cmpnsxadd(ptr[rax+r10*4], rcx, rdx); +cmpnzxadd(ptr[rax+r10*4], rcx, rdx); +cmpoxadd(ptr[rax+r10*4], rcx, rdx); +cmppxadd(ptr[rax+r10*4], rcx, rdx); +cmpsxadd(ptr[rax+r10*4], rcx, rdx); +cmpzxadd(ptr[rax+r10*4], rcx, rdx); +vsha512msg1(ymm3, xmm5); +vsha512msg2(ymm9, ymm10); +vsha512rnds2(ymm1, ymm3, xmm2); +vsm3msg1(xmm1, xmm2, xmm3); +vsm3msg1(xmm1, xmm2, ptr [rax]); +vsm3msg2(xmm5, xmm7, xmm3); +vsm3msg2(xmm5, xmm6, ptr [rax]); +vsm3rnds2(xmm5, xmm7, xmm3, 0x12); +vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); +vsm4key4(xmm1, xmm2, xmm3); +vsm4key4(xmm1, xmm2, ptr [rdx]); +vsm4rnds4(xmm1, xmm2, xmm3); +vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); +vpdpbssd(xmm1, xmm2, xmm3); +vpdpbssd(ymm1, ymm2, ptr [rax]); +vpdpbssds(xmm1, xmm2, xmm3); +vpdpbssds(ymm1, ymm2, ptr [rax]); +vpdpbsud(xmm1, xmm2, xmm3); +vpdpbsud(ymm1, ymm2, ptr [rax]); +vpdpbsuds(xmm1, xmm2, xmm3); +vpdpbsuds(ymm1, ymm2, ptr [rax]); +vpdpbuud(xmm1, xmm2, xmm3); +vpdpbuud(ymm1, ymm2, ptr [rax]); +vpdpbuuds(xmm1, xmm2, xmm3); +vpdpbuuds(ymm1, ymm2, ptr [rax]); +vpdpwsud(xmm1, xmm2, xmm3); +vpdpwsud(ymm1, ymm2, ptr [rax]); +vpdpwsuds(xmm1, xmm2, xmm3); +vpdpwsuds(ymm1, ymm2, ptr [rax]); +vpdpwusd(xmm1, xmm2, xmm3); +vpdpwusd(ymm1, ymm2, ptr [rax]); +vpdpwusds(xmm1, xmm2, xmm3); +vpdpwusds(ymm1, ymm2, ptr [rax]); +vpdpwuud(xmm1, xmm2, xmm3); +vpdpwuud(ymm1, ymm2, ptr [rax]); +vpdpwuuds(xmm1, xmm2, xmm3); +vpdpwuuds(ymm1, ymm2, ptr [rax]); diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 08dc8af..93c370c 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -3,8 +3,9 @@ struct Code : Xbyak::CodeGenerator { Code() + : Xbyak::CodeGenerator(4096*8) { -#include "cpp.txt" +#include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index f24d7f6..3e4b98f 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -7,6 +7,25 @@ class Reg: self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name + +g_xmmTbl = ''' +xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 +xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 +xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 +ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 +ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 +ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 +ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 +zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 +zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 +zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 +zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 +'''.split() g_regTbl = ''' eax ecx edx ebx esp ebp esi edi @@ -22,49 +41,53 @@ r16w r17w r18w r19w r20w r21w r22w r23w r24w r25w r26w r27w r28w r29w r30w r31w r8b r9b r10b r11b r12b r13b r14b r15b r16b r17b r18b r19b r20b r21b r22b r23b r24b r25b r26b r27b r28b r29b r30b r31b spl bpl sil dil -xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 -xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 -xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 -ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 -ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 -ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 -ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 -zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 -zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 -zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 
-zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 -'''.split() +tmm0 tmm1 tmm2 tmm3 tmm4 tmm5 tmm6 tmm7 +'''.split()+g_xmmTbl # define global constants for e in g_regTbl: globals()[e] = Reg(e) +g_maskTbl = [k1, k2, k3, k4, k5, k6, k7] + g_replaceCharTbl = '{}();|,' g_replaceChar = str.maketrans(g_replaceCharTbl, ' '*len(g_replaceCharTbl)) g_sizeTbl = ['byte', 'word', 'dword', 'qword', 'xword', 'yword', 'zword'] -g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae'] #, 'T_z'] -g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae'] +g_xedSizeTbl = ['xmmword', 'ymmword', 'zmmword'] +g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae', 'T_z'] +g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae', 'z'] class Attr: def __init__(self, s): self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name for e in g_attrTbl: globals()[e] = Attr(e) +def newReg(s): + if type(s) == str: + return Reg(s) + return s + class Memory: - def __init__(self, size=0, base=None, index=None, scale=0, disp=0): + def __init__(self, size=0, base=None, index=None, scale=0, disp=0, broadcast=False): self.size = size - self.base = base - self.index = index + self.base = newReg(base) + self.index = newReg(index) self.scale = scale self.disp = disp + self.broadcast = broadcast def __str__(self): s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))] + if self.broadcast: + s += '_b' s += ' [' needPlus = False if self.base: @@ -84,47 +107,72 @@ class Memory: s += ']' return s - def __eq__(self, rhs): - return str(self) == str(rhs) + # xbyak uses ptr if it is automatically detected, so xword == ptr is true + if self.broadcast != rhs.broadcast: return False +# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False + if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False + r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp + return r -def parseMemory(s): - sizeTbl = { - 'byte': 1, 'word': 2, 'dword': 4, 'qword': 8, - 'xword': 16, 'yword': 32, 'zword': 64 - } +def parseBroadcast(s): + if '_b' in s: + return (s.replace('_b', ''), True) + r = re.search(r'({1to\d+})', s) + if not r: + return (s, False) + return (s.replace(r.group(1), ''), True) + +def parseMemory(s, broadcast=False): + org_s = s s = s.replace(' ', '').lower() - # Parse size size = 0 + base = index = None + scale = 0 + disp = 0 + + if not broadcast: + (s, broadcast) = parseBroadcast(s) + + # Parse size for i in range(len(g_sizeTbl)): w = g_sizeTbl[i] if s.startswith(w): size = 1< 0: s += ', ' s += str(self.args[i]) - for e in self.attrs: - s += f'|{e}' + if i == 0 and self.attrs: + for e in self.attrs: + s += f'|{e}' s += ');' return s + def __eq__(self, rhs): + return self.name == rhs.name and self.args == rhs.args and self.attrs == rhs.attrs def parseNmemonic(s): + args = [] + attrs = [] + + (s, broadcast) = parseBroadcast(s) + + # replace xm0 with xmm0 + while True: + r = re.search(r'([xyz])m(\d\d?)', s) + if not r: + break + s = s.replace(r.group(0), r.group(1) + 'mm' + r.group(2)) + + # check 'zmm0{k7}' + r = re.search(r'({k[1-7]})', s) + if r: + idx = int(r.group(1)[2]) + attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + # check 'zmm0|k7' + r = re.search(r'(\|\s*k[1-7])', s) + if r: + idx = int(r.group(1)[-1]) + 
attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + s = s.translate(g_replaceChar) # reconstruct memory string @@ -168,13 +244,12 @@ def parseNmemonic(s): inMemory = False else: v.append(e) - if e in g_sizeTbl or e == 'ptr': + if e in g_sizeTbl or e in g_xedSizeTbl or e.startswith('ptr'): v[-1] += ' ' # to avoid 'byteptr' - inMemory = True + if ']' not in v[-1]: + inMemory = True name = v[0] - args = [] - attrs = [] for e in v[1:]: if e.startswith('0x'): args.append(int(e, 16)) @@ -185,9 +260,12 @@ def parseNmemonic(s): elif e in g_attrXedTbl: attrs.append(Attr(g_attrTbl[g_attrXedTbl.index(e)])) elif e in g_regTbl: - args.append(e) + args.append(Reg(e)) + # xed special format : xmm8+3 + elif e[:-2] in g_xmmTbl and e.endswith('+3'): + args.append(Reg(e[:-2])) else: - args.append(parseMemory(e)) + args.append(parseMemory(e, broadcast)) return Nmemonic(name, args, attrs) def loadFile(name): @@ -215,13 +293,17 @@ def run(cppText, xedText): m1 = parseNmemonic(line1) m2 = parseNmemonic(line2) - assertEqualStr(m1, m2, f'{i}') + assertEqual(m1, m2, f'{i+1}') print('run ok') def assertEqualStr(a, b, msg=None): if str(a) != str(b): raise Exception(f'assert fail {msg}:', str(a), str(b)) +def assertEqual(a, b, msg=None): + if a != b: + raise Exception(f'assert fail {msg}:', str(a), str(b)) + def MemoryTest(): tbl = [ (Memory(0, rax), 'ptr [rax]'), @@ -231,18 +313,23 @@ def MemoryTest(): (Memory(8, None, rcx, 4), 'qword [rcx*4]'), (Memory(8, rax, None, 0, 5), 'qword [rax+0x5]'), (Memory(8, None, None, 0, 255), 'qword [0xff]'), + (Memory(0, r8, r9, 1, 32), 'ptr [r8+r9+0x20]'), ] for (m, expected) in tbl: assertEqualStr(m, expected) + assertEqual(Memory(16, rax), Memory(0, rax)) + def parseMemoryTest(): print('parseMemoryTest') tbl = [ ('[]', Memory()), ('[rax]', Memory(0, rax)), ('ptr[rax]', Memory(0, rax)), + ('ptr_b[rax]', Memory(0, rax, broadcast=True)), ('dword[rbx]', Memory(4, rbx)), ('xword ptr[rcx]', Memory(16, rcx)), + ('xmmword ptr[rcx]', Memory(16, rcx)), ('xword ptr[rdx*8]', Memory(16, None, rdx, 8)), ('[12345]', Memory(0, None, None, 0, 12345)), ('[0x12345]', Memory(0, None, None, 0, 0x12345)), @@ -262,10 +349,19 @@ def parseNmemonicTest(): ('mov(rax, ptr [rcx + rdx * 8 ] );', Nmemonic('mov', [rax, Memory(0, rcx, rdx, 8)])), ('vcmppd(k1, ymm2, ymm3 |T_sae, 3);', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), ('vcmppd k1{sae}, ymm2, ymm3, 0x3', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), + ('v4fmaddps zmm1, zmm8+3, xmmword ptr [rdx+0x40]', Nmemonic('v4fmaddps', [zmm1, zmm8, Memory(16, rdx, None, 0, 0x40)])), + ('vp4dpwssd zmm23{k7}{z}, zmm1+3, xmmword ptr [rax+0x40]', Nmemonic('vp4dpwssd', [zmm23, zmm1, Memory(16, rax, None, 0, 0x40)], [k7, T_z])), + ('v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);', Nmemonic('v4fnmaddps', [zmm5, zmm2, Memory(0, rcx, None, 0, 0x80)], [k5])), + ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), + ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 
0, 64), 1])), ] for (s, expected) in tbl: e = parseNmemonic(s) - assertEqualStr(e, expected) + assertEqual(e, expected) def test(): print('test start') diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh index 6d820bd..a1d3629 100755 --- a/test/test_by_xed.sh +++ b/test/test_by_xed.sh @@ -15,9 +15,9 @@ TARGET=$1 CFLAGS="-Wall -Wextra -I ../" echo "test:" $TARGET -cp $TARGET cpp.txt +cp $TARGET tmp.cpp $CXX $CFLAGS test_by_xed.cpp -o test_by_xed ./test_by_xed $XED -64 -ir bin > out.txt -$PYTHON test_by_xed.py cpp.txt out.txt +$PYTHON test_by_xed.py $TARGET out.txt -- cgit v1.2.3 From 903a23ba0b591cc6e1b37bcd350e1a8475a054c9 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 10:26:05 +0900 Subject: add test by xed --- .github/workflows/main.yml | 7 ++++++- test/test_by_xed.sh | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 814a85b..3ab9d6b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,6 +19,11 @@ jobs: steps: - uses: actions/checkout@v4 - run: apt -y update - - run: apt -y install g++-multilib libboost-dev make nasm yasm + - run: apt -y install g++-multilib libboost-dev make nasm yasm wget - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" + - run: | + cd test + wget https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz + tar xvf sde-external-9.44.0-2024-08-22-lin.tar.xz + env XED=sde-external-9.44.0-2024-08-22-lin/xed64 make xed_test diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh index a1d3629..905b8a0 100755 --- a/test/test_by_xed.sh +++ b/test/test_by_xed.sh @@ -4,6 +4,7 @@ set -e XED=${XED:=xed} CXX=${CXX:=g++} PYTHON=${PYTHON:=python3} +echo $XED if [ $# -ne 1 ]; then echo "./test_by_xed.sh " -- cgit v1.2.3 From c35fc541b8842b8885ccc2ec148fe334378d61fa Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 10:35:05 +0900 Subject: install xz-util for github action --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3ab9d6b..a4a503c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 - run: apt -y update - - run: apt -y install g++-multilib libboost-dev make nasm yasm wget + - run: apt -y install g++-multilib libboost-dev make nasm yasm wget xz-utils - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" - run: | -- cgit v1.2.3 From 183e17f94d4ae2f72d6bc84063c1c01cf2512eb5 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 10:39:42 +0900 Subject: install python3 for github action --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a4a503c..3d520a3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 - run: apt -y update - - run: apt -y install g++-multilib libboost-dev make nasm yasm wget xz-utils + - run: apt -y install g++-multilib libboost-dev make nasm yasm wget xz-utils python3 - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" - run: | -- cgit v1.2.3 From 864fd0c49ce07fc534b16250758987c445bb9c70 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 11:22:35 +0900 Subject: add vaddnepbf16 --- .gitignore | 1 + gen/gen_avx512.cpp | 22 ++ 
test/Makefile | 4 +- test/avx10/bf16.txt | 4 + test/avx10/new-ymm.txt | 149 +++++++++++ test/avx10/old.txt | 657 +++++++++++++++++++++++++++++++++++++++++++++++++ test/target/avx10.txt | 149 ----------- test/target/misc.txt | 657 ------------------------------------------------- test/test_by_xed.py | 5 +- xbyak/xbyak_mnemonic.h | 1 + 10 files changed, 840 insertions(+), 809 deletions(-) create mode 100644 test/avx10/bf16.txt create mode 100644 test/avx10/new-ymm.txt create mode 100644 test/avx10/old.txt delete mode 100644 test/target/avx10.txt delete mode 100644 test/target/misc.txt diff --git a/.gitignore b/.gitignore index 24b0b1d..507091e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /build* # cmake +*CVS diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 79ec79a..23923b0 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -944,6 +944,22 @@ void putFP16_2() } } +void putAVX10_BF16() +{ + const struct Tbl { + const char *name; + uint64_t type; + uint8_t code; + } tbl[] = { + { "vaddnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x58 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + std::string s = type2String(p.type | T_MUST_EVEX); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%2X); }\n" , p.name, s.c_str(), p.code); + } +} + void putFP16() { putFP16_1(); @@ -952,6 +968,11 @@ void putFP16() putFP16_2(); } +void putAVX10() +{ + putAVX10_BF16(); +} + int main(int argc, char *[]) { bool only64bit = argc == 2; @@ -977,4 +998,5 @@ int main(int argc, char *[]) putScatter(); putV4FMA(); putFP16(); + putAVX10(); } diff --git a/test/Makefile b/test/Makefile index ca2f0bb..4d0b85d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,9 +60,9 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=avx10.txt misc.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt xed_test: - @for target in $(addprefix target/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done + @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt new file mode 100644 index 0000000..1c77f93 --- /dev/null +++ b/test/avx10/bf16.txt @@ -0,0 +1,4 @@ +vaddnepbf16(xm1, xm2, xm3); +vaddnepbf16(ym1|k1, ym2, ptr[rax+128]); +vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); diff --git a/test/avx10/new-ymm.txt b/test/avx10/new-ymm.txt new file mode 100644 index 0000000..8ee52ca --- /dev/null +++ b/test/avx10/new-ymm.txt @@ -0,0 +1,149 @@ +vaddpd(ymm1, ymm2, ymm3 |T_rn_sae); +vaddph(ymm1, ymm2, ymm3 |T_rn_sae); +vaddps(ymm1, ymm2, ymm3 |T_rn_sae); +vcmppd(k1, ymm2, ymm3 |T_sae, 3); +vcmpph(k1, ymm2, ymm3 |T_sae, 3); +vcmpps(k1, ymm2, ymm3 |T_sae, 3); +vcvtdq2ph(xmm1, ymm2 |T_rn_sae); +vcvtdq2ps(ymm1, ymm2 |T_rn_sae); +vcvtpd2dq(xmm1, ymm2 |T_rn_sae); +vcvtpd2ph(xmm1, ymm2 |T_rn_sae); +vcvtpd2ps(xmm1, ymm2 |T_rn_sae); +vcvtpd2qq(ymm1, ymm2 |T_rn_sae); +vcvtpd2udq(xmm1, ymm2 |T_rn_sae); +vcvtpd2uqq(ymm1, ymm2 |T_rn_sae); +vcvtph2dq(ymm1, xmm2 |T_rn_sae); +vcvtph2pd(ymm1, xmm2 |T_sae); +vcvtph2ps(ymm1, xmm2 |T_sae); +vcvtph2psx(ymm1, xmm2 |T_sae); +vcvtph2qq(ymm1, xmm2 |T_rn_sae); +vcvtph2udq(ymm1, xmm2 |T_rn_sae); +vcvtph2uqq(ymm1, xmm2 |T_rn_sae); +vcvtph2uw(ymm1, ymm2 |T_rn_sae); +vcvtph2w(ymm1, ymm2 |T_rn_sae); +vcvtps2dq(ymm1, ymm2 |T_rn_sae); +vcvtps2pd(ymm1, xmm2 |T_sae); 
+vcvtps2ph(xmm1, ymm2 |T_sae, 3); +vcvtps2phx(xmm1, ymm2 |T_rn_sae); +vcvtps2qq(ymm1, xmm2 |T_rn_sae); +vcvtps2udq(ymm1, ymm2 |T_rn_sae); +vcvtps2uqq(ymm1, xmm2 |T_rn_sae); +vcvtqq2pd(ymm1, ymm2 |T_rn_sae); +vcvtqq2ph(xmm1, ymm2 |T_rn_sae); +vcvtqq2ps(xmm1, ymm2 |T_rn_sae); +vcvttpd2dq(xmm1, ymm2 |T_sae); +vcvttpd2qq(ymm1, ymm2 |T_sae); +vcvttpd2udq(xmm1, ymm2 |T_sae); +vcvttpd2uqq(ymm1, ymm2 |T_sae); +vcvttph2dq(ymm1, xmm2 |T_sae); +vcvttph2qq(ymm1, xmm2 |T_sae); +vcvttph2udq(ymm1, xmm2 |T_sae); +vcvttph2uqq(ymm1, xmm2 |T_sae); +vcvttph2uw(ymm1, ymm2 |T_sae); +vcvttph2w(ymm1, ymm2 |T_sae); +vcvttps2dq(ymm1, ymm2 |T_sae); +vcvttps2qq(ymm1, xmm2 |T_sae); +vcvttps2udq(ymm1, ymm2 |T_sae); +vcvttps2uqq(ymm1, xmm2 |T_sae); +vcvtudq2ph(xmm1, ymm2 |T_rn_sae); +vcvtudq2ps(ymm1, ymm2 |T_rn_sae); +vcvtuqq2pd(ymm1, ymm2 |T_rn_sae); +vcvtuqq2ph(xmm1, ymm2 |T_rn_sae); +vcvtuqq2ps(xmm1, ymm2 |T_rn_sae); +vcvtuw2ph(ymm1, ymm2 |T_rn_sae); +vcvtw2ph(ymm1, ymm2 |T_rn_sae); +vdivpd(ymm1, ymm2, ymm3 |T_rn_sae); +vdivph(ymm1, ymm2, ymm3 |T_rn_sae); +vdivps(ymm1, ymm2, ymm3 |T_rn_sae); +vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); +vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae); +vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3); +vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3); +vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); 
+vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); +vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); +vgetexppd(ymm1, ymm2 |T_sae); +vgetexpph(ymm1, ymm2 |T_sae); +vgetexpps(ymm1, ymm2 |T_sae); +vgetmantpd(ymm1, ymm2 |T_sae, 3); +vgetmantph(ymm1, ymm2 |T_sae, 3); +vgetmantps(ymm1, ymm2 |T_sae, 3); +vmaxpd(ymm1, ymm2, ymm3 |T_sae); +vmaxph(ymm1, ymm2, ymm3 |T_sae); +vmaxps(ymm1, ymm2, ymm3 |T_sae); +vminpd(ymm1, ymm2, ymm3 |T_sae); +vminph(ymm1, ymm2, ymm3 |T_sae); +vminps(ymm1, ymm2, ymm3 |T_sae); +vmulpd(ymm1, ymm2, ymm3 |T_rn_sae); +vmulph(ymm1, ymm2, ymm3 |T_rn_sae); +vmulps(ymm1, ymm2, ymm3 |T_rn_sae); +vrangepd(ymm1, ymm2, ymm3 |T_sae, 3); +vrangeps(ymm1, ymm2, ymm3 |T_sae, 3); +vreducepd(ymm1, ymm2 |T_sae, 3); +vreduceph(ymm1, ymm2 |T_sae, 3); +vreduceps(ymm1, ymm2 |T_sae, 3); +vrndscalepd(ymm1, ymm2 |T_sae, 3); +vrndscaleph(ymm1, ymm2 |T_sae, 3); +vrndscaleps(ymm1, ymm2 |T_sae, 3); +vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae); +vscalefph(ymm1, ymm2, ymm3 |T_rn_sae); +vscalefps(ymm1, ymm2, ymm3 |T_rn_sae); +vsqrtpd(ymm1, ymm2 |T_rn_sae); +vsqrtph(ymm1, ymm2 |T_rn_sae); +vsqrtps(ymm1, ymm2 |T_rn_sae); +vsubpd(ymm1, ymm2, ymm3 |T_rn_sae); +vsubph(ymm1, ymm2, ymm3 |T_rn_sae); +vsubps(ymm1, ymm2, ymm3 |T_rn_sae); diff --git a/test/avx10/old.txt b/test/avx10/old.txt new file mode 100644 index 0000000..9e4f097 --- /dev/null +++ b/test/avx10/old.txt @@ -0,0 +1,657 @@ +v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); +v4fmaddss(xmm15, xmm8, ptr [rax + 64]); +v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); +v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); +vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); +vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); +vaesdec(xmm20, xmm30, ptr [rcx + 64]); +vaesdec(ymm1, ymm2, ptr [rcx + 64]); +vaesdec(zmm1, zmm2, ptr [rcx + 64]); +vaesdeclast(xmm20, xmm30, ptr [rax + 64]); +vaesdeclast(ymm20, ymm30, ptr [rax + 64]); +vaesdeclast(zmm20, zmm30, ptr [rax + 64]); +vaesenc(xmm20, xmm30, ptr [rcx + 64]); +vaesenc(ymm1, ymm2, ptr [rcx + 64]); +vaesenc(zmm1, zmm2, ptr [rcx + 64]); +vaesenclast(xmm20, xmm30, ptr [rax + 64]); +vaesenclast(ymm20, ymm30, ptr [rax + 64]); +vaesenclast(zmm20, zmm30, ptr [rax + 64]); +vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); +vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); +vpcompressb(ptr[rax + 64], xmm1); +vpcompressb(xmm30 | k5, xmm1); +vpcompressb(ptr[rax + 64], ymm1); +vpcompressb(ymm30 | k3 |T_z, ymm1); +vpcompressb(ptr[rax + 64], zmm1); +vpcompressb(zmm30 | k2 |T_z, zmm1); +vpcompressw(ptr[rax + 64], xmm1); +vpcompressw(xmm30 | k5, xmm1); +vpcompressw(ptr[rax + 64], ymm1); +vpcompressw(ymm30 | k3 |T_z, ymm1); +vpcompressw(ptr[rax + 64], zmm1); +vpcompressw(zmm30 | k2 |T_z, zmm1); +vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvd(ymm5|k3|T_z, ymm2, ptr 
[rax + 0x40]); +vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, 
xmm20, ptr_b [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpexpandb(xmm5|k3|T_z, xmm30); +vpexpandb(ymm5|k3|T_z, ymm30); +vpexpandb(zmm5|k3|T_z, zmm30); +vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(xmm5|k3|T_z, xmm30); +vpexpandw(ymm5|k3|T_z, ymm30); +vpexpandw(zmm5|k3|T_z, zmm30); +vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); +gf2p8affineinvqb(xmm1, xmm2, 3); +gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8affineqb(xmm1, xmm2, 3); +gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8mulb(xmm1, xmm2); +gf2p8mulb(xmm1, ptr [rax + 0x40]); +vgf2p8mulb(xmm1, xmm5, xmm2); +vgf2p8mulb(ymm1, ymm5, ymm2); +vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(xmm30, xmm31, xmm4); +vgf2p8mulb(ymm30, ymm31, ymm4); +vgf2p8mulb(zmm30, zmm31, zmm4); +vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); +vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); +vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); +vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); +vcvtneps2bf16(xmm0, xword [rax + 64]); +vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); +vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); +vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); +vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); +ldtilecfg(ptr[rax + rcx * 4 + 64]); +sttilecfg(ptr[rsp + rax * 8 + 128]); +tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); +tileloaddt1(tmm4, ptr[r8 + r9 + 32]); +tilerelease(); +tilestored(ptr[r10 + r11 * 2 + 32], tmm2); +tilezero(tmm7); +tdpbssd(tmm1, tmm2, tmm3); +tdpbsud(tmm2, tmm3, tmm4); +tdpbusd(tmm3, tmm4, tmm5); +tdpbuud(tmm4, tmm5, tmm6); +tdpbf16ps(tmm5, tmm6, tmm7); +tileloadd(tmm1, ptr[r8+r8]); 
+tileloadd(tmm1, ptr[rax+rcx*4]); +tileloadd(tmm1, ptr[r8+r9*1+0x40]); +vaddph(zmm0, zmm1, ptr[rax+64]); +vaddph(ymm0, ymm1, ptr[rax+64]); +vaddph(xmm0, xmm1, ptr[rax+64]); +vaddph(zmm0, zmm1, ptr_b[rax+64]); +vaddph(ymm0, ymm1, ptr_b[rax+64]); +vaddph(xmm0, xmm1, ptr_b[rax+64]); +vaddsh(xmm0, xmm15, ptr[rax+64]); +vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); +vcmpph(k1, xm15, ptr[rax+64], 1); +vcmpph(k2, ym15, ptr[rax+64], 2); +vcmpph(k3, zm15, ptr[rax+64], 3); +vcmpph(k1, xm15, ptr_b[rax+64], 1); +vcmpph(k2, ym15, ptr_b[rax+64], 2); +vcmpph(k3, zm15, ptr_b[rax+64], 3); +vcmpsh(k1, xm15, ptr[rax+64], 1); +vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); +vcomish(xmm1, ptr[rax+64]); +vcomish(xmm1|T_sae, xmm15); +vucomish(xmm1, ptr [rax+0x40]); +vucomish(xmm1|T_sae, xmm15); +vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]); +vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(xmm1|k3, xmm2, xmm5); +vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); +vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); +vfmaddsub213ph(ymm1|k3, ymm2, ymm5); +vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); +vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); +vfmaddcph(xm1, xm2, ptr[rax+0x40]); +vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); +vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); +vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vrcpph(xmm1, ptr [rax+0x40]); +vrcpph(xmm1, ptr_b [rax+0x40]); +vrcpph(ymm1, ptr [rax+0x40]); +vrcpph(ymm1, ptr_b [rax+0x40]); +vrcpph(zmm1, ptr [rax+0x40]); +vrcpph(zmm1, ptr_b [rax+0x40]); +vrcpsh(xmm1, xmm3, ptr 
[rax+0x40]); +vrsqrtph(xmm1, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr_b [rax+0x40]); +vrsqrtph(ymm2, ptr [rax+0x40]); +vrsqrtph(ymm2, ptr_b [rax+0x40]); +vrsqrtph(zmm2, ptr [rax+0x40]); +vrsqrtph(zmm2, ptr_b [rax+0x40]); +vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); +vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); +vscalefph(xmm1, xmm5, ptr [rax+0x40]); +vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); +vscalefph(ymm1, ymm5, ptr [rax+0x40]); +vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); +vscalefph(zmm1, zmm5, ptr [rax+0x40]); +vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); +vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); +vscalefsh(xmm1, xmm5, ptr [rax+0x40]); +vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); +vreduceph(xmm1, ptr [rax+0x40], 0x1); +vreduceph(xmm1, ptr_b [rax+0x40], 0x2); +vreduceph(ymm1, ptr [rax+0x40], 0x3); +vreduceph(ymm1, ptr_b [rax+0x40], 0x4); +vreduceph(zmm1, ptr [rax+0x40], 0x5); +vreduceph(zmm1, ptr_b [rax+0x40], 0x6); +vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vrndscaleph(xmm1, ptr [rax+0x40], 0x1); +vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); +vrndscaleph(ymm1, ptr [rax+0x40], 0x3); +vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); +vrndscaleph(zmm1, ptr [rax+0x40], 0x5); +vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); +vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vfpclassph(k1, xword [rax+0x40], 0x1); +vfpclassph(k1, xword_b[rax+0x40], 0x2); +vfpclassph(k1, yword [rax+0x40], 0x3); +vfpclassph(k1, yword_b[rax+0x40], 0x4); +vfpclassph(k1, zword [rax+0x40], 0x5); +vfpclassph(k1, zword_b[rax+0x40], 0x6); +vfpclasssh(k1|k2, xmm3, 0x5); +vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); +vgetexpph(xmm1, ptr [rax+0x40]); +vgetexpph(ymm1, ptr_b [rax+0x40]); +vgetexpph(zmm1, ptr [rax+0x40]); +vgetexpph(zmm1|k1|T_z|T_sae, zmm5); +vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); +vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); +vgetmantph(xmm1, ptr [rax+0x40], 0x1); +vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); +vgetmantph(zmm1, ptr [rax+0x40], 0x3); +vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); +vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); +vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); +vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); +vmovsh(ptr [rax+0x40]|k1, xmm1); +vmovsh(xmm1|k2|T_z, xmm3, xmm5); +vmovw(xmm1, r13d); +vmovw(xmm3, ptr [rax+0x40]); +vmovw(r9d, xmm1); +vmovw(ptr [rax+0x40], xmm7); +vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); +vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2si(edx|T_rd_sae, xmm1); +vcvtsh2si(edx, ptr [rax+0x40]); +vcvtsh2si(rdx|T_rd_sae, xmm1); +vcvtsh2si(r8, ptr [rax+0x40]); +vcvtph2dq(xmm1, xmm5); +vcvtph2dq(xmm1, ptr [rax+0x40]); +vcvtph2dq(xmm1, ptr_b [rax+0x40]); +vcvtph2dq(ymm1|k2|T_z, xmm5); +vcvtph2dq(ymm1, ptr [rax+0x40]); +vcvtph2dq(ymm1, ptr_b [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2psx(xmm1, xmm5); +vcvtph2psx(xmm1, ptr [rax+0x40]); 
+vcvtph2psx(xmm1, ptr_b [rax+0x40]); +vcvtph2psx(ymm1|k2|T_z, xmm5); +vcvtph2psx(ymm1, ptr [rax+0x40]); +vcvtph2psx(ymm1, ptr_b [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); +vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2udq(xmm1, xmm5); +vcvtph2udq(xmm1, ptr [rax+0x40]); +vcvtph2udq(xmm1, ptr_b [rax+0x40]); +vcvtph2udq(ymm1|k2|T_z, xmm5); +vcvtph2udq(ymm1, ptr [rax+0x40]); +vcvtph2udq(ymm1, ptr_b [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2dq(xmm1, xmm5); +vcvttph2dq(xmm1, ptr [rax+0x40]); +vcvttph2dq(xmm1, ptr_b [rax+0x40]); +vcvttph2dq(ymm1|k2|T_z, xmm5); +vcvttph2dq(ymm1, ptr [rax+0x40]); +vcvttph2dq(ymm1, ptr_b [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2udq(xmm1, xmm5); +vcvttph2udq(xmm1, ptr [rax+0x40]); +vcvttph2udq(xmm1, ptr_b [rax+0x40]); +vcvttph2udq(ymm1|k2|T_z, xmm5); +vcvttph2udq(ymm1, ptr [rax+0x40]); +vcvttph2udq(ymm1, ptr_b [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2pd(xmm1, xmm5); +vcvtph2pd(xmm1, ptr [rax+0x40]); +vcvtph2pd(xmm1, ptr_b [rax+0x40]); +vcvtph2pd(ymm1|k2|T_z, xmm5); +vcvtph2pd(ymm1, ptr [rax+0x40]); +vcvtph2pd(ymm1, ptr_b [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); +vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2qq(xmm1, xmm5); +vcvtph2qq(xmm1, ptr [rax+0x40]); +vcvtph2qq(xmm1, ptr_b [rax+0x40]); +vcvtph2qq(ymm1|k2|T_z, xmm5); +vcvtph2qq(ymm1, ptr [rax+0x40]); +vcvtph2qq(ymm1, ptr_b [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2uqq(xmm1, xmm5); +vcvtph2uqq(xmm1, ptr [rax+0x40]); +vcvtph2uqq(xmm1, ptr_b [rax+0x40]); +vcvtph2uqq(ymm1|k2|T_z, xmm5); +vcvtph2uqq(ymm1, ptr [rax+0x40]); +vcvtph2uqq(ymm1, ptr_b [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2uqq(xmm1, xmm5); +vcvttph2uqq(xmm1, ptr [rax+0x40]); +vcvttph2uqq(xmm1, ptr_b [rax+0x40]); +vcvttph2uqq(ymm1|k2|T_z, xmm5); +vcvttph2uqq(ymm1, ptr [rax+0x40]); +vcvttph2uqq(ymm1, ptr_b [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtdq2ph(xmm1, xmm5); +vcvtdq2ph(xmm1, xword [rax+0x40]); +vcvtdq2ph(xmm1, xword_b [rax+0x40]); +vcvtdq2ph(xmm1, yword [rax+0x40]); +vcvtdq2ph(xmm1, yword_b [rax+0x40]); +vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtdq2ph(ymm1, ptr [rax+0x40]); +vcvtdq2ph(ymm1, ptr_b [rax+0x40]); +vcvtps2phx(xmm1, xmm5); +vcvtps2phx(xmm1, xword [rax+0x40]); +vcvtps2phx(xmm1, xword_b [rax+0x40]); +vcvtps2phx(xmm1, yword [rax+0x40]); +vcvtps2phx(xmm1, yword_b [rax+0x40]); +vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtps2phx(ymm1, ptr [rax+0x40]); +vcvtps2phx(ymm1, ptr_b [rax+0x40]); +vcvtudq2ph(xmm1, xmm5); +vcvtudq2ph(xmm1, xword [rax+0x40]); +vcvtudq2ph(xmm1, xword_b [rax+0x40]); +vcvtudq2ph(xmm1, yword [rax+0x40]); +vcvtudq2ph(xmm1, yword_b [rax+0x40]); +vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtudq2ph(ymm1, ptr [rax+0x40]); +vcvtudq2ph(ymm1, ptr_b [rax+0x40]); +vcvtpd2ph(xmm1, xmm5); +vcvtpd2ph(xmm1, ymm5); +vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtpd2ph(xmm1, xword [rax+0x40]); +vcvtpd2ph(xmm1, 
xword_b [rax+0x40]); +vcvtpd2ph(xmm1, yword [rax+0x40]); +vcvtpd2ph(xmm1, yword_b [rax+0x40]); +vcvtpd2ph(xmm1, zword [rax+0x40]); +vcvtpd2ph(xmm1, zword_b [rax+0x40]); +vcvtqq2ph(xmm1, xmm5); +vcvtqq2ph(xmm1, ymm5); +vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtqq2ph(xmm1, xword [rax+0x40]); +vcvtqq2ph(xmm1, xword_b [rax+0x40]); +vcvtqq2ph(xmm1, yword [rax+0x40]); +vcvtqq2ph(xmm1, yword_b [rax+0x40]); +vcvtqq2ph(xmm1, zword [rax+0x40]); +vcvtqq2ph(xmm1, zword_b [rax+0x40]); +vcvtuqq2ph(xmm1, xmm5); +vcvtuqq2ph(xmm1, ymm5); +vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuqq2ph(xmm1, xword [rax+0x40]); +vcvtuqq2ph(xmm1, xword_b [rax+0x40]); +vcvtuqq2ph(xmm1, yword [rax+0x40]); +vcvtuqq2ph(xmm1, yword_b [rax+0x40]); +vcvtuqq2ph(xmm1, zword [rax+0x40]); +vcvtuqq2ph(xmm1, zword_b [rax+0x40]); +vcvtph2uw(xmm1, xmm5); +vcvtph2uw(xmm1, ptr [rax+0x40]); +vcvtph2uw(xmm1, ptr_b [rax+0x40]); +vcvtph2uw(ymm1, ptr [rax+0x40]); +vcvtph2uw(ymm1, ptr_b [rax+0x40]); +vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2uw(zmm1, ptr [rax+0x40]); +vcvtph2uw(zmm1, ptr_b [rax+0x40]); +vcvtph2w(xmm1, xmm5); +vcvtph2w(xmm1, ptr [rax+0x40]); +vcvtph2w(xmm1, ptr_b [rax+0x40]); +vcvtph2w(ymm1, ptr [rax+0x40]); +vcvtph2w(ymm1, ptr_b [rax+0x40]); +vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2w(zmm1, ptr [rax+0x40]); +vcvtph2w(zmm1, ptr_b [rax+0x40]); +vcvttph2uw(xmm1, xmm5); +vcvttph2uw(xmm1, ptr [rax+0x40]); +vcvttph2uw(xmm1, ptr_b [rax+0x40]); +vcvttph2uw(ymm1, ptr [rax+0x40]); +vcvttph2uw(ymm1, ptr_b [rax+0x40]); +vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2uw(zmm1, ptr [rax+0x40]); +vcvttph2uw(zmm1, ptr_b [rax+0x40]); +vcvttph2w(xmm1, xmm5); +vcvttph2w(xmm1, ptr [rax+0x40]); +vcvttph2w(xmm1, ptr_b [rax+0x40]); +vcvttph2w(ymm1, ptr [rax+0x40]); +vcvttph2w(ymm1, ptr_b [rax+0x40]); +vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2w(zmm1, ptr [rax+0x40]); +vcvttph2w(zmm1, ptr_b [rax+0x40]); +vcvtuw2ph(xmm1, xmm5); +vcvtuw2ph(xmm1, ptr [rax+0x40]); +vcvtuw2ph(xmm1, ptr_b [rax+0x40]); +vcvtuw2ph(ymm1, ptr [rax+0x40]); +vcvtuw2ph(ymm1, ptr_b [rax+0x40]); +vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuw2ph(zmm1, ptr [rax+0x40]); +vcvtuw2ph(zmm1, ptr_b [rax+0x40]); +vcvtw2ph(xmm1, xmm5); +vcvtw2ph(xmm1, ptr [rax+0x40]); +vcvtw2ph(xmm1, ptr_b [rax+0x40]); +vcvtw2ph(ymm1, ptr [rax+0x40]); +vcvtw2ph(ymm1, ptr_b [rax+0x40]); +vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtw2ph(zmm1, ptr [rax+0x40]); +vcvtw2ph(zmm1, ptr_b [rax+0x40]); +vcvtps2ph(xmm1, xmm2, 0x1); +vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); +vcvtps2ph(xmm1, ymm2, 0x3); +vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); +vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); +vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); +vcvtps2ph(xmm1|k2, ymm4, 0x7); +vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); +vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); +vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); +vcvtsh2usi(ecx|T_rd_sae, xmm1); +vcvtsh2usi(eax, ptr [rax+0x40]); +vcvtsh2usi(r9|T_rd_sae, xmm1); +vcvtsh2usi(r13, ptr [rax+0x40]); +vcvttsh2si(ecx|T_sae, xmm1); +vcvttsh2si(eax, ptr [rax+0x40]); +vcvttsh2si(r9|T_sae, xmm1); +vcvttsh2si(r13, ptr [rax+0x40]); +vcvttsh2usi(ecx|T_sae, xmm1); +vcvttsh2usi(eax, ptr [rax+0x40]); +vcvttsh2usi(r9|T_sae, xmm1); +vcvttsh2usi(r13, ptr [rax+0x40]); +vcvttph2qq(xmm1, xmm5); +vcvttph2qq(xmm1, ptr [rax+0x40]); +vcvttph2qq(xmm1, ptr_b [rax+0x40]); +vcvttph2qq(ymm1|k2|T_z, xmm5); +vcvttph2qq(ymm1, ptr [rax+0x40]); +vcvttph2qq(ymm1, ptr_b [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); 
+vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); +aadd(ptr[rax], ecx); +aadd(ptr[eax], ecx); +aadd(ptr[rax], r10); +aand(ptr[rax], ecx); +aand(ptr[eax], ecx); +aand(ptr[rax], r10); +aor(ptr[rax], ecx); +aor(ptr[eax], ecx); +aor(ptr[rax], r10); +axor(ptr[rax], ecx); +axor(ptr[eax], ecx); +axor(ptr[rax], r10); +cmpbexadd(ptr[rax+r10*4], rcx, rdx); +cmpbxadd(ptr[rax+r10*4], rcx, rdx); +cmplexadd(ptr[rax+r10*4], rcx, rdx); +cmplxadd(ptr[rax+r10*4], rcx, rdx); +cmpnbexadd(ptr[rax+r10*4], rcx, rdx); +cmpnbxadd(ptr[rax+r10*4], rcx, rdx); +cmpnlexadd(ptr[rax+r10*4], rcx, rdx); +cmpnlxadd(ptr[rax+r10*4], rcx, rdx); +cmpnoxadd(ptr[rax+r10*4], rcx, rdx); +cmpnpxadd(ptr[rax+r10*4], rcx, rdx); +cmpnsxadd(ptr[rax+r10*4], rcx, rdx); +cmpnzxadd(ptr[rax+r10*4], rcx, rdx); +cmpoxadd(ptr[rax+r10*4], rcx, rdx); +cmppxadd(ptr[rax+r10*4], rcx, rdx); +cmpsxadd(ptr[rax+r10*4], rcx, rdx); +cmpzxadd(ptr[rax+r10*4], rcx, rdx); +vsha512msg1(ymm3, xmm5); +vsha512msg2(ymm9, ymm10); +vsha512rnds2(ymm1, ymm3, xmm2); +vsm3msg1(xmm1, xmm2, xmm3); +vsm3msg1(xmm1, xmm2, ptr [rax]); +vsm3msg2(xmm5, xmm7, xmm3); +vsm3msg2(xmm5, xmm6, ptr [rax]); +vsm3rnds2(xmm5, xmm7, xmm3, 0x12); +vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); +vsm4key4(xmm1, xmm2, xmm3); +vsm4key4(xmm1, xmm2, ptr [rdx]); +vsm4rnds4(xmm1, xmm2, xmm3); +vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); +vpdpbssd(xmm1, xmm2, xmm3); +vpdpbssd(ymm1, ymm2, ptr [rax]); +vpdpbssds(xmm1, xmm2, xmm3); +vpdpbssds(ymm1, ymm2, ptr [rax]); +vpdpbsud(xmm1, xmm2, xmm3); +vpdpbsud(ymm1, ymm2, ptr [rax]); +vpdpbsuds(xmm1, xmm2, xmm3); +vpdpbsuds(ymm1, ymm2, ptr [rax]); +vpdpbuud(xmm1, xmm2, xmm3); +vpdpbuud(ymm1, ymm2, ptr [rax]); +vpdpbuuds(xmm1, xmm2, xmm3); +vpdpbuuds(ymm1, ymm2, ptr [rax]); +vpdpwsud(xmm1, xmm2, xmm3); +vpdpwsud(ymm1, ymm2, ptr [rax]); +vpdpwsuds(xmm1, xmm2, xmm3); +vpdpwsuds(ymm1, ymm2, ptr [rax]); +vpdpwusd(xmm1, xmm2, xmm3); +vpdpwusd(ymm1, ymm2, ptr [rax]); +vpdpwusds(xmm1, xmm2, xmm3); +vpdpwusds(ymm1, ymm2, ptr [rax]); +vpdpwuud(xmm1, xmm2, xmm3); +vpdpwuud(ymm1, ymm2, ptr [rax]); +vpdpwuuds(xmm1, xmm2, xmm3); +vpdpwuuds(ymm1, ymm2, ptr [rax]); diff --git a/test/target/avx10.txt b/test/target/avx10.txt deleted file mode 100644 index 8ee52ca..0000000 --- a/test/target/avx10.txt +++ /dev/null @@ -1,149 +0,0 @@ -vaddpd(ymm1, ymm2, ymm3 |T_rn_sae); -vaddph(ymm1, ymm2, ymm3 |T_rn_sae); -vaddps(ymm1, ymm2, ymm3 |T_rn_sae); -vcmppd(k1, ymm2, ymm3 |T_sae, 3); -vcmpph(k1, ymm2, ymm3 |T_sae, 3); -vcmpps(k1, ymm2, ymm3 |T_sae, 3); -vcvtdq2ph(xmm1, ymm2 |T_rn_sae); -vcvtdq2ps(ymm1, ymm2 |T_rn_sae); -vcvtpd2dq(xmm1, ymm2 |T_rn_sae); -vcvtpd2ph(xmm1, ymm2 |T_rn_sae); -vcvtpd2ps(xmm1, ymm2 |T_rn_sae); -vcvtpd2qq(ymm1, ymm2 |T_rn_sae); -vcvtpd2udq(xmm1, ymm2 |T_rn_sae); -vcvtpd2uqq(ymm1, ymm2 |T_rn_sae); -vcvtph2dq(ymm1, xmm2 |T_rn_sae); -vcvtph2pd(ymm1, xmm2 |T_sae); -vcvtph2ps(ymm1, xmm2 |T_sae); -vcvtph2psx(ymm1, xmm2 |T_sae); -vcvtph2qq(ymm1, xmm2 |T_rn_sae); -vcvtph2udq(ymm1, xmm2 |T_rn_sae); -vcvtph2uqq(ymm1, xmm2 |T_rn_sae); -vcvtph2uw(ymm1, ymm2 |T_rn_sae); -vcvtph2w(ymm1, ymm2 |T_rn_sae); -vcvtps2dq(ymm1, ymm2 |T_rn_sae); -vcvtps2pd(ymm1, xmm2 |T_sae); -vcvtps2ph(xmm1, ymm2 |T_sae, 3); -vcvtps2phx(xmm1, ymm2 |T_rn_sae); -vcvtps2qq(ymm1, xmm2 |T_rn_sae); -vcvtps2udq(ymm1, ymm2 |T_rn_sae); -vcvtps2uqq(ymm1, xmm2 |T_rn_sae); -vcvtqq2pd(ymm1, 
ymm2 |T_rn_sae); -vcvtqq2ph(xmm1, ymm2 |T_rn_sae); -vcvtqq2ps(xmm1, ymm2 |T_rn_sae); -vcvttpd2dq(xmm1, ymm2 |T_sae); -vcvttpd2qq(ymm1, ymm2 |T_sae); -vcvttpd2udq(xmm1, ymm2 |T_sae); -vcvttpd2uqq(ymm1, ymm2 |T_sae); -vcvttph2dq(ymm1, xmm2 |T_sae); -vcvttph2qq(ymm1, xmm2 |T_sae); -vcvttph2udq(ymm1, xmm2 |T_sae); -vcvttph2uqq(ymm1, xmm2 |T_sae); -vcvttph2uw(ymm1, ymm2 |T_sae); -vcvttph2w(ymm1, ymm2 |T_sae); -vcvttps2dq(ymm1, ymm2 |T_sae); -vcvttps2qq(ymm1, xmm2 |T_sae); -vcvttps2udq(ymm1, ymm2 |T_sae); -vcvttps2uqq(ymm1, xmm2 |T_sae); -vcvtudq2ph(xmm1, ymm2 |T_rn_sae); -vcvtudq2ps(ymm1, ymm2 |T_rn_sae); -vcvtuqq2pd(ymm1, ymm2 |T_rn_sae); -vcvtuqq2ph(xmm1, ymm2 |T_rn_sae); -vcvtuqq2ps(xmm1, ymm2 |T_rn_sae); -vcvtuw2ph(ymm1, ymm2 |T_rn_sae); -vcvtw2ph(ymm1, ymm2 |T_rn_sae); -vdivpd(ymm1, ymm2, ymm3 |T_rn_sae); -vdivph(ymm1, ymm2, ymm3 |T_rn_sae); -vdivps(ymm1, ymm2, ymm3 |T_rn_sae); -vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); -vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae); -vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3); -vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3); -vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); -vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); -vgetexppd(ymm1, ymm2 |T_sae); -vgetexpph(ymm1, ymm2 |T_sae); -vgetexpps(ymm1, 
ymm2 |T_sae); -vgetmantpd(ymm1, ymm2 |T_sae, 3); -vgetmantph(ymm1, ymm2 |T_sae, 3); -vgetmantps(ymm1, ymm2 |T_sae, 3); -vmaxpd(ymm1, ymm2, ymm3 |T_sae); -vmaxph(ymm1, ymm2, ymm3 |T_sae); -vmaxps(ymm1, ymm2, ymm3 |T_sae); -vminpd(ymm1, ymm2, ymm3 |T_sae); -vminph(ymm1, ymm2, ymm3 |T_sae); -vminps(ymm1, ymm2, ymm3 |T_sae); -vmulpd(ymm1, ymm2, ymm3 |T_rn_sae); -vmulph(ymm1, ymm2, ymm3 |T_rn_sae); -vmulps(ymm1, ymm2, ymm3 |T_rn_sae); -vrangepd(ymm1, ymm2, ymm3 |T_sae, 3); -vrangeps(ymm1, ymm2, ymm3 |T_sae, 3); -vreducepd(ymm1, ymm2 |T_sae, 3); -vreduceph(ymm1, ymm2 |T_sae, 3); -vreduceps(ymm1, ymm2 |T_sae, 3); -vrndscalepd(ymm1, ymm2 |T_sae, 3); -vrndscaleph(ymm1, ymm2 |T_sae, 3); -vrndscaleps(ymm1, ymm2 |T_sae, 3); -vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae); -vscalefph(ymm1, ymm2, ymm3 |T_rn_sae); -vscalefps(ymm1, ymm2, ymm3 |T_rn_sae); -vsqrtpd(ymm1, ymm2 |T_rn_sae); -vsqrtph(ymm1, ymm2 |T_rn_sae); -vsqrtps(ymm1, ymm2 |T_rn_sae); -vsubpd(ymm1, ymm2, ymm3 |T_rn_sae); -vsubph(ymm1, ymm2, ymm3 |T_rn_sae); -vsubps(ymm1, ymm2, ymm3 |T_rn_sae); diff --git a/test/target/misc.txt b/test/target/misc.txt deleted file mode 100644 index 9e4f097..0000000 --- a/test/target/misc.txt +++ /dev/null @@ -1,657 +0,0 @@ -v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); -v4fmaddss(xmm15, xmm8, ptr [rax + 64]); -v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); -v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); -vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); -vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); -vaesdec(xmm20, xmm30, ptr [rcx + 64]); -vaesdec(ymm1, ymm2, ptr [rcx + 64]); -vaesdec(zmm1, zmm2, ptr [rcx + 64]); -vaesdeclast(xmm20, xmm30, ptr [rax + 64]); -vaesdeclast(ymm20, ymm30, ptr [rax + 64]); -vaesdeclast(zmm20, zmm30, ptr [rax + 64]); -vaesenc(xmm20, xmm30, ptr [rcx + 64]); -vaesenc(ymm1, ymm2, ptr [rcx + 64]); -vaesenc(zmm1, zmm2, ptr [rcx + 64]); -vaesenclast(xmm20, xmm30, ptr [rax + 64]); -vaesenclast(ymm20, ymm30, ptr [rax + 64]); -vaesenclast(zmm20, zmm30, ptr [rax + 64]); -vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); -vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); -vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); -vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); -vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); -vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); -vpcompressb(ptr[rax + 64], xmm1); -vpcompressb(xmm30 | k5, xmm1); -vpcompressb(ptr[rax + 64], ymm1); -vpcompressb(ymm30 | k3 |T_z, ymm1); -vpcompressb(ptr[rax + 64], zmm1); -vpcompressb(zmm30 | k2 |T_z, zmm1); -vpcompressw(ptr[rax + 64], xmm1); -vpcompressw(xmm30 | k5, xmm1); -vpcompressw(ptr[rax + 64], ymm1); -vpcompressw(ymm30 | k3 |T_z, ymm1); -vpcompressw(ptr[rax + 64], zmm1); -vpcompressw(zmm30 | k2 |T_z, zmm1); -vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); -vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); -vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 
0x40]); -vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); -vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); -vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); -vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); -vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); -vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); -vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); -vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); -vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); -vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); -vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); -vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); -vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); -vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); -vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); -vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); -vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); -vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); -vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); -vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); -vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); -vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); -vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); -vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); -vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); -vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); -vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); -vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); -vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); -vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); -vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); -vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); -vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); -vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); -vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); -vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); -vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); -vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); -vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); -vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); -vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); -vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); -vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); -vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); -vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); -vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); -vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); -vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); -vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); -vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); -vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); -vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); -vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); -vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); -vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); -vpexpandb(xmm5|k3|T_z, xmm30); -vpexpandb(ymm5|k3|T_z, ymm30); 
-vpexpandb(zmm5|k3|T_z, zmm30); -vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); -vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); -vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); -vpexpandw(xmm5|k3|T_z, xmm30); -vpexpandw(ymm5|k3|T_z, ymm30); -vpexpandw(zmm5|k3|T_z, zmm30); -vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); -vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); -vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); -vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); -vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); -vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); -gf2p8affineinvqb(xmm1, xmm2, 3); -gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); -vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); -vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); -vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); -vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); -vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); -vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); -vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); -vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); -vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); -vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); -vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); -vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); -vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); -gf2p8affineqb(xmm1, xmm2, 3); -gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); -vgf2p8affineqb(xmm1, xmm5, xmm2, 3); -vgf2p8affineqb(ymm1, ymm5, ymm2, 3); -vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); -vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); -vgf2p8affineqb(xmm30, xmm31, xmm4, 5); -vgf2p8affineqb(ymm30, ymm31, ymm4, 5); -vgf2p8affineqb(zmm30, zmm31, zmm4, 5); -vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); -vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); -vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); -vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); -vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); -vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); -gf2p8mulb(xmm1, xmm2); -gf2p8mulb(xmm1, ptr [rax + 0x40]); -vgf2p8mulb(xmm1, xmm5, xmm2); -vgf2p8mulb(ymm1, ymm5, ymm2); -vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); -vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); -vgf2p8mulb(xmm30, xmm31, xmm4); -vgf2p8mulb(ymm30, ymm31, ymm4); -vgf2p8mulb(zmm30, zmm31, zmm4); -vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); -vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); -vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); -vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); -vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); -vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); -vcvtneps2bf16(xmm0, xword [rax + 64]); -vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); -vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); -vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); -vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); -vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); -vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); -ldtilecfg(ptr[rax + rcx * 4 + 64]); -sttilecfg(ptr[rsp + rax * 8 + 128]); -tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); -tileloaddt1(tmm4, ptr[r8 + r9 + 32]); -tilerelease(); -tilestored(ptr[r10 + r11 * 2 + 32], tmm2); -tilezero(tmm7); -tdpbssd(tmm1, tmm2, tmm3); -tdpbsud(tmm2, tmm3, tmm4); -tdpbusd(tmm3, tmm4, tmm5); -tdpbuud(tmm4, tmm5, tmm6); -tdpbf16ps(tmm5, tmm6, tmm7); -tileloadd(tmm1, ptr[r8+r8]); -tileloadd(tmm1, ptr[rax+rcx*4]); -tileloadd(tmm1, ptr[r8+r9*1+0x40]); -vaddph(zmm0, zmm1, ptr[rax+64]); -vaddph(ymm0, ymm1, ptr[rax+64]); -vaddph(xmm0, xmm1, ptr[rax+64]); -vaddph(zmm0, zmm1, ptr_b[rax+64]); 
-vaddph(ymm0, ymm1, ptr_b[rax+64]); -vaddph(xmm0, xmm1, ptr_b[rax+64]); -vaddsh(xmm0, xmm15, ptr[rax+64]); -vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); -vcmpph(k1, xm15, ptr[rax+64], 1); -vcmpph(k2, ym15, ptr[rax+64], 2); -vcmpph(k3, zm15, ptr[rax+64], 3); -vcmpph(k1, xm15, ptr_b[rax+64], 1); -vcmpph(k2, ym15, ptr_b[rax+64], 2); -vcmpph(k3, zm15, ptr_b[rax+64], 3); -vcmpsh(k1, xm15, ptr[rax+64], 1); -vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); -vcomish(xmm1, ptr[rax+64]); -vcomish(xmm1|T_sae, xmm15); -vucomish(xmm1, ptr [rax+0x40]); -vucomish(xmm1|T_sae, xmm15); -vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]); -vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); -vfmaddsub213ph(xmm1|k3, xmm2, xmm5); -vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); -vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); -vfmaddsub213ph(ymm1|k3, ymm2, ymm5); -vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); -vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); -vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); -vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); -vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); -vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); -vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); -vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); -vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); -vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); -vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); -vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); -vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); -vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); -vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); -vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); -vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); -vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); -vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); -vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); -vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); -vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); -vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); -vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); -vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); -vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); -vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); -vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); -vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); -vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); -vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); -vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); -vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); -vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); -vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); -vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); -vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); -vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); -vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); -vfmaddcph(xm1, xm2, ptr[rax+0x40]); -vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); -vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); -vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); -vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); -vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); -vfmulcph(xmm1, xmm2, ptr [rax+0x40]); -vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); -vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); -vrcpph(xmm1, ptr [rax+0x40]); -vrcpph(xmm1, ptr_b [rax+0x40]); -vrcpph(ymm1, ptr [rax+0x40]); -vrcpph(ymm1, ptr_b [rax+0x40]); -vrcpph(zmm1, ptr [rax+0x40]); -vrcpph(zmm1, ptr_b [rax+0x40]); -vrcpsh(xmm1, xmm3, ptr [rax+0x40]); -vrsqrtph(xmm1, ptr [rax+0x40]); -vrsqrtph(xmm1, ptr_b [rax+0x40]); -vrsqrtph(ymm2, ptr [rax+0x40]); -vrsqrtph(ymm2, ptr_b [rax+0x40]); -vrsqrtph(zmm2, ptr [rax+0x40]); -vrsqrtph(zmm2, ptr_b 
[rax+0x40]); -vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); -vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); -vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); -vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); -vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); -vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); -vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); -vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); -vscalefph(xmm1, xmm5, ptr [rax+0x40]); -vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); -vscalefph(ymm1, ymm5, ptr [rax+0x40]); -vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); -vscalefph(zmm1, zmm5, ptr [rax+0x40]); -vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); -vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); -vscalefsh(xmm1, xmm5, ptr [rax+0x40]); -vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); -vreduceph(xmm1, ptr [rax+0x40], 0x1); -vreduceph(xmm1, ptr_b [rax+0x40], 0x2); -vreduceph(ymm1, ptr [rax+0x40], 0x3); -vreduceph(ymm1, ptr_b [rax+0x40], 0x4); -vreduceph(zmm1, ptr [rax+0x40], 0x5); -vreduceph(zmm1, ptr_b [rax+0x40], 0x6); -vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); -vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1); -vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); -vrndscaleph(xmm1, ptr [rax+0x40], 0x1); -vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); -vrndscaleph(ymm1, ptr [rax+0x40], 0x3); -vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); -vrndscaleph(zmm1, ptr [rax+0x40], 0x5); -vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); -vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); -vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); -vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); -vfpclassph(k1, xword [rax+0x40], 0x1); -vfpclassph(k1, xword_b[rax+0x40], 0x2); -vfpclassph(k1, yword [rax+0x40], 0x3); -vfpclassph(k1, yword_b[rax+0x40], 0x4); -vfpclassph(k1, zword [rax+0x40], 0x5); -vfpclassph(k1, zword_b[rax+0x40], 0x6); -vfpclasssh(k1|k2, xmm3, 0x5); -vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); -vgetexpph(xmm1, ptr [rax+0x40]); -vgetexpph(ymm1, ptr_b [rax+0x40]); -vgetexpph(zmm1, ptr [rax+0x40]); -vgetexpph(zmm1|k1|T_z|T_sae, zmm5); -vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); -vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); -vgetmantph(xmm1, ptr [rax+0x40], 0x1); -vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); -vgetmantph(zmm1, ptr [rax+0x40], 0x3); -vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); -vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); -vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); -vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); -vmovsh(ptr [rax+0x40]|k1, xmm1); -vmovsh(xmm1|k2|T_z, xmm3, xmm5); -vmovw(xmm1, r13d); -vmovw(xmm3, ptr [rax+0x40]); -vmovw(r9d, xmm1); -vmovw(ptr [rax+0x40], xmm7); -vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); -vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); -vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); -vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); -vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); -vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); -vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); -vcvtsh2si(edx|T_rd_sae, xmm1); -vcvtsh2si(edx, ptr [rax+0x40]); -vcvtsh2si(rdx|T_rd_sae, xmm1); -vcvtsh2si(r8, ptr [rax+0x40]); -vcvtph2dq(xmm1, xmm5); -vcvtph2dq(xmm1, ptr [rax+0x40]); -vcvtph2dq(xmm1, ptr_b [rax+0x40]); -vcvtph2dq(ymm1|k2|T_z, xmm5); -vcvtph2dq(ymm1, ptr [rax+0x40]); -vcvtph2dq(ymm1, ptr_b [rax+0x40]); -vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); -vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtph2psx(xmm1, xmm5); -vcvtph2psx(xmm1, ptr [rax+0x40]); -vcvtph2psx(xmm1, ptr_b [rax+0x40]); -vcvtph2psx(ymm1|k2|T_z, xmm5); -vcvtph2psx(ymm1, ptr [rax+0x40]); -vcvtph2psx(ymm1, ptr_b [rax+0x40]); -vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); -vcvtph2psx(zmm1|k5|T_z, ptr 
[rax+0x40]); -vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtph2udq(xmm1, xmm5); -vcvtph2udq(xmm1, ptr [rax+0x40]); -vcvtph2udq(xmm1, ptr_b [rax+0x40]); -vcvtph2udq(ymm1|k2|T_z, xmm5); -vcvtph2udq(ymm1, ptr [rax+0x40]); -vcvtph2udq(ymm1, ptr_b [rax+0x40]); -vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); -vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvttph2dq(xmm1, xmm5); -vcvttph2dq(xmm1, ptr [rax+0x40]); -vcvttph2dq(xmm1, ptr_b [rax+0x40]); -vcvttph2dq(ymm1|k2|T_z, xmm5); -vcvttph2dq(ymm1, ptr [rax+0x40]); -vcvttph2dq(ymm1, ptr_b [rax+0x40]); -vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); -vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvttph2udq(xmm1, xmm5); -vcvttph2udq(xmm1, ptr [rax+0x40]); -vcvttph2udq(xmm1, ptr_b [rax+0x40]); -vcvttph2udq(ymm1|k2|T_z, xmm5); -vcvttph2udq(ymm1, ptr [rax+0x40]); -vcvttph2udq(ymm1, ptr_b [rax+0x40]); -vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); -vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtph2pd(xmm1, xmm5); -vcvtph2pd(xmm1, ptr [rax+0x40]); -vcvtph2pd(xmm1, ptr_b [rax+0x40]); -vcvtph2pd(ymm1|k2|T_z, xmm5); -vcvtph2pd(ymm1, ptr [rax+0x40]); -vcvtph2pd(ymm1, ptr_b [rax+0x40]); -vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); -vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); -vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtph2qq(xmm1, xmm5); -vcvtph2qq(xmm1, ptr [rax+0x40]); -vcvtph2qq(xmm1, ptr_b [rax+0x40]); -vcvtph2qq(ymm1|k2|T_z, xmm5); -vcvtph2qq(ymm1, ptr [rax+0x40]); -vcvtph2qq(ymm1, ptr_b [rax+0x40]); -vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); -vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtph2uqq(xmm1, xmm5); -vcvtph2uqq(xmm1, ptr [rax+0x40]); -vcvtph2uqq(xmm1, ptr_b [rax+0x40]); -vcvtph2uqq(ymm1|k2|T_z, xmm5); -vcvtph2uqq(ymm1, ptr [rax+0x40]); -vcvtph2uqq(ymm1, ptr_b [rax+0x40]); -vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); -vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvttph2uqq(xmm1, xmm5); -vcvttph2uqq(xmm1, ptr [rax+0x40]); -vcvttph2uqq(xmm1, ptr_b [rax+0x40]); -vcvttph2uqq(ymm1|k2|T_z, xmm5); -vcvttph2uqq(ymm1, ptr [rax+0x40]); -vcvttph2uqq(ymm1, ptr_b [rax+0x40]); -vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); -vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtdq2ph(xmm1, xmm5); -vcvtdq2ph(xmm1, xword [rax+0x40]); -vcvtdq2ph(xmm1, xword_b [rax+0x40]); -vcvtdq2ph(xmm1, yword [rax+0x40]); -vcvtdq2ph(xmm1, yword_b [rax+0x40]); -vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); -vcvtdq2ph(ymm1, ptr [rax+0x40]); -vcvtdq2ph(ymm1, ptr_b [rax+0x40]); -vcvtps2phx(xmm1, xmm5); -vcvtps2phx(xmm1, xword [rax+0x40]); -vcvtps2phx(xmm1, xword_b [rax+0x40]); -vcvtps2phx(xmm1, yword [rax+0x40]); -vcvtps2phx(xmm1, yword_b [rax+0x40]); -vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); -vcvtps2phx(ymm1, ptr [rax+0x40]); -vcvtps2phx(ymm1, ptr_b [rax+0x40]); -vcvtudq2ph(xmm1, xmm5); -vcvtudq2ph(xmm1, xword [rax+0x40]); -vcvtudq2ph(xmm1, xword_b [rax+0x40]); -vcvtudq2ph(xmm1, yword [rax+0x40]); -vcvtudq2ph(xmm1, yword_b [rax+0x40]); -vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); -vcvtudq2ph(ymm1, ptr [rax+0x40]); -vcvtudq2ph(ymm1, ptr_b [rax+0x40]); -vcvtpd2ph(xmm1, xmm5); -vcvtpd2ph(xmm1, ymm5); -vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); -vcvtpd2ph(xmm1, xword [rax+0x40]); -vcvtpd2ph(xmm1, xword_b [rax+0x40]); -vcvtpd2ph(xmm1, yword [rax+0x40]); -vcvtpd2ph(xmm1, yword_b [rax+0x40]); -vcvtpd2ph(xmm1, zword [rax+0x40]); -vcvtpd2ph(xmm1, zword_b [rax+0x40]); -vcvtqq2ph(xmm1, xmm5); -vcvtqq2ph(xmm1, 
ymm5); -vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); -vcvtqq2ph(xmm1, xword [rax+0x40]); -vcvtqq2ph(xmm1, xword_b [rax+0x40]); -vcvtqq2ph(xmm1, yword [rax+0x40]); -vcvtqq2ph(xmm1, yword_b [rax+0x40]); -vcvtqq2ph(xmm1, zword [rax+0x40]); -vcvtqq2ph(xmm1, zword_b [rax+0x40]); -vcvtuqq2ph(xmm1, xmm5); -vcvtuqq2ph(xmm1, ymm5); -vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); -vcvtuqq2ph(xmm1, xword [rax+0x40]); -vcvtuqq2ph(xmm1, xword_b [rax+0x40]); -vcvtuqq2ph(xmm1, yword [rax+0x40]); -vcvtuqq2ph(xmm1, yword_b [rax+0x40]); -vcvtuqq2ph(xmm1, zword [rax+0x40]); -vcvtuqq2ph(xmm1, zword_b [rax+0x40]); -vcvtph2uw(xmm1, xmm5); -vcvtph2uw(xmm1, ptr [rax+0x40]); -vcvtph2uw(xmm1, ptr_b [rax+0x40]); -vcvtph2uw(ymm1, ptr [rax+0x40]); -vcvtph2uw(ymm1, ptr_b [rax+0x40]); -vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); -vcvtph2uw(zmm1, ptr [rax+0x40]); -vcvtph2uw(zmm1, ptr_b [rax+0x40]); -vcvtph2w(xmm1, xmm5); -vcvtph2w(xmm1, ptr [rax+0x40]); -vcvtph2w(xmm1, ptr_b [rax+0x40]); -vcvtph2w(ymm1, ptr [rax+0x40]); -vcvtph2w(ymm1, ptr_b [rax+0x40]); -vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); -vcvtph2w(zmm1, ptr [rax+0x40]); -vcvtph2w(zmm1, ptr_b [rax+0x40]); -vcvttph2uw(xmm1, xmm5); -vcvttph2uw(xmm1, ptr [rax+0x40]); -vcvttph2uw(xmm1, ptr_b [rax+0x40]); -vcvttph2uw(ymm1, ptr [rax+0x40]); -vcvttph2uw(ymm1, ptr_b [rax+0x40]); -vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); -vcvttph2uw(zmm1, ptr [rax+0x40]); -vcvttph2uw(zmm1, ptr_b [rax+0x40]); -vcvttph2w(xmm1, xmm5); -vcvttph2w(xmm1, ptr [rax+0x40]); -vcvttph2w(xmm1, ptr_b [rax+0x40]); -vcvttph2w(ymm1, ptr [rax+0x40]); -vcvttph2w(ymm1, ptr_b [rax+0x40]); -vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); -vcvttph2w(zmm1, ptr [rax+0x40]); -vcvttph2w(zmm1, ptr_b [rax+0x40]); -vcvtuw2ph(xmm1, xmm5); -vcvtuw2ph(xmm1, ptr [rax+0x40]); -vcvtuw2ph(xmm1, ptr_b [rax+0x40]); -vcvtuw2ph(ymm1, ptr [rax+0x40]); -vcvtuw2ph(ymm1, ptr_b [rax+0x40]); -vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); -vcvtuw2ph(zmm1, ptr [rax+0x40]); -vcvtuw2ph(zmm1, ptr_b [rax+0x40]); -vcvtw2ph(xmm1, xmm5); -vcvtw2ph(xmm1, ptr [rax+0x40]); -vcvtw2ph(xmm1, ptr_b [rax+0x40]); -vcvtw2ph(ymm1, ptr [rax+0x40]); -vcvtw2ph(ymm1, ptr_b [rax+0x40]); -vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); -vcvtw2ph(zmm1, ptr [rax+0x40]); -vcvtw2ph(zmm1, ptr_b [rax+0x40]); -vcvtps2ph(xmm1, xmm2, 0x1); -vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); -vcvtps2ph(xmm1, ymm2, 0x3); -vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); -vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); -vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); -vcvtps2ph(xmm1|k2, ymm4, 0x7); -vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); -vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); -vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); -vcvtsh2usi(ecx|T_rd_sae, xmm1); -vcvtsh2usi(eax, ptr [rax+0x40]); -vcvtsh2usi(r9|T_rd_sae, xmm1); -vcvtsh2usi(r13, ptr [rax+0x40]); -vcvttsh2si(ecx|T_sae, xmm1); -vcvttsh2si(eax, ptr [rax+0x40]); -vcvttsh2si(r9|T_sae, xmm1); -vcvttsh2si(r13, ptr [rax+0x40]); -vcvttsh2usi(ecx|T_sae, xmm1); -vcvttsh2usi(eax, ptr [rax+0x40]); -vcvttsh2usi(r9|T_sae, xmm1); -vcvttsh2usi(r13, ptr [rax+0x40]); -vcvttph2qq(xmm1, xmm5); -vcvttph2qq(xmm1, ptr [rax+0x40]); -vcvttph2qq(xmm1, ptr_b [rax+0x40]); -vcvttph2qq(ymm1|k2|T_z, xmm5); -vcvttph2qq(ymm1, ptr [rax+0x40]); -vcvttph2qq(ymm1, ptr_b [rax+0x40]); -vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); -vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); -vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); -vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); -vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); -vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); -vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); -vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); -vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); 
-vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); -vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); -aadd(ptr[rax], ecx); -aadd(ptr[eax], ecx); -aadd(ptr[rax], r10); -aand(ptr[rax], ecx); -aand(ptr[eax], ecx); -aand(ptr[rax], r10); -aor(ptr[rax], ecx); -aor(ptr[eax], ecx); -aor(ptr[rax], r10); -axor(ptr[rax], ecx); -axor(ptr[eax], ecx); -axor(ptr[rax], r10); -cmpbexadd(ptr[rax+r10*4], rcx, rdx); -cmpbxadd(ptr[rax+r10*4], rcx, rdx); -cmplexadd(ptr[rax+r10*4], rcx, rdx); -cmplxadd(ptr[rax+r10*4], rcx, rdx); -cmpnbexadd(ptr[rax+r10*4], rcx, rdx); -cmpnbxadd(ptr[rax+r10*4], rcx, rdx); -cmpnlexadd(ptr[rax+r10*4], rcx, rdx); -cmpnlxadd(ptr[rax+r10*4], rcx, rdx); -cmpnoxadd(ptr[rax+r10*4], rcx, rdx); -cmpnpxadd(ptr[rax+r10*4], rcx, rdx); -cmpnsxadd(ptr[rax+r10*4], rcx, rdx); -cmpnzxadd(ptr[rax+r10*4], rcx, rdx); -cmpoxadd(ptr[rax+r10*4], rcx, rdx); -cmppxadd(ptr[rax+r10*4], rcx, rdx); -cmpsxadd(ptr[rax+r10*4], rcx, rdx); -cmpzxadd(ptr[rax+r10*4], rcx, rdx); -vsha512msg1(ymm3, xmm5); -vsha512msg2(ymm9, ymm10); -vsha512rnds2(ymm1, ymm3, xmm2); -vsm3msg1(xmm1, xmm2, xmm3); -vsm3msg1(xmm1, xmm2, ptr [rax]); -vsm3msg2(xmm5, xmm7, xmm3); -vsm3msg2(xmm5, xmm6, ptr [rax]); -vsm3rnds2(xmm5, xmm7, xmm3, 0x12); -vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); -vsm4key4(xmm1, xmm2, xmm3); -vsm4key4(xmm1, xmm2, ptr [rdx]); -vsm4rnds4(xmm1, xmm2, xmm3); -vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); -vpdpbssd(xmm1, xmm2, xmm3); -vpdpbssd(ymm1, ymm2, ptr [rax]); -vpdpbssds(xmm1, xmm2, xmm3); -vpdpbssds(ymm1, ymm2, ptr [rax]); -vpdpbsud(xmm1, xmm2, xmm3); -vpdpbsud(ymm1, ymm2, ptr [rax]); -vpdpbsuds(xmm1, xmm2, xmm3); -vpdpbsuds(ymm1, ymm2, ptr [rax]); -vpdpbuud(xmm1, xmm2, xmm3); -vpdpbuud(ymm1, ymm2, ptr [rax]); -vpdpbuuds(xmm1, xmm2, xmm3); -vpdpbuuds(ymm1, ymm2, ptr [rax]); -vpdpwsud(xmm1, xmm2, xmm3); -vpdpwsud(ymm1, ymm2, ptr [rax]); -vpdpwsuds(xmm1, xmm2, xmm3); -vpdpwsuds(ymm1, ymm2, ptr [rax]); -vpdpwusd(xmm1, xmm2, xmm3); -vpdpwusd(ymm1, ymm2, ptr [rax]); -vpdpwusds(xmm1, xmm2, xmm3); -vpdpwusds(ymm1, ymm2, ptr [rax]); -vpdpwuud(xmm1, xmm2, xmm3); -vpdpwuud(ymm1, ymm2, ptr [rax]); -vpdpwuuds(xmm1, xmm2, xmm3); -vpdpwuuds(ymm1, ymm2, ptr [rax]); diff --git a/test/test_by_xed.py b/test/test_by_xed.py index 3e4b98f..cd6b7bb 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -273,7 +273,7 @@ def loadFile(name): r = [] for line in f.read().split('\n'): if line: - if line[0] == '#': + if line[0] == '#' or line.startswith('//'): continue r.append(line) return r @@ -287,6 +287,9 @@ def removeExtraInfo(s): def run(cppText, xedText): cpp = loadFile(cppText) xed = loadFile(xedText) + if len(cpp) != len(xed): + raise Exception(f'different line {len(cpp)} {len(xed)}') + for i in range(len(cpp)): line1 = cpp[i] line2 = removeExtraInfo(xed[i]) diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 8316bd9..f98e001 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2047,6 +2047,7 @@ void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B); } void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); } void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB); } +void vaddnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); }
 void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); }
 void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); }
 void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x03, imm); }
--
cgit v1.2.3

From 3ca7e64c63daac8c3dd1c3cbafdc26ac011fa6ab Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo
Date: Fri, 11 Oct 2024 11:50:55 +0900
Subject: add type of w(x, x, op) in avx10 bf16

---
 gen/gen_avx512.cpp     | 12 ++++++++++--
 test/avx10/bf16.txt    | 30 ++++++++++++++++++++++++++++++
 xbyak/xbyak_mnemonic.h |  6 ++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 23923b0..ff1ba30 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -946,18 +946,26 @@ void putFP16_2()
 void putAVX10_BF16()
 {
-	const struct Tbl {
+	// x, x, op
+	const struct xxopTbl {
 		const char *name;
 		uint64_t type;
 		uint8_t code;
 	} tbl[] = {
 		{ "vaddnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x58 },
+		{ "vdivnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5E },
+		{ "vmaxpbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5F },
+		{ "vminpbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5D },
+		{ "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 },
+		{ "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C },
+		{ "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
-		const Tbl& p = tbl[i];
+		const xxopTbl& p = tbl[i];
 		std::string s = type2String(p.type | T_MUST_EVEX);
 		printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%2X); }\n"
 			, p.name, s.c_str(), p.code);
 	}
+// { "vrcppbf16", T_66 | T_MAP6 | T_EW0 | T_YMM | T_B16, 0x4C },
 }
 
 void putFP16()
diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt
index 1c77f93..d8f4c5a 100644
--- a/test/avx10/bf16.txt
+++ b/test/avx10/bf16.txt
@@ -2,3 +2,33 @@ vaddnepbf16(xm1, xm2, xm3);
 vaddnepbf16(ym1|k1, ym2, ptr[rax+128]);
 vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
 vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vdivnepbf16(xm1, xm2, xm3);
+vdivnepbf16(ym1|k1, ym2, ptr[rax+128]);
+vdivnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vdivnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vmaxpbf16(xm1, xm2, xm3);
+vmaxpbf16(ym1|k1, ym2, ptr[rax+128]);
+vmaxpbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vmaxpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vminpbf16(xm1, xm2, xm3);
+vminpbf16(ym1|k1, ym2, ptr[rax+128]);
+vminpbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vminpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vmulnepbf16(xm1, xm2, xm3);
+vmulnepbf16(ym1|k1, ym2, ptr[rax+128]);
+vmulnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vmulnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vscalefpbf16(xm1, xm2, xm3);
+vscalefpbf16(ym1|k1, ym2, ptr[rax+128]);
+vscalefpbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vscalefpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vsubnepbf16(xm1, xm2, xm3);
+vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
+vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index f98e001..f6cafef 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2259,6 +2259,7 @@ void vcvtusi2ss(const Xmm& x1, const Xmm&
x2, const Operand& op) { opCvt3(x1, x2 void vcvtuw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x42, imm); } +void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); } void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); } void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); } void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); } @@ -2350,8 +2351,10 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); } +void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } @@ -2372,6 +2375,7 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 
0x6E); }
+void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); }
 void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }
 void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); }
 void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); }
@@ -2555,6 +2559,7 @@ void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(
 void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); }
 void vrsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); }
 void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); }
+void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); }
 void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); }
 void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); }
 void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x2C); }
@@ -2579,6 +2584,7 @@ void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) {
 void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); }
 void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); }
 void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); }
+void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); }
 void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); }
 void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); }
 void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }
--
cgit v1.2.3

From a84866bcbc8411416e51f53b210c7d9f06e3e763 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo
Date: Fri, 11 Oct 2024 12:21:48 +0900
Subject: add vf[,n]m[add,sub][132,213,231]nebf16

---
 gen/gen_avx512.cpp     | 16 ++++++++++++
 test/avx10/bf16.txt    | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 test/test_by_xed.py    |  9 ++++----
 xbyak/xbyak_mnemonic.h | 12 ++++++++++
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index ff1ba30..b1bf0b1 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -959,6 +959,22 @@ void putAVX10_BF16()
 		{ "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 },
 		{ "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C },
 		{ "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C },
+
+		{ "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 },
+		{ "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xA8 },
+		{ "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 },
+
+		{ "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C },
+		{ "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC },
+		{ "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC },
+
+		{ "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A },
+		{ "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA },
+		{ "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA },
+
+		{ "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E },
+		{ "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE },
+		{ "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const xxopTbl& p = tbl[i];
 		std::string s = type2String(p.type | T_MUST_EVEX);
diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt
index d8f4c5a..7dcdb25 100644
--- a/test/avx10/bf16.txt
+++ b/test/avx10/bf16.txt
@@ -32,3 +32,63 @@ vsubnepbf16(xm1, xm2, xm3);
 vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
 vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
 vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// madd
+vfmadd132nepbf16(xm1, xm2, xm3);
+vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmadd213nepbf16(xm1, xm2, xm3);
+vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmadd231nepbf16(xm1, xm2, xm3);
+vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// nmadd
+vfnmadd132nepbf16(xm1, xm2, xm3);
+vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmadd213nepbf16(xm1, xm2, xm3);
+vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmadd231nepbf16(xm1, xm2, xm3);
+vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// msub
+vfmsub132nepbf16(xm1, xm2, xm3);
+vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmsub213nepbf16(xm1, xm2, xm3);
+vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfmsub231nepbf16(xm1, xm2, xm3);
+vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+// nmsub
+vfnmsub132nepbf16(xm1, xm2, xm3);
+vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmsub213nepbf16(xm1, xm2, xm3);
+vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
+
+vfnmsub231nepbf16(xm1, xm2, xm3);
+vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
+vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
+vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
diff --git a/test/test_by_xed.py b/test/test_by_xed.py
index cd6b7bb..5b84995 100644
--- a/test/test_by_xed.py
+++ b/test/test_by_xed.py
@@ -287,17 +287,18 @@ def removeExtraInfo(s):
 def run(cppText, xedText):
     cpp = loadFile(cppText)
     xed = loadFile(xedText)
-    if len(cpp) != len(xed):
-        raise Exception(f'different line {len(cpp)} {len(xed)}')
+    n = len(cpp)
+    if n != len(xed):
+        raise Exception(f'different line {n} {len(xed)}')
 
-    for i in range(len(cpp)):
+    for i in range(n):
         line1 = cpp[i]
         line2 = removeExtraInfo(xed[i])
         m1 = parseNmemonic(line1)
         m2 = parseNmemonic(line2)
         assertEqual(m1, m2, f'{i+1}')
 
-    print('run ok')
+    print('run ok', n)
 
 def assertEqualStr(a, b, msg=None):
     if str(a) != str(b):
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index f6cafef..7ce61e0 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -2281,36 +2281,48 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) {
 void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); }
 void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
 void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); }
+void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); }
 void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); }
 void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); }
+void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); }
 void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); }
 void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); }
+void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); }
 void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); }
 void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); }
 void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); }
 void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); }
 void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); }
 void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); }
+void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) {
opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); } +void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); } +void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); } void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } +void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } +void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); } +void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); } +void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132sh(const Xmm& x1, const Xmm& x2, 
const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); } +void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); } +void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } -- cgit v1.2.3 From 6dc564185b85d16e240f9f15aad5be08036176d9 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 13:02:17 +0900 Subject: add vcmppbf16, vfpclasspbf16 --- gen/gen_avx512.cpp | 5 +++-- test/avx10/bf16.txt | 17 +++++++++++++++++ xbyak/xbyak_mnemonic.h | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index b1bf0b1..3812cdd 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -946,7 +946,7 @@ void putFP16_2() void putAVX10_BF16() { - // x, x, op + // x, x, op : 8 const struct xxopTbl { const char *name; uint64_t type; @@ -981,7 +981,8 @@ void putAVX10_BF16() std::string s = type2String(p.type | T_MUST_EVEX); printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%2X); }\n" , p.name, s.c_str(), p.code); } -// { "vrcppbf16", T_66 | T_MAP6 | T_EW0 | T_YMM | T_B16, 0x4C }, + puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); + puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); } void putFP16() diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt index 7dcdb25..f3e44a6 100644 --- a/test/avx10/bf16.txt +++ b/test/avx10/bf16.txt @@ -92,3 +92,20 @@ vfnmsub231nepbf16(xm1, xm2, xm3); vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vcmppbf16(k1, xm5, xm4, 5); +vcmppbf16(k2, ym5, ym4, 6); +vcmppbf16(k3, ym15, ptr_b[rax+128], 7); +vcmppbf16(k4, zm30, zm20, 8); +vcmppbf16(k5, zm1, ptr[rax+128], 9); +vcmppbf16(k6, zm10, ptr_b[rax+128], 10); + +vfpclasspbf16(k1, xm4, 5); +vfpclasspbf16(k2|k5, ym4, 6); +vfpclasspbf16(k3|k5, zm20, 7); +vfpclasspbf16(k3|k5, xword[rax+128], 8); +vfpclasspbf16(k3, xword_b[rax+128], 9); +vfpclasspbf16(k5|k5, yword[rax+128], 10); +vfpclasspbf16(k6|k5, yword_b[rax+128], 11); +vfpclasspbf16(k7|k5, zword[rax+128], 12); +vfpclasspbf16(k7|k5, zword_b[rax+128], 13); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 7ce61e0..cfcd6e2 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ 
-2176,6 +2176,7 @@ void vcmpordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 7); } void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); } void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); } +void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); } void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0xC2, imm); } void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0xC2, imm); } void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0xC2, imm); } @@ -2325,6 +2326,7 @@ void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_X void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } +void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm); } void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } -- cgit v1.2.3 From 842c3cc83f209c0789f15a809ae45cf30a8b2f5f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 14:40:00 +0900 Subject: support all avx10 bf16 instructions --- gen/gen_avx512.cpp | 70 +++++++++++++++++------------------ test/avx10/bf16.txt | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 9 +++++ 3 files changed, 143 insertions(+), 35 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 3812cdd..ddf8adc 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -198,6 +198,8 @@ void putX_XM() { 0x7C, "vcvttph2w", T_66 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_SAE_Z }, { 0x7D, "vcvtuw2ph", T_F2 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, { 0x7D, "vcvtw2ph", T_F3 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, + + { 0x51, "vsqrtnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -401,6 +403,30 @@ void putX_X_XM_IMM() { 0x5A, "vcvtsh2sd", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x13, "vcvtsh2ss", 
T_MAP6 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x1D, "vcvtss2sh", T_MAP5 | T_MUST_EVEX | T_EW0 | T_ER_X | T_N4, false }, + + { 0x58, "vaddnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5E, "vdivnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5F, "vmaxpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5D, "vminpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x59, "vmulnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false }, + { 0x5C, "vsubnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + + { 0x98, "vfmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xA8, "vfmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xB8, "vfmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9C, "vfnmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAC, "vfnmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBC, "vfnmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9A, "vfmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAA, "vfmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBA, "vfmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9E, "vfnmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAE, "vfnmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBE, "vfnmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -758,6 +784,15 @@ void putX_XM_IMM() { 0x62, "vpexpandb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_N1, false }, { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, + + { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false }, + { 0x42, "vgetexppbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x08, "vrndscalenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4E, "vrsqrtpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -946,41 +981,6 @@ void putFP16_2() void putAVX10_BF16() { - // x, x, op : 8 - const struct xxopTbl { - const char *name; - uint64_t type; - uint8_t code; - } tbl[] = { - { "vaddnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x58 }, - { "vdivnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5E }, - { "vmaxpbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5F }, - { "vminpbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5D }, - { "vmulnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x59 }, - { "vscalefpbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x2C }, - { "vsubnepbf16", T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, 0x5C }, - - { "vfmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x98 }, - { "vfmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | 
T_B16, 0xA8 }, - { "vfmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xB8 }, - - { "vfnmadd132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9C }, - { "vfnmadd213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAC }, - { "vfnmadd231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBC }, - - { "vfmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9A }, - { "vfmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAA }, - { "vfmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBA }, - - { "vfnmsub132nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0x9E }, - { "vfnmsub213nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xAE }, - { "vfnmsub231nepbf16", T_MAP6 | T_EW0 | T_YMM | T_B16, 0xBE }, - }; - for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const xxopTbl& p = tbl[i]; - std::string s = type2String(p.type | T_MUST_EVEX); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%2X); }\n" , p.name, s.c_str(), p.code); - } puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); } diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt index f3e44a6..c544e02 100644 --- a/test/avx10/bf16.txt +++ b/test/avx10/bf16.txt @@ -109,3 +109,102 @@ vfpclasspbf16(k5|k5, yword[rax+128], 10); vfpclasspbf16(k6|k5, yword_b[rax+128], 11); vfpclasspbf16(k7|k5, zword[rax+128], 12); vfpclasspbf16(k7|k5, zword_b[rax+128], 13); + +vcomsbf16(xm2, xm3); +vcomsbf16(xm2, ptr[rax+128]); + +vgetexppbf16(xm1|k3, xmm2); +vgetexppbf16(xm1|k3, ptr[rax+128]); +vgetexppbf16(xm1|k3, ptr_b[rax+128]); + +vgetexppbf16(ym1|k3, ymm2); +vgetexppbf16(ym1|k3, ptr[rax+128]); +vgetexppbf16(ym1|k3, ptr_b[rax+128]); + +vgetexppbf16(zm1|k3, zmm2); +vgetexppbf16(zm1|k3, ptr[rax+128]); +vgetexppbf16(zm1|k3, ptr_b[rax+128]); + +vgetmantpbf16(xm1|k3, xmm2, 3); +vgetmantpbf16(xm1|k3, ptr[rax+128], 5); +vgetmantpbf16(xm1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(ym1|k3, ymm2, 3); +vgetmantpbf16(ym1|k3, ptr[rax+128], 5); +vgetmantpbf16(ym1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(zm1|k3, zmm2, 3); +vgetmantpbf16(zm1|k3, ptr[rax+128], 5); +vgetmantpbf16(zm1|k3, ptr_b[rax+128], 9); + +vrcppbf16(xm1|k5, xm2); +vrcppbf16(xm1|k5, ptr[rcx+128]); +vrcppbf16(xm1|k5, ptr_b[rcx+128]); + +vrcppbf16(ym1|k5, ym2); +vrcppbf16(ym1|k5, ptr[rcx+128]); +vrcppbf16(ym1|k5, ptr_b[rcx+128]); + +vrcppbf16(zm1|k5, zm2); +vrcppbf16(zm1|k5, ptr[rcx+128]); +vrcppbf16(zm1|k5, ptr_b[rcx+128]); + +vreducenepbf16(xm1|k4, xm2, 1); +vreducenepbf16(xm1|k4, ptr[rax+128], 1); +vreducenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(ym1|k4, ym2, 1); +vreducenepbf16(ym1|k4, ptr[rax+128], 1); +vreducenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(zm1|k4, zm2, 1); +vreducenepbf16(zm1|k4, ptr[rax+128], 1); +vreducenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(xm1|k4, xm2, 1); +vrndscalenepbf16(xm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(ym1|k4, ym2, 1); +vrndscalenepbf16(ym1|k4, ptr[rax+128], 1); +vrndscalenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(zm1|k4, zm2, 1); +vrndscalenepbf16(zm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrsqrtpbf16(xm1|k5, xm2); +vrsqrtpbf16(xm1|k5, ptr[rcx+128]); +vrsqrtpbf16(xm1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(ym1|k5, ym2); +vrsqrtpbf16(ym1|k5, 
ptr[rcx+128]); +vrsqrtpbf16(ym1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(zm1|k5, zm2); +vrsqrtpbf16(zm1|k5, ptr[rcx+128]); +vrsqrtpbf16(zm1|k5, ptr_b[rcx+128]); + +vscalefpbf16(xm1|k5, xm5, xm2); +vscalefpbf16(xm1|k5, xm5, ptr[rcx+128]); +vscalefpbf16(xm1|k5, xm5, ptr_b[rcx+128]); + +vscalefpbf16(ym1|k5, ym9, ym2); +vscalefpbf16(ym1|k5, ym9, ptr[rcx+128]); +vscalefpbf16(ym1|k5, ym9, ptr_b[rcx+128]); + +vscalefpbf16(zm1|k5, zm30, zm2); +vscalefpbf16(zm1|k5, zm30, ptr[rcx+128]); +vscalefpbf16(zm1|k5, zm30, ptr_b[rcx+128]); + +vsqrtnepbf16(xm5|k3, xmm4); +vsqrtnepbf16(xm5|k3, ptr[rax+128]); +vsqrtnepbf16(xm5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(ym5|k3, ymm4); +vsqrtnepbf16(ym5|k3, ptr[rax+128]); +vsqrtnepbf16(ym5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(zm5|k3, zmm4); +vsqrtnepbf16(zm5|k3, ptr[rax+128]); +vsqrtnepbf16(zm5|k3, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index cfcd6e2..93d9350 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2202,6 +2202,7 @@ void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } +void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } @@ -2345,12 +2346,14 @@ void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8|T_66|T_0 void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } +void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } +void vgetmantpbf16(const Xmm& x, const Operand& op, uint8_t imm) 
{ opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x26, imm); } void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x26, imm); } @@ -2549,14 +2552,17 @@ void vrcp28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_ void vrcp28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA); } void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCB); } void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCB); } +void vrcppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4D); } +void vreducenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); } void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x56, imm); } void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x56, imm); } void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x56, imm); } void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } +void vrndscalenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x09, imm); } void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x08, imm); } @@ -2571,8 +2577,10 @@ void vrsqrt28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | void vrsqrt28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC); } void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCD); } void vrsqrt28ss(const Xmm& x1, const Xmm& x2, 
const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); } +void vrsqrtpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); } +void vscalefpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); } void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); } @@ -2596,6 +2604,7 @@ void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); } +void vsqrtnepbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); } void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); } void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); } void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); } -- cgit v1.2.3 From 8457f52cbbac53345ac3211e50a3828c251dc22f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 14:46:17 +0900 Subject: remove unnecessary blank --- gen/gen_avx512.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index ddf8adc..e92f0e2 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -409,7 +409,7 @@ void putX_X_XM_IMM() { 0x5F, "vmaxpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, { 0x5D, "vminpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, { 0x59, "vmulnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, - { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false }, { 0x5C, "vsubnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, { 0x98, "vfmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, -- cgit v1.2.3 From 56fc5457eb15920f93be9b77cce3b5098b5879d7 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 11 Oct 2024 17:44:53 +0900 Subject: add avx10 compare and convert instructions --- gen/gen_avx512.cpp | 61 +++++++++++++---- test/Makefile | 2 +- test/avx10/comp.txt | 17 +++++ test/avx10/convert.txt | 176 
+++++++++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak.h | 12 ++++ xbyak/xbyak_mnemonic.h | 24 ++++++- 6 files changed, 278 insertions(+), 14 deletions(-) create mode 100644 test/avx10/comp.txt create mode 100644 test/avx10/convert.txt diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index e92f0e2..656bf4e 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -137,8 +137,6 @@ void putVcmp() printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : ""); } - puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }"); - puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }"); } void putVcmpAlias() @@ -200,6 +198,17 @@ void putX_XM() { 0x7D, "vcvtw2ph", T_F3 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, { 0x51, "vsqrtnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + + { 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + + { 0x2F, "vcomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2F, "vcomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2F, "vcomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + + { 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -427,6 +436,12 @@ void putX_X_XM_IMM() { 0x9E, "vfnmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, { 0xAE, "vfnmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, { 0xBE, "vfnmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x67, "vcvt2ps2phx", T_MUST_EVEX | T_66 | T_0F38 | T_EW0 | T_YMM | T_B32 | T_ER_Y | T_ER_Z, false }, + { 0x74, "vcvtne2ph2bf8", T_MUST_EVEX | T_F2 | T_0F38 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x74, "vcvtne2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x18, "vcvtne2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -979,12 +994,6 @@ void putFP16_2() } } -void putAVX10_BF16() -{ - puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); - puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); -} - void putFP16() { putFP16_1(); @@ -993,9 +1002,39 @@ void putFP16() putFP16_2(); } -void putAVX10() +void putAVX10_2() { - putAVX10_BF16(); + puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); + puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); + 
+ const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + } tbl1[] = { + { 0x74, "vcvtbiasph2bf8", T_MUST_EVEX | T_0F38 | T_EW0 |T_YMM | T_B16 }, + { 0x74, "vcvtbiasph2bf8s", T_MUST_EVEX | T_MAP5 | T_EW0 |T_YMM | T_B16 }, + { 0x18, "vcvtbiasph2hf8", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtbiasph2hf8s", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl1); i++) { + const Tbl *p = &tbl1[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } + puts("void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); }"); + + const Tbl tbl2[] = { + { 0x74, "vcvtneph2bf8", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_B16 }, + { 0x74, "vcvtneph2bf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x18, "vcvtneph2hf8", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtneph2hf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl2); i++) { + const Tbl *p = &tbl2[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } } int main(int argc, char *[]) @@ -1023,5 +1062,5 @@ int main(int argc, char *[]) putScatter(); putV4FMA(); putFP16(); - putAVX10(); + putAVX10_2(); } diff --git a/test/Makefile b/test/Makefile index 4d0b85d..336dcaf 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,7 +60,7 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=old.txt new-ymm.txt bf16.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt xed_test: @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done diff --git a/test/avx10/comp.txt b/test/avx10/comp.txt new file mode 100644 index 0000000..bfc883e --- /dev/null +++ b/test/avx10/comp.txt @@ -0,0 +1,17 @@ +vcomxsd(xm1, xm2|T_sae); +vcomxsd(xm1, ptr[rax+128]); + +vcomxsh(xm1, xm2|T_sae); +vcomxsh(xm1, ptr[rax+128]); + +vcomxss(xm1, xm2|T_sae); +vcomxss(xm1, ptr[rax+128]); + +vucomxsd(xm1, xm2|T_sae); +vucomxsd(xm1, ptr[rax+128]); + +vucomxsh(xm1, xm2|T_sae); +vucomxsh(xm1, ptr[rax+128]); + +vucomxss(xm1, xm2|T_sae); +vucomxss(xm1, ptr[rax+128]); diff --git a/test/avx10/convert.txt b/test/avx10/convert.txt new file mode 100644 index 0000000..836fcca --- /dev/null +++ b/test/avx10/convert.txt @@ -0,0 +1,176 @@ +vcvt2ps2phx(xm1|k5, xm2, xm3); +vcvt2ps2phx(xm1|k5, xm2, ptr[rax+128]); +vcvt2ps2phx(xm1|k5, xm2, ptr_b[rax+128]); + +vcvt2ps2phx(ym1|k5, ym2, ym3); +vcvt2ps2phx(ym1|k5, ym2, ptr[rax+128]); +vcvt2ps2phx(ym1|k5, ym2, ptr_b[rax+128]); + +vcvt2ps2phx(zm1|k5, zm2, zm3); +vcvt2ps2phx(zm1|k5, zm2, ptr[rax+128]); +vcvt2ps2phx(zm1|k5, zm2, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2bf8(xm1|k2, xm3, xm5); +vcvtbiasph2bf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8(xm1|k2, ym3, ym5); +vcvtbiasph2bf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8(ym1|k2, zm3, zm5); +vcvtbiasph2bf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2bf8s +vcvtbiasph2bf8s(xm1|k2, xm3, xm5); +vcvtbiasph2bf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, 
xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(xm1|k2, ym3, ym5); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(ym1|k2, zm3, zm5); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2hf8(xm1|k2, xm3, xm5); +vcvtbiasph2hf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8(xm1|k2, ym3, ym5); +vcvtbiasph2hf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8(ym1|k2, zm3, zm5); +vcvtbiasph2hf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8s +vcvtbiasph2hf8s(xm1|k2, xm3, xm5); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(xm1|k2, ym3, ym5); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(ym1|k2, zm3, zm5); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr_b[rax+128]); + +vcvthf82ph(xm1|k5|T_z, xm2); +vcvthf82ph(xm1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(ym1|k5|T_z, xm2); +vcvthf82ph(ym1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(zm1|k5|T_z, ym2); +vcvthf82ph(zm1|k5|T_z, ptr[rax+128]); + +// +vcvtne2ph2bf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2bf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// vcvtneph2bf8 +vcvtneph2bf8(xmm1|k2|T_z, xmm2); +vcvtneph2bf8(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2bf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8(xmm1|k2|T_z, ymm2); +vcvtneph2bf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2bf8(ymm1|k2|T_z, zmm2); +vcvtneph2bf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2bf8s +vcvtneph2bf8s(xmm1|k2|T_z, xmm2); +vcvtneph2bf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8s(xmm1|k2|T_z, ymm2); +vcvtneph2bf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, 
yword_b[rax+128]); + +vcvtneph2bf8s(ymm1|k2|T_z, zmm2); +vcvtneph2bf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8s(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8 +vcvtneph2hf8(xmm1|k2|T_z, xmm2); +vcvtneph2hf8(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8(xmm1|k2|T_z, ymm2); +vcvtneph2hf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8(ymm1|k2|T_z, zmm2); +vcvtneph2hf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8s +vcvtneph2hf8s(xmm1|k2|T_z, xmm2); +vcvtneph2hf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8s(xmm1|k2|T_z, ymm2); +vcvtneph2hf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8s(ymm1|k2|T_z, zmm2); +vcvtneph2hf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8s(ymm1|k2|T_z, zword_b[rax+128]); diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index f0d99db..331dbe1 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2559,6 +2559,18 @@ private: Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? Operand::YMM : Operand::ZMM; opVex(x.copyAndSetKind(kind), &xm0, op, type, code); } + // (x, x, x/m), (x, y, y/m), (y, z, z/m) + void opCvt6(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code) + { + int b1 = x1.getBit(); + int b2 = x2.getBit(); + int b3 = op.getBit(); + if ((b1 == 128 && (b2 == 128 || b2 == 256) && (b2 == b3 || op.isMEM())) || (b1 == 256 && b2 == 512 && (b3 == b2 || op.isMEM()))) { + opVex(x1, &x2, op, type, code); + return; + } + XBYAK_THROW(ERR_BAD_COMBINATION); + } const Xmm& cvtIdx0(const Operand& x) const { return x.isZMM() ? zm0 : x.isYMM() ? ym0 : xm0; diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 93d9350..bbbd058 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2199,12 +2199,29 @@ void vcmpunordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); } void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); } void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); } -void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); } +void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } +void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } +void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, 
T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtbiasph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); } +void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); } +void vcvtne2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } @@ -2610,7 +2627,10 @@ void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); } void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } -void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); } +void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, 
T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } -- cgit v1.2.3 From 6e083527507052145d99585eb028342cba81df60 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:14:34 +0900 Subject: move vpmadd52{h,l}uq to avx512 --- gen/gen_avx512.cpp | 3 +++ gen/gen_code.cpp | 2 -- xbyak/xbyak_mnemonic.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 656bf4e..44ef945 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -327,6 +327,9 @@ void putX_X_XM_IMM() { 0x77, "vpermi2ps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false }, { 0x77, "vpermi2pd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0x25, "vpternlogd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, true }, { 0x25, "vpternlogq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, true }, diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index ad6806b..c7ee311 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1893,8 +1893,6 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index bbbd058..891deea 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1468,8 +1468,6 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB5, encoding); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_YMM|T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM|T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, 
T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -2476,6 +2474,8 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_6 void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x44); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB5); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x39); } -- cgit v1.2.3 From dedb7f52af9ee648c41ae27fb48978a5ae68901e Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:16:27 +0900 Subject: add vdpphps --- gen/gen_avx512.cpp | 4 ++++ test/avx10/misc.txt | 13 +++++++++++++ xbyak/xbyak_mnemonic.h | 1 + 3 files changed, 18 insertions(+) create mode 100644 test/avx10/misc.txt diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 44ef945..e7b2336 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -445,6 +445,9 @@ void putX_X_XM_IMM() { 0x74, "vcvtne2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x18, "vcvtne2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + + { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, +// { 0x42, "vmpsadbw", T_MUST_EVEX | T_F3 | T_0F3A | T_EW0 | T_YMM | T_B32, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -452,6 +455,7 @@ void putX_X_XM_IMM() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? 
", imm" : ""); } +// puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_EW0 | T_B32); }"); } void putShift() diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt new file mode 100644 index 0000000..f7d1351 --- /dev/null +++ b/test/avx10/misc.txt @@ -0,0 +1,13 @@ +vdpphps(xm1, xm2, xm3); +vdpphps(xm1, xm2, ptr[rax+128]); +vdpphps(xm1, xm2, ptr_b[rax+128]); + +vdpphps(ym1, ym2, ym3); +vdpphps(ym1, ym2, ptr[rax+128]); +vdpphps(ym1, ym2, ptr_b[rax+128]); + +vdpphps(zm1, zm2, zm3); +vdpphps(zm1, zm2, ptr[rax+128]); +vdpphps(zm1, zm2, ptr_b[rax+128]); + +// skip vmpsadbw diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 891deea..cc88238 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2280,6 +2280,7 @@ void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); } void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); } void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); } +void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x52); } void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); } void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); } void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x88); } -- cgit v1.2.3 From 749aa31dd4a31acf5095cc7db8dfd2d24f0b8787 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:30:39 +0900 Subject: mov vcvtneps2bf16 to avx512 --- gen/gen_avx512.cpp | 2 ++ gen/gen_code.cpp | 1 - xbyak/xbyak_mnemonic.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index e7b2336..109afc6 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -619,6 +619,8 @@ void putCvt() { 0x2A, "vcvtsi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, { 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, + + { 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index c7ee311..764d118 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1802,7 +1802,6 @@ void put() const Tbl& p = tbl[i]; printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); } - printf("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, %s|orEvexIf(encoding), 0x72); }\n", type2String(T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32).c_str()); } // haswell gpr(reg, reg, r/m) { diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index cc88238..daafcd1 100644 --- a/xbyak/xbyak_mnemonic.h +++ 
b/xbyak/xbyak_mnemonic.h @@ -1213,7 +1213,6 @@ void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3| void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38|T_W0|T_YMM, 0xB0); } -void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32|orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -2220,6 +2219,7 @@ void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } +void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } -- cgit v1.2.3 From 7af7abbf954dd729f6eb95ef63495526ffd96b77 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:44:30 +0900 Subject: extend setDefaultEncoding for mpsadbw --- xbyak/xbyak.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 331dbe1..1642290 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2661,13 +2661,13 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, sel), code); } - int orEvexIf(PreferredEncoding encoding) { + int orEvexIf(PreferredEncoding encoding, int sel = 0) { if (encoding == DefaultEncoding) { - encoding = defaultEncoding_; + encoding = defaultEncoding_[sel]; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 @@ -2845,7 +2845,7 @@ public: #endif private: bool isDefaultJmpNEAR_; - PreferredEncoding defaultEncoding_; + PreferredEncoding defaultEncoding_[2]; // 0:vnni, 1:vmpsadbw public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -3131,8 +3131,9 @@ public: , 
es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) - , defaultEncoding_(EvexEncoding) { + defaultEncoding_[0] = EvexEncoding; // use avx512-vnni not avx-vnni + defaultEncoding_[1] = VexEncoding; // use vmpsadbw(avx) not avx10.2 labelMgr_.set(this); } void reset() @@ -3170,7 +3171,8 @@ public: #endif // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + void setDefaultEncoding(PreferredEncoding vnniEnc, PreferredEncoding mpsadbwEnc = VexEncoding) + { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = mpsadbwEnc; } void sha1msg12(const Xmm& x, const Operand& op) { -- cgit v1.2.3 From eca17384c575d06474495e79fad87646bd7d067a Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:57:55 +0900 Subject: add vmpsadbw for avx10.2 --- gen/gen_avx512.cpp | 3 +-- gen/gen_code.cpp | 2 +- test/avx10/misc.txt | 9 ++++++++- test/avx10_test.cpp | 24 ++++++++++++++++++++++++ test/test_by_xed.cpp | 3 +++ test/test_by_xed.py | 5 +++++ xbyak/xbyak.h | 16 ++++++++-------- xbyak/xbyak_mnemonic.h | 2 +- 8 files changed, 51 insertions(+), 13 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 109afc6..9840844 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -447,7 +447,6 @@ void putX_X_XM_IMM() { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, -// { 0x42, "vmpsadbw", T_MUST_EVEX | T_F3 | T_0F3A | T_EW0 | T_YMM | T_B32, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -455,7 +454,7 @@ void putX_X_XM_IMM() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? 
", imm" : ""); } -// puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_EW0 | T_B32); }"); + puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); }"); } void putShift() diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 764d118..58c176a 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -57,7 +57,7 @@ void putX_X_XM(bool omitOnly) { 0x0C, "blendps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x41, "dppd", T_0F3A | T_66 | T_W0, true, true, 3 }, { 0x40, "dpps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, - { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, + { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 1 }, { 0x0E, "pblendw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x02, "pblendd", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 2 }, { 0x0B, "roundsd", T_0F3A | T_66 | T_W0, true, true, 3 }, diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index f7d1351..5c39e81 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -10,4 +10,11 @@ vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]); -// skip vmpsadbw +vmpsadbw(xm1, xm3, xm15, 3); +vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); + +vmpsadbw(ym1|k4, ym3, ym15, 3); +vmpsadbw(ym1, ym4, ptr[rax+128], 5); + +vmpsadbw(zm1|k4, zm3, zm15, 3); +vmpsadbw(zm1, zm4, ptr[rax+128], 5); diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 9a4a848..5f742fe 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -228,3 +228,27 @@ CYBOZU_TEST_AUTO(ymm_with_sae) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(vmpsadbw) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncoding(); + vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) + setDefaultEncoding(VexEncoding, EvexEncoding); + vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) + } + } c; + const uint8_t tbl[] = { + 0xc4, 0xc3, 0x61, 0x42, 0xcf, 0x03, + 0xc4, 0xe3, 0x65, 0x42, 0x88, 0x80, 0x00, 0x00, 0x00, 0x03, + 0x62, 0xd3, 0x66, 0x28, 0x42, 0xcf, 0x03, + 0x62, 0xf3, 0x66, 0x28, 0x42, 0x48, 0x04, 0x03, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 93c370c..ddac779 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -1,10 +1,13 @@ #include #include +using namespace Xbyak; + struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { + setDefaultEncoding(VexEncoding, EvexEncoding); #include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index 5b84995..afd77d8 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -210,6 +210,11 @@ def parseNmemonic(s): args = [] attrs = [] + # remove Xbyak::{Evex,Vex}Encoding + r = re.search(r'(,[^,]*Encoding)', s) + if r: + s = s.replace(r.group(1), '') + (s, broadcast) = parseBroadcast(s) # replace xm0 with xmm0 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 1642290..c5de008 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ 
-2661,11 +2661,11 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int sel = 0) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, sel), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding, int sel = 0) { + int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { if (encoding == DefaultEncoding) { encoding = defaultEncoding_[sel]; } @@ -2673,9 +2673,9 @@ private: #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX; + return T_MUST_EVEX | typeEvex; } - return 0; + return typeVex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -3132,8 +3132,8 @@ public: #endif , isDefaultJmpNEAR_(false) { - defaultEncoding_[0] = EvexEncoding; // use avx512-vnni not avx-vnni - defaultEncoding_[1] = VexEncoding; // use vmpsadbw(avx) not avx10.2 + // select avx512-vnni, vmpsadbw(avx) + setDefaultEncoding(); labelMgr_.set(this); } void reset() @@ -3171,7 +3171,7 @@ public: #endif // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding vnniEnc, PreferredEncoding mpsadbwEnc = VexEncoding) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding mpsadbwEnc = VexEncoding) { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = mpsadbwEnc; } void sha1msg12(const Xmm& x, const Operand& op) diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index daafcd1..8515e41 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1369,7 +1369,6 @@ void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x10); } void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x10); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } @@ -2408,6 +2407,7 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } 
+void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); } void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); } -- cgit v1.2.3 From 2818beeffd198dae543019347360252d0ea7b78f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 13:50:03 +0900 Subject: [skip ci] [doc] setDefaultEncoding for vmpsadbw --- doc/usage.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 0911b91..1ab56e1 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -112,12 +112,24 @@ vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding setDefaultEncoding(VexEncoding); // default encoding is VEX vpdpbusd(xm0, xm1, xm2); // VEX encoding + +vmpsadbw(xm1, xm3, xm15, 3); // default encoding +vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx) +vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2) +setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. +vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2) + ``` -- setDefaultEncoding(PreferredEncoding encoding); - - Set the default encoding to select EVEX or VEX. - - The default value is EvexEncoding. - - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. +- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding mpsadbwEnc = VexEncoding)` + - 1st argument. Set the default encoding to select EVEX or VEX for VNNI + - The default value is EvexEncoding (AVX512_VNNI). + - encoded as AVX-VNNI if VexEncoding is set. + - This parameter affects to vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds. + - 2nd argument. Set the default encoding to select EVEX or VEX for vmpsadbw + - The default value is VexEncoding (AVX/AVX2). + - encoded as AVX10.2 if EvexEncoding is set. + - This parameter affects to vmpsadbw. ### Remark * `k1`, ..., `k7` are opmask registers. -- cgit v1.2.3 From 14ae9bf4859739ca9b23f421d23693a15e75769d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 14:55:20 +0900 Subject: add vpdpbssd for avx10.2 --- gen/gen_avx512.cpp | 33 ++++++++++++++++++++++++++++++++- gen/gen_code.cpp | 2 +- test/avx10/misc.txt | 12 ++++++++++++ xbyak/xbyak_mnemonic.h | 4 ++-- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 9840844..9159a64 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -454,7 +454,37 @@ void putX_X_XM_IMM() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? 
", imm" : ""); } - puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); }"); +} + +void putX_X_XM_IMM_AVX10() +{ + const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + uint64_t typeVex; + uint64_t typeEvex; + int sel; + bool hasIMM; + } tbl[] = { + { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, +#if 0 + { 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false }, + { 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, + { 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, + { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, + { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, +#endif + { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string s = type2String(p->type); + std::string sVex = type2String(p->typeVex); + std::string sEvex = type2String(p->typeEvex); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding, %s, %s, %s, %d); }\n" + , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? "imm" : "NONE", sVex.c_str(), sEvex.c_str(), p->sel); + } } void putShift() @@ -1059,6 +1089,7 @@ int main(int argc, char *[]) putM_X(); putXM_X(); putX_X_XM_IMM(); + putX_X_XM_IMM_AVX10(); putShift(); putExtractInsert(); putCvt(); diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 58c176a..caa9e79 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1907,7 +1907,7 @@ void put() const char *name; uint64_t type; } tbl[] = { - { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 5c39e81..8993107 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -18,3 +18,15 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5); vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1, zm4, ptr[rax+128], 5); + +vpdpbssd(xm1, xm2, xm3); +vpdpbssd(xm1, xm2, ptr[rax+128]); +vpdpbssd(xm1, xm2, ptr_b[rax+128]); + +vpdpbssd(ym1, ym2, ym3); +vpdpbssd(ym1, ym2, ptr[rax+128]); +vpdpbssd(ym1, ym2, ptr_b[rax+128]); + +vpdpbssd(zm1, zm2, zm3); +vpdpbssd(zm1, zm2, ptr[rax+128]); +vpdpbssd(zm1, zm2, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 8515e41..dbe52e9 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1419,7 +1419,6 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x50); } void vpdpbssds(const Xmm& x1, const Xmm& 
x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } @@ -2407,7 +2406,7 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); } +void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); } @@ -2451,6 +2450,7 @@ void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x63); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } -- cgit v1.2.3 From f3f2dd2d748859fd4438ab596950ba52769607a4 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:07:35 +0900 Subject: [skip ci] [doc] update setDefaultEncoding --- doc/usage.md | 17 ++++++++--------- xbyak/xbyak.h | 8 +++++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 1ab56e1..53c0bb9 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -121,15 +121,14 @@ vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2) ``` -- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding mpsadbwEnc = VexEncoding)` - - 1st argument. 
Set the default encoding to select EVEX or VEX for VNNI - - The default value is EvexEncoding (AVX512_VNNI). - - encoded as AVX-VNNI if VexEncoding is set. - - This parameter affects to vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds. - - 2nd argument. Set the default encoding to select EVEX or VEX for vmpsadbw - - The default value is VexEncoding (AVX/AVX2). - - encoded as AVX10.2 if EvexEncoding is set. - - This parameter affects to vmpsadbw. +- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` + +param|vnniEnc|avx10Enc +-|-|- +EvexEncoding|AVX512_VNNI|AVX10.2 +VexEncoding|AVX/AVX2|AVX-VNNI-INT8 +default|EvexEncoding|VexEncoding +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd ### Remark * `k1`, ..., `k7` are opmask registers. diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index c5de008..454613c 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -3170,9 +3170,11 @@ public: #undef jnl #endif - // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding mpsadbwEnc = VexEncoding) - { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = mpsadbwEnc; } + // set default encoding + // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex) + // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) + { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } void sha1msg12(const Xmm& x, const Operand& op) { -- cgit v1.2.3 From f6c66cf6b81f7a063a930cdfc0a62c68e6e2d0fc Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:27:05 +0900 Subject: vpdpb[su,uu,ss]d[,s] support avx10.2 --- doc/usage.md | 30 ++++++++++++++--------- gen/gen_avx512.cpp | 8 ++++--- gen/gen_code.cpp | 10 ++++---- test/avx10/misc.txt | 65 ++++++++++++++++++++++++++++++++++++++++++++++++-- xbyak/xbyak_mnemonic.h | 10 ++++---- 5 files changed, 97 insertions(+), 26 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 53c0bb9..9398755 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -106,29 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit +``` + +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. +Some mnemonics have two types of encodings: VEX and EVEX. +The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. +The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +and can be specified using setDefaultEncoding. 
-vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX +``` +vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above -vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) setDefaultEncoding(VexEncoding); // default encoding is VEX -vpdpbusd(xm0, xm1, xm2); // VEX encoding +vpdpbusd(xm0, xm1, xm2); // VEX -vmpsadbw(xm1, xm3, xm15, 3); // default encoding -vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // vex(avx) -vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // evex(avx10.2) +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) +vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. -vmpsadbw(xm1, xm3, xm15, 3); // evex(avx10.2) - +vmpsadbw(xm1, xm3, xm15, 3); // EVEX ``` - `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` +Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. param|vnniEnc|avx10Enc -|-|- -EvexEncoding|AVX512_VNNI|AVX10.2 -VexEncoding|AVX/AVX2|AVX-VNNI-INT8 +EvexEncoding|AVX512-VNNI|AVX10.2 +VexEncoding|AVX-VNNI|AVX-VNNI-INT8 default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds ### Remark * `k1`, ..., `k7` are opmask registers. diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 9159a64..ed7440c 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -468,10 +468,12 @@ void putX_X_XM_IMM_AVX10() bool hasIMM; } tbl[] = { { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, #if 0 - { 0x51, "vpdpbssds", T_MUST_EVEX | T_YMM | T_F2 | T_0F38 | T_EW0 | T_B32, false }, - { 0x50, "vpdpbsud", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, - { 0x51, "vpdpbsuds", T_MUST_EVEX | T_YMM | T_F3 | T_0F38 | T_EW0 | T_B32, false }, { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, #endif diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index caa9e79..a71d416 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1908,11 +1908,11 @@ void put() uint64_t type; } tbl[] = { // { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, diff --git a/test/avx10/misc.txt 
b/test/avx10/misc.txt index 8993107..380e9a9 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -9,7 +9,7 @@ vdpphps(ym1, ym2, ptr_b[rax+128]); vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]); - +// vmpsadbw(xm1, xm3, xm15, 3); vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); @@ -18,7 +18,7 @@ vmpsadbw(ym1, ym4, ptr[rax+128], 5); vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1, zm4, ptr[rax+128], 5); - +// vpdpbssd(xm1, xm2, xm3); vpdpbssd(xm1, xm2, ptr[rax+128]); vpdpbssd(xm1, xm2, ptr_b[rax+128]); @@ -30,3 +30,64 @@ vpdpbssd(ym1, ym2, ptr_b[rax+128]); vpdpbssd(zm1, zm2, zm3); vpdpbssd(zm1, zm2, ptr[rax+128]); vpdpbssd(zm1, zm2, ptr_b[rax+128]); +// +vpdpbssds(xm1, xm2, xm3); +vpdpbssds(xm1, xm2, ptr[rax+128]); +vpdpbssds(xm1, xm2, ptr_b[rax+128]); + +vpdpbssds(ym1, ym2, ym3); +vpdpbssds(ym1, ym2, ptr[rax+128]); +vpdpbssds(ym1, ym2, ptr_b[rax+128]); + +vpdpbssds(zm1, zm2, zm3); +vpdpbssds(zm1, zm2, ptr[rax+128]); +vpdpbssds(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsud(xm1, xm2, xm3); +vpdpbsud(xm1, xm2, ptr[rax+128]); +vpdpbsud(xm1, xm2, ptr_b[rax+128]); + +vpdpbsud(ym1, ym2, ym3); +vpdpbsud(ym1, ym2, ptr[rax+128]); +vpdpbsud(ym1, ym2, ptr_b[rax+128]); + +vpdpbsud(zm1, zm2, zm3); +vpdpbsud(zm1, zm2, ptr[rax+128]); +vpdpbsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsuds(xm1, xm2, xm3); +vpdpbsuds(xm1, xm2, ptr[rax+128]); +vpdpbsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbsuds(ym1, ym2, ym3); +vpdpbsuds(ym1, ym2, ptr[rax+128]); +vpdpbsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbsuds(zm1, zm2, zm3); +vpdpbsuds(zm1, zm2, ptr[rax+128]); +vpdpbsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpbuud(xm1, xm2, xm3); +vpdpbuud(xm1, xm2, ptr[rax+128]); +vpdpbuud(xm1, xm2, ptr_b[rax+128]); + +vpdpbuud(ym1, ym2, ym3); +vpdpbuud(ym1, ym2, ptr[rax+128]); +vpdpbuud(ym1, ym2, ptr_b[rax+128]); + +vpdpbuud(zm1, zm2, zm3); +vpdpbuud(zm1, zm2, ptr[rax+128]); +vpdpbuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbuuds(xm1, xm2, xm3); +vpdpbuuds(xm1, xm2, ptr[rax+128]); +vpdpbuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbuuds(ym1, ym2, ym3); +vpdpbuuds(ym1, ym2, ptr[rax+128]); +vpdpbuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbuuds(zm1, zm2, zm3); +vpdpbuuds(zm1, zm2, ptr[rax+128]); +vpdpbuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index dbe52e9..c3c6c8b 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1419,13 +1419,8 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } -void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, 
T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } -void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } @@ -2451,6 +2446,11 @@ void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } -- cgit v1.2.3 From 08f71cee951cfdda6b056165e0491b686a2b05bf Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:45:43 +0900 Subject: vpdpw[su,us,uu]d[,s] support avx10.2 --- doc/usage.md | 2 +- gen/gen_avx512.cpp | 14 +++++++--- gen/gen_code.cpp | 14 ++++++---- test/avx10/misc.txt | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 12 ++++---- 5 files changed, 99 insertions(+), 17 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 9398755..5b25513 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -136,7 +136,7 @@ param|vnniEnc|avx10Enc EvexEncoding|AVX512-VNNI|AVX10.2 VexEncoding|AVX-VNNI|AVX-VNNI-INT8 default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, 
vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds ### Remark * `k1`, ..., `k7` are opmask registers. diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index ed7440c..2b8a328 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -467,16 +467,22 @@ void putX_X_XM_IMM_AVX10() int sel; bool hasIMM; } tbl[] = { + // vpdpb[su,uu,ss]d[,s] { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, -#if 0 - { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, - { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, -#endif + + // vpdpw[su,us,uu]d[,s] + { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index a71d416..a22c12b 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1901,6 +1901,7 @@ void put() } // avx-vnni-int8 // avx-vnni-int16 +#if 0 { const struct Tbl { uint8_t code; @@ -1914,12 +1915,12 @@ void put() // { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1927,6 +1928,7 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); } } +#endif } void put32() diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 380e9a9..9464d03 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -91,3 +91,77 @@ vpdpbuuds(ym1, ym2, ptr_b[rax+128]); vpdpbuuds(zm1, zm2, zm3); vpdpbuuds(zm1, zm2, ptr[rax+128]); vpdpbuuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, ptr[rax+128]); +vpdpwsud(zm1, zm2, 
ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); +// +vpdpwusd(xm1, xm2, xm3); +vpdpwusd(xm1, xm2, ptr[rax+128]); +vpdpwusd(xm1, xm2, ptr_b[rax+128]); + +vpdpwusd(ym1, ym2, ym3); +vpdpwusd(ym1, ym2, ptr[rax+128]); +vpdpwusd(ym1, ym2, ptr_b[rax+128]); + +vpdpwusd(zm1, zm2, zm3); +vpdpwusd(zm1, zm2, ptr[rax+128]); +vpdpwusd(zm1, zm2, ptr_b[rax+128]); +// +vpdpwusds(xm1, xm2, xm3); +vpdpwusds(xm1, xm2, ptr[rax+128]); +vpdpwusds(xm1, xm2, ptr_b[rax+128]); + +vpdpwusds(ym1, ym2, ym3); +vpdpwusds(ym1, ym2, ptr[rax+128]); +vpdpwusds(ym1, ym2, ptr_b[rax+128]); + +vpdpwusds(zm1, zm2, zm3); +vpdpwusds(zm1, zm2, ptr[rax+128]); +vpdpwusds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwuud(xm1, xm2, xm3); +vpdpwuud(xm1, xm2, ptr[rax+128]); +vpdpwuud(xm1, xm2, ptr_b[rax+128]); + +vpdpwuud(ym1, ym2, ym3); +vpdpwuud(ym1, ym2, ptr[rax+128]); +vpdpwuud(ym1, ym2, ptr_b[rax+128]); + +vpdpwuud(zm1, zm2, zm3); +vpdpwuud(zm1, zm2, ptr[rax+128]); +vpdpwuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwuuds(xm1, xm2, xm3); +vpdpwuuds(xm1, xm2, ptr[rax+128]); +vpdpwuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwuuds(ym1, ym2, ym3); +vpdpwuuds(ym1, ym2, ptr[rax+128]); +vpdpwuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwuuds(zm1, zm2, zm3); +vpdpwuuds(zm1, zm2, ptr[rax+128]); +vpdpwuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index c3c6c8b..dc2e1ad 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1423,12 +1423,6 @@ void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } -void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 |
T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } @@ -2451,6 +2445,12 @@ void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } -- cgit v1.2.3 From 7c540206d168012b5e151769b2be1681cf07c175 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:51:55 +0900 Subject: v7.10 --- CMakeLists.txt | 2 +- doc/changelog.md | 1 + meson.build | 2 +- readme.md | 2 +- readme.txt | 2 +- xbyak/xbyak.h | 2 +- xbyak/xbyak_mnemonic.h | 2 +- 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79b0f51..72dad78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.09.1) +project(xbyak LANGUAGES CXX VERSION 7.10) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index af0f6aa..5e25c2d 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. 
* 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw * 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}. * 2024/Oct/07 ver 7.08 support rdfsbase etc. diff --git a/meson.build b/meson.build index 0fea416..3fb5e51 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.09.1', + version: '7.10', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 3ee7dd1..49f0a9d 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 7.09.1 [![Badge Build]][Build Status] +# Xbyak 7.10 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* diff --git a/readme.txt b/readme.txt index 768049b..deabcd8 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.09.1 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10 ----------------------------------------------------------------------------- ◎概要 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 454613c..552e451 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7091 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7100 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index dc2e1ad..0397ffd 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.09.1"; } +const char *getVersionString() const { return "7.10"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } -- cgit v1.2.3
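For reference, the API this series converges on is `setDefaultEncoding(vnniEnc, avx10Enc)` with two `PreferredEncoding` slots, plus an optional trailing `PreferredEncoding` argument on vmpsadbw and the new vpdpb*/vpdpw* mnemonics. The sketch below is a hypothetical standalone example, not taken from any commit above; it uses only calls shown in the diffs (setDefaultEncoding, vpdpbusd, vpdpbssd, vmpsadbw, getCode, getSize), assumes an x64 build against the header from this series, and the struct name, main function, and hex-dump output are made up for illustration.

```
// Hypothetical example; assumes x64 and the xbyak header from the series above.
#include <cstdio>
#include <xbyak/xbyak.h>

struct EncodingDemo : Xbyak::CodeGenerator {
	EncodingDemo()
	{
		// Defaults after this series: VNNI group -> EVEX (AVX512-VNNI),
		// vmpsadbw / vpdpbssd group -> VEX (AVX/AVX2, AVX-VNNI-INT8).
		vpdpbusd(xm0, xm1, xm2);                          // EVEX
		vmpsadbw(xm1, xm3, xm15, 3);                      // VEX
		// Per-call override via the trailing PreferredEncoding argument.
		vmpsadbw(xm1, xm3, xm15, 3, Xbyak::EvexEncoding); // EVEX (AVX10.2)
		// Change both defaults: VNNI group -> VEX, AVX10.2 group -> EVEX.
		setDefaultEncoding(Xbyak::VexEncoding, Xbyak::EvexEncoding);
		vpdpbusd(xm0, xm1, xm2);                          // VEX (AVX-VNNI)
		vpdpbssd(xm1, xm2, xm3);                          // EVEX (AVX10.2)
	}
};

int main()
{
	EncodingDemo c;
	// Dump the generated bytes; VEX forms start with 0xC4/0xC5, EVEX forms with 0x62.
	const uint8_t *p = c.getCode();
	for (size_t i = 0; i < c.getSize(); i++) printf("%02x ", (unsigned)p[i]);
	printf("\n");
	return 0;
}
```

The prefix bytes in the dump mirror the expected arrays in the avx10_test.cpp hunk above (0xc4... for the VEX form of vmpsadbw, 0x62... for the EVEX form).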