// Packed BF16 instruction forms, written as Xbyak-style mnemonic calls.
//
// Layout rule: exactly one statement per line. A `//` comment runs to the end
// of the physical line, so a statement must never share a line with a comment
// that precedes it, otherwise the statement is silently disabled.
//
// Operand variants visible below:
//   xmN / ymN / zmN (also spelled xmmN / ymmN / zmmN) - register operands
//   reg|kN                                            - operand combined with opmask kN
//   reg|kN|T_z                                        - as above, plus T_z
//   ptr[...]                                          - memory operand
//   ptr_b[...], xword_b / yword_b / zword_b           - `_b` memory operand variants
//                                                       (presumably embedded broadcast; confirm)

// add
vaddnepbf16(xm1, xm2, xm3);
vaddnepbf16(ym1|k1, ym2, ptr[rax+128]);
vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// div
vdivnepbf16(xm1, xm2, xm3);
vdivnepbf16(ym1|k1, ym2, ptr[rax+128]);
vdivnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vdivnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// max
vmaxpbf16(xm1, xm2, xm3);
vmaxpbf16(ym1|k1, ym2, ptr[rax+128]);
vmaxpbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmaxpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// min
vminpbf16(xm1, xm2, xm3);
vminpbf16(ym1|k1, ym2, ptr[rax+128]);
vminpbf16(ym1|k1, ym2, ptr_b[rax+128]);
vminpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// mul
vmulnepbf16(xm1, xm2, xm3);
vmulnepbf16(ym1|k1, ym2, ptr[rax+128]);
vmulnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmulnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// scalef (a second vscalefpbf16 group using k5 appears further down)
vscalefpbf16(xm1, xm2, xm3);
vscalefpbf16(ym1|k1, ym2, ptr[rax+128]);
vscalefpbf16(ym1|k1, ym2, ptr_b[rax+128]);
vscalefpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// sub
vsubnepbf16(xm1, xm2, xm3);
vsubnepbf16(ym1|k1, ym2, ptr[rax+128]);
vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// madd
vfmadd132nepbf16(xm1, xm2, xm3);
vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfmadd213nepbf16(xm1, xm2, xm3);
vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfmadd231nepbf16(xm1, xm2, xm3);
vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// nmadd
vfnmadd132nepbf16(xm1, xm2, xm3);
vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfnmadd213nepbf16(xm1, xm2, xm3);
vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfnmadd231nepbf16(xm1, xm2, xm3);
vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// msub
vfmsub132nepbf16(xm1, xm2, xm3);
vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfmsub213nepbf16(xm1, xm2, xm3);
vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfmsub231nepbf16(xm1, xm2, xm3);
vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// nmsub
vfnmsub132nepbf16(xm1, xm2, xm3);
vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfnmsub213nepbf16(xm1, xm2, xm3);
vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

vfnmsub231nepbf16(xm1, xm2, xm3);
vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);

// cmp: first operand is an opmask register, last argument is an integer immediate
vcmppbf16(k1, xm5, xm4, 5);
vcmppbf16(k2, ym5, ym4, 6);
vcmppbf16(k3, ym15, ptr_b[rax+128], 7);
vcmppbf16(k4, zm30, zm20, 8);
vcmppbf16(k5, zm1, ptr[rax+128], 9);
vcmppbf16(k6, zm10, ptr_b[rax+128], 10);

// fpclass: memory operands carry an explicit size (xword / yword / zword)
vfpclasspbf16(k1, xm4, 5);
vfpclasspbf16(k2|k5, ym4, 6);
vfpclasspbf16(k3|k5, zm20, 7);
vfpclasspbf16(k3|k5, xword[rax+128], 8);
vfpclasspbf16(k3, xword_b[rax+128], 9);
// NOTE(review): the next row uses k5 on both sides of `|`, unlike its
// neighbours - confirm this is intentional before changing it.
vfpclasspbf16(k5|k5, yword[rax+128], 10);
vfpclasspbf16(k6|k5, yword_b[rax+128], 11);
vfpclasspbf16(k7|k5, zword[rax+128], 12);
vfpclasspbf16(k7|k5, zword_b[rax+128], 13);

// coms
vcomsbf16(xm2, xm3);
vcomsbf16(xm2, ptr[rax+128]);

// getexp: every row below is disabled in the original; kept disabled here.
//vgetexppbf16(xm1|k3, xmm2);
//vgetexppbf16(xm1|k3, ptr[rax+128]);
//vgetexppbf16(xm1|k3, ptr_b[rax+128]);
//vgetexppbf16(ym1|k3, ymm2);
//vgetexppbf16(ym1|k3, ptr[rax+128]);
//vgetexppbf16(ym1|k3, ptr_b[rax+128]);
//vgetexppbf16(zm1|k3, zmm2);
//vgetexppbf16(zm1|k3, ptr[rax+128]);
//vgetexppbf16(zm1|k3, ptr_b[rax+128]);

// getmant
vgetmantpbf16(xm1|k3, xmm2, 3);
vgetmantpbf16(xm1|k3, ptr[rax+128], 5);
vgetmantpbf16(xm1|k3, ptr_b[rax+128], 9);
vgetmantpbf16(ym1|k3, ymm2, 3);
vgetmantpbf16(ym1|k3, ptr[rax+128], 5);
vgetmantpbf16(ym1|k3, ptr_b[rax+128], 9);
vgetmantpbf16(zm1|k3, zmm2, 3);
vgetmantpbf16(zm1|k3, ptr[rax+128], 5);
vgetmantpbf16(zm1|k3, ptr_b[rax+128], 9);

// rcp
vrcppbf16(xm1|k5, xm2);
vrcppbf16(xm1|k5, ptr[rcx+128]);
vrcppbf16(xm1|k5, ptr_b[rcx+128]);
vrcppbf16(ym1|k5, ym2);
vrcppbf16(ym1|k5, ptr[rcx+128]);
vrcppbf16(ym1|k5, ptr_b[rcx+128]);
vrcppbf16(zm1|k5, zm2);
vrcppbf16(zm1|k5, ptr[rcx+128]);
vrcppbf16(zm1|k5, ptr_b[rcx+128]);

// reduce
vreducenepbf16(xm1|k4, xm2, 1);
vreducenepbf16(xm1|k4, ptr[rax+128], 1);
vreducenepbf16(xm1|k4, ptr_b[rax+128], 1);
vreducenepbf16(ym1|k4, ym2, 1);
vreducenepbf16(ym1|k4, ptr[rax+128], 1);
vreducenepbf16(ym1|k4, ptr_b[rax+128], 1);
vreducenepbf16(zm1|k4, zm2, 1);
vreducenepbf16(zm1|k4, ptr[rax+128], 1);
vreducenepbf16(zm1|k4, ptr_b[rax+128], 1);

// rndscale
vrndscalenepbf16(xm1|k4, xm2, 1);
vrndscalenepbf16(xm1|k4, ptr[rax+128], 1);
vrndscalenepbf16(xm1|k4, ptr_b[rax+128], 1);
vrndscalenepbf16(ym1|k4, ym2, 1);
vrndscalenepbf16(ym1|k4, ptr[rax+128], 1);
vrndscalenepbf16(ym1|k4, ptr_b[rax+128], 1);
vrndscalenepbf16(zm1|k4, zm2, 1);
vrndscalenepbf16(zm1|k4, ptr[rax+128], 1);
vrndscalenepbf16(zm1|k4, ptr_b[rax+128], 1);

// rsqrt
vrsqrtpbf16(xm1|k5, xm2);
vrsqrtpbf16(xm1|k5, ptr[rcx+128]);
vrsqrtpbf16(xm1|k5, ptr_b[rcx+128]);
vrsqrtpbf16(ym1|k5, ym2);
vrsqrtpbf16(ym1|k5, ptr[rcx+128]);
vrsqrtpbf16(ym1|k5, ptr_b[rcx+128]);
vrsqrtpbf16(zm1|k5, zm2);
vrsqrtpbf16(zm1|k5, ptr[rcx+128]);
vrsqrtpbf16(zm1|k5, ptr_b[rcx+128]);

// scalef, second group: k5 opmask with register / ptr / ptr_b sources per width
vscalefpbf16(xm1|k5, xm5, xm2);
vscalefpbf16(xm1|k5, xm5, ptr[rcx+128]);
vscalefpbf16(xm1|k5, xm5, ptr_b[rcx+128]);
vscalefpbf16(ym1|k5, ym9, ym2);
vscalefpbf16(ym1|k5, ym9, ptr[rcx+128]);
vscalefpbf16(ym1|k5, ym9, ptr_b[rcx+128]);
vscalefpbf16(zm1|k5, zm30, zm2);
vscalefpbf16(zm1|k5, zm30, ptr[rcx+128]);
vscalefpbf16(zm1|k5, zm30, ptr_b[rcx+128]);

// sqrt
vsqrtnepbf16(xm5|k3, xmm4);
vsqrtnepbf16(xm5|k3, ptr[rax+128]);
vsqrtnepbf16(xm5|k3, ptr_b[rax+128]);
vsqrtnepbf16(ym5|k3, ymm4);
vsqrtnepbf16(ym5|k3, ptr[rax+128]);
vsqrtnepbf16(ym5|k3, ptr_b[rax+128]);
vsqrtnepbf16(zm5|k3, zmm4);
vsqrtnepbf16(zm5|k3, ptr[rax+128]);
vsqrtnepbf16(zm5|k3, ptr_b[rax+128]);