diff options
author | Merry <[email protected]> | 2024-01-29 23:47:04 +0000 |
---|---|---|
committer | Merry <[email protected]> | 2024-01-30 00:29:12 +0000 |
commit | 0f20181a45574654e3a81a0dd0dfef3a27730ddf (patch) | |
tree | 40890040e18a6724f56f5fdfb436fc2f9493652c | |
parent | 1e1ba4e0c2096040e34e21f96364067390466c13 (diff) | |
download | dynarmic-0f20181a45574654e3a81a0dd0dfef3a27730ddf.tar.gz dynarmic-0f20181a45574654e3a81a0dd0dfef3a27730ddf.zip |
emit_x64_vector: Fix AVX-512 implementation of EmitVectorTableLookup64
-rw-r--r-- | src/dynarmic/backend/x64/emit_x64_vector.cpp | 76 |
1 files changed, 49 insertions, 27 deletions
diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp index 8fa6f3ea..a91dae92 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -4876,57 +4876,79 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { const bool is_defaults_zero = inst->GetArg(0).IsZero(); if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); - Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]); - - const u8 index_count = u8(table_size * 8); - const u64 index_count64 = mcl::bit::replicate_element<u8, u64>(index_count); + const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(args[2]) : ctx.reg_alloc.UseScratchXmm(args[2]); - Xbyak::Opmask valid_indices = k1; - code.vpcmpb(valid_indices, indicies, code.Const(xword, index_count64, 0), CmpInt::LessThan); + const u64 index_count = mcl::bit::replicate_element<u8, u64>(static_cast<u8>(table_size * 8)); - if (is_defaults_zero) { - defaults = defaults | valid_indices | T_z; - } else { - defaults = defaults | valid_indices; - } + code.vpcmpub(k1, indicies, code.Const(xword, index_count, 0), CmpInt::LessThan); switch (table_size) { case 1: { - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); - code.vpermb(defaults, indicies, xmm_table0); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + if (is_defaults_zero) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.vpermb(result | k1 | T_z, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermb(result | k1, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } break; } case 2: { - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); - code.vpunpcklqdq(xmm_table0, xmm_table0, xmm_table0_upper); - code.vpermb(defaults, indicies, xmm_table0); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); + if (is_defaults_zero) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.vpermb(result | k1 | T_z, indicies, xmm0); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermb(result | k1, indicies, xmm0); + ctx.reg_alloc.DefineValue(inst, result); + } break; } case 3: { - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[2]); - code.vpunpcklqdq(xmm_table0, xmm_table0, xmm_table0_upper); - code.vpermi2b(indicies, xmm_table0, xmm_table1); - code.vmovdqu8(defaults, indicies); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); + if (is_defaults_zero) { + code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermi2b(indicies, xmm0, xmm_table1); + code.vmovdqu8(result | k1, indicies); + ctx.reg_alloc.DefineValue(inst, result); + } break; } case 4: { - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); - code.vpunpcklqdq(xmm_table0, xmm_table0, xmm_table0_upper); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper); - code.vpermi2b(indicies, xmm_table0, xmm_table1); - code.vmovdqu8(defaults, indicies); + if (is_defaults_zero) { + code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermi2b(indicies, xmm0, xmm_table1); + code.vmovdqu8(result | k1, indicies); + ctx.reg_alloc.DefineValue(inst, result); + } break; } + default: + UNREACHABLE(); + break; } - - ctx.reg_alloc.DefineValue(inst, defaults); return; } |