diff options
author | zmt00 <[email protected]> | 2023-12-02 10:36:27 -0800 |
---|---|---|
committer | merry <[email protected]> | 2023-12-02 20:33:45 +0000 |
commit | 4c2bd4ed296170be02ee67d5164d75f5af84ac40 (patch) | |
tree | d52554991c17987abd4b35d53d76efacec5e7a83 | |
parent | d68b916f574d23fe0775da2be387466186c96fe4 (diff) | |
download | dynarmic-4c2bd4ed296170be02ee67d5164d75f5af84ac40.tar.gz dynarmic-4c2bd4ed296170be02ee67d5164d75f5af84ac40.zip |
emit_x64_vector: Add SSE4.1 implementation of VUZP{1,2}.2S
-rw-r--r-- | src/dynarmic/backend/x64/emit_x64_vector.cpp | 34 |
1 file changed, 25 insertions, 9 deletions
diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp index 0da5ca51..5c7c53ee 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -1149,8 +1149,13 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); - code.unpcklps(lhs, rhs); - code.movq(lhs, lhs); + if (code.HasHostFeature(HostFeature::SSE41)) { + // copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes + code.insertps(lhs, rhs, 0b00011100); + } else { + code.unpcklps(lhs, rhs); + code.movq(lhs, lhs); + } ctx.reg_alloc.DefineValue(inst, lhs); } @@ -1229,15 +1234,26 @@ void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); - const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); - code.xorps(zero, zero); - code.unpcklps(lhs, rhs); - code.unpckhpd(lhs, zero); + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); - ctx.reg_alloc.DefineValue(inst, lhs); + // copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes + code.insertps(rhs, lhs, 0b01001100); + + ctx.reg_alloc.DefineValue(inst, rhs); + } else { + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.xorps(zero, zero); + code.unpcklps(lhs, rhs); + code.unpckhpd(lhs, zero); + + ctx.reg_alloc.DefineValue(inst, lhs); + } } void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) { |