Diffstat (limited to 'js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp')
-rw-r--r--  js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp  1300
1 file changed, 1300 insertions, 0 deletions
diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
new file mode 100644
index 0000000000..71e538004f
--- /dev/null
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@@ -0,0 +1,1300 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/MacroAssembler.h"
+#include "jit/x86-shared/MacroAssembler-x86-shared.h"
+
+#include "jit/MacroAssembler-inl.h"
+
+using namespace js;
+using namespace js::jit;
+
+using mozilla::DebugOnly;
+using mozilla::FloatingPoint;
+using mozilla::Maybe;
+using mozilla::SpecificNaN;
+
+void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
+ ScratchSimd128Scope scratch(asMasm());
+
+ vmovd(input, output);
+ zeroSimd128Int(scratch);
+ vpshufb(scratch, output, output);
+}
+
+void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
+ vmovd(input, output);
+ vpshuflw(0, output, output);
+ vpshufd(0, output, output);
+}
+
+void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
+ vmovd(input, output);
+ vpshufd(0, output, output);
+}
+
+void MacroAssemblerX86Shared::splatX4(FloatRegister input,
+ FloatRegister output) {
+ MOZ_ASSERT(input.isSingle() && output.isSimd128());
+ asMasm().moveSimd128Float(input.asSimd128(), output);
+ vshufps(0, output, output, output);
+}
+
+void MacroAssemblerX86Shared::splatX2(FloatRegister input,
+ FloatRegister output) {
+ MOZ_ASSERT(input.isDouble() && output.isSimd128());
+ asMasm().moveSimd128Float(input.asSimd128(), output);
+ vshufpd(0, output, output, output);
+}
+
+void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
+ Register output,
+ unsigned lane) {
+ if (lane == 0) {
+ // The value we want to extract is in the low double-word
+ moveLowInt32(input, output);
+ } else {
+ vpextrd(lane, input, output);
+ }
+}
+
+void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
+ FloatRegister output,
+ unsigned lane) {
+ MOZ_ASSERT(input.isSimd128() && output.isSingle());
+ if (lane == 0) {
+ // The value we want to extract is in the low double-word
+ if (input.asSingle() != output) {
+ moveFloat32(input, output);
+ }
+ } else if (lane == 2) {
+ moveHighPairToLowPairFloat32(input, output);
+ } else {
+ uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
+ shuffleFloat32(mask, input, output.asSimd128());
+ }
+}
+
+void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
+ FloatRegister output,
+ unsigned lane) {
+ MOZ_ASSERT(input.isSimd128() && output.isDouble());
+ if (lane == 0) {
+ // The value we want to extract is in the low quadword
+ if (input.asDouble() != output) {
+ moveDouble(input, output);
+ }
+ } else {
+ vpalignr(Operand(input), output, 8);
+ }
+}
+
+void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
+ Register output, unsigned lane,
+ SimdSign sign) {
+ vpextrw(lane, input, output);
+ if (sign == SimdSign::Signed) {
+ movswl(output, output);
+ }
+}
+
+void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
+ Register output, unsigned lane,
+ SimdSign sign) {
+ vpextrb(lane, input, output);
+ if (sign == SimdSign::Signed) {
+ movsbl(output, output);
+ }
+}
+
+void MacroAssemblerX86Shared::replaceLaneFloat32x4(FloatRegister rhs,
+ FloatRegister lhsDest,
+ unsigned lane) {
+ MOZ_ASSERT(lhsDest.isSimd128() && rhs.isSingle());
+
+ if (lane == 0) {
+ if (rhs.asSimd128() == lhsDest) {
+ // no-op, although this should not normally happen for type checking
+ // reasons higher up in the stack.
+ } else {
+ // move low dword of value into low dword of output
+ vmovss(rhs, lhsDest, lhsDest);
+ }
+ } else {
+ vinsertps(vinsertpsMask(0, lane), rhs, lhsDest, lhsDest);
+ }
+}
+
+void MacroAssemblerX86Shared::replaceLaneFloat64x2(FloatRegister rhs,
+ FloatRegister lhsDest,
+ unsigned lane) {
+ MOZ_ASSERT(lhsDest.isSimd128() && rhs.isDouble());
+
+ if (lane == 0) {
+ if (rhs.asSimd128() == lhsDest) {
+ // no-op, although this should not normally happen for type checking
+ // reasons higher up in the stack.
+ } else {
+ // move low qword of value into low qword of output
+ vmovsd(rhs, lhsDest, lhsDest);
+ }
+ } else {
+ // move low qword of value into high qword of output
+ vshufpd(0, rhs, lhsDest, lhsDest);
+ }
+}
+
+void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister output,
+ FloatRegister temp,
+ const uint8_t lanes[16]) {
+ MOZ_ASSERT(lhs == output);
+ MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
+
+  // TODO: Consider whether PBLENDVB would be better, even if it is variable
+  // and requires xmm0 to be free and a mask to be loaded.
+
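+  // The sequence below computes the byte-wise select
+  //   output = (lhs & ~lanes) | (rhs & lanes)
+  // using PAND/PANDN/POR.
+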
+ // Set scratch = lanes to select from lhs.
+ int8_t mask[16];
+ for (unsigned i = 0; i < 16; i++) {
+ mask[i] = ~lanes[i];
+ }
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
+ if (lhs == rhs) {
+ asMasm().moveSimd128Int(rhs, temp);
+ rhs = temp;
+ }
+ vpand(Operand(scratch), lhs, lhs);
+ vpandn(Operand(rhs), scratch, scratch);
+ vpor(scratch, lhs, lhs);
+}
+
+void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister output,
+ const uint16_t lanes[8]) {
+ MOZ_ASSERT(lhs == output);
+
+ uint32_t mask = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (lanes[i]) {
+ mask |= (1 << i);
+ }
+ }
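+  // For example, lanes = {0,1,0,1,0,1,0,1} gives mask 0xAA, so vpblendw takes
+  // the odd lanes from rhs and keeps the even lanes of lhs.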
+ vpblendw(mask, rhs, lhs, lhs);
+}
+
+void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
+ FloatRegister rhs,
+ FloatRegister output,
+ const uint8_t lanes[16]) {
+ ScratchSimd128Scope scratch(asMasm());
+
+ // Use pshufb instructions to gather the lanes from each source vector.
+ // A negative index creates a zero lane, so the two vectors can be combined.
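+  // For example, lanes = {0, 16, 1, 17, ...} yields idx = {-1, 0, -1, 1, ...}
+  // for the rhs gather and idx = {0, -1, 1, -1, ...} for the lhs gather; the
+  // final vpor merges them into an interleave of lhs and rhs bytes.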
+
+ // Register preference: lhs == output.
+
+ // Set scratch = lanes from rhs.
+ int8_t idx[16];
+ for (unsigned i = 0; i < 16; i++) {
+ idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
+ }
+ moveSimd128Int(rhs, scratch);
+ asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), scratch);
+
+ // Set output = lanes from lhs.
+ for (unsigned i = 0; i < 16; i++) {
+ idx[i] = lanes[i] < 16 ? lanes[i] : -1;
+ }
+ moveSimd128Int(lhs, output);
+ asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), output);
+
+ // Combine.
+ vpor(scratch, output, output);
+}
+
+static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
+ return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
+}
+
+void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
+ Assembler::Condition cond,
+ FloatRegister output) {
+ static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ vpcmpgtb(rhs, lhs, output);
+ break;
+ case Assembler::Condition::Equal:
+ vpcmpeqb(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThan: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ // src := src > lhs (i.e. lhs < rhs)
+ vpcmpgtb(Operand(lhs), scratch, scratch);
+ moveSimd128Int(scratch, output);
+ break;
+ }
+ case Assembler::Condition::NotEqual:
+ vpcmpeqb(rhs, lhs, output);
+ asMasm().bitwiseXorSimd128(allOnes, output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ vpcmpgtb(Operand(lhs), scratch, scratch);
+ asMasm().loadConstantSimd128Int(allOnes, output);
+ vpxor(Operand(scratch), output, output);
+ break;
+ }
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ vpcmpgtb(rhs, lhs, output);
+ asMasm().bitwiseXorSimd128(allOnes, output);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
+ const SimdConstant& rhs,
+ FloatRegister lhsDest) {
+ bool complement = false;
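+  // The SSE integer comparisons only provide EQ and GT, so NE and LE are
+  // computed by complementing EQ and GT.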
+ switch (cond) {
+ case Assembler::Condition::NotEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::Equal:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqb,
+ &MacroAssembler::vpcmpeqbSimd128);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::GreaterThan:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtb,
+ &MacroAssembler::vpcmpgtbSimd128);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+ if (complement) {
+ asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
+ }
+}
+
+void MacroAssemblerX86Shared::unsignedCompareInt8x16(
+ FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
+ // We widen the inputs to 16 bits, transforming them to nonnegative values;
+ // then compare them as signed using the logic from compareInt8x16(); then
+ // merge the results (which is surprisingly complicated). rhs is left
+ // untouched. The logic is open-coded to streamline it.
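+  // For example, the unsigned bytes 0x80 (128) and 0x01 widen to the words
+  // 0x0080 and 0x0001; both are nonnegative as signed 16-bit values, so a
+  // signed word compare yields the correct unsigned byte ordering.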
+ //
+ // TODO? Rhs could be in memory (for Ion, anyway), in which case loading it
+ // into scratch first would be better than loading it twice from memory.
+
+ MOZ_ASSERT(lhs == output);
+ MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
+ MOZ_ASSERT_IF(
+ rhs.kind() == Operand::FPREG,
+ ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);
+
+ bool complement = false;
+ switch (cond) {
+ case Assembler::Above:
+ case Assembler::BelowOrEqual:
+ complement = cond == Assembler::BelowOrEqual;
+
+ // Low eight bytes of inputs widened to words
+ vpmovzxbw(Operand(lhs), tmp1);
+ vpmovzxbw(rhs, tmp2);
+ // Compare leaving 16-bit results
+ vpcmpgtw(Operand(tmp2), tmp1, tmp1); // lhs < rhs in tmp1
+
+ // High eight bytes of inputs widened to words
+ vpalignr(rhs, tmp2, 8);
+ vpmovzxbw(Operand(tmp2), tmp2);
+ vpalignr(Operand(lhs), output, 8);
+ vpmovzxbw(Operand(output), output);
+ // Compare leaving 16-bit results
+ vpcmpgtw(Operand(tmp2), output, output); // lhs < rhs in output
+
+ break;
+ case Assembler::Below:
+ case Assembler::AboveOrEqual:
+ complement = cond == Assembler::AboveOrEqual;
+
+ // Same as above but with operands reversed
+
+ // Low eight bytes of inputs widened to words
+ vpmovzxbw(Operand(lhs), tmp2);
+ vpmovzxbw(rhs, tmp1);
+ // Compare leaving 16-bit results
+ vpcmpgtw(Operand(tmp2), tmp1, tmp1); // rhs < lhs in tmp1
+
+ // High eight bytes of inputs widened to words
+ vpalignr(Operand(lhs), tmp2, 8);
+ vpmovzxbw(Operand(tmp2), tmp2);
+ vpalignr(rhs, output, 8);
+ vpmovzxbw(Operand(output), output);
+ // Compare leaving 16-bit results
+ vpcmpgtw(Operand(tmp2), output, output); // rhs < lhs in output
+
+ break;
+ default:
+ MOZ_CRASH("Unsupported condition code");
+ }
+
+ // Merge output (results of high byte compares) and tmp1 (results of low byte
+ // compares) by truncating word results to bytes (to avoid signed saturation),
+ // packing, and then concatenating and shifting.
+ vpsrlw(Imm32(8), tmp1, tmp1);
+ vpackuswb(Operand(tmp1), tmp1, tmp1);
+ vpsrlw(Imm32(8), output, output);
+ vpackuswb(Operand(output), output, output);
+ vpalignr(Operand(tmp1), output, 8);
+
+ // Complement when needed for opposite sense of the operator.
+ if (complement) {
+ vpcmpeqd(Operand(tmp1), tmp1, tmp1);
+ vpxor(Operand(tmp1), output, output);
+ }
+}
+
+void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
+ Assembler::Condition cond,
+ FloatRegister output) {
+ static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
+
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ vpcmpgtw(rhs, lhs, output);
+ break;
+ case Assembler::Condition::Equal:
+ vpcmpeqw(rhs, lhs, output);
+ break;
+ case Assembler::Condition::LessThan: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ // src := src > lhs (i.e. lhs < rhs)
+ vpcmpgtw(Operand(lhs), scratch, scratch);
+ moveSimd128Int(scratch, output);
+ break;
+ }
+ case Assembler::Condition::NotEqual:
+ vpcmpeqw(rhs, lhs, output);
+ asMasm().bitwiseXorSimd128(allOnes, output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ vpcmpgtw(Operand(lhs), scratch, scratch);
+ asMasm().loadConstantSimd128Int(allOnes, output);
+ vpxor(Operand(scratch), output, output);
+ break;
+ }
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ vpcmpgtw(rhs, lhs, output);
+ asMasm().bitwiseXorSimd128(allOnes, output);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
+ const SimdConstant& rhs,
+ FloatRegister lhsDest) {
+ bool complement = false;
+ switch (cond) {
+ case Assembler::Condition::NotEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::Equal:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqw,
+ &MacroAssembler::vpcmpeqwSimd128);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::GreaterThan:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtw,
+ &MacroAssembler::vpcmpgtwSimd128);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+ if (complement) {
+ asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
+ }
+}
+
+void MacroAssemblerX86Shared::unsignedCompareInt16x8(
+ FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
+ // See comments at unsignedCompareInt8x16.
+
+ MOZ_ASSERT(lhs == output);
+ MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
+ MOZ_ASSERT_IF(
+ rhs.kind() == Operand::FPREG,
+ ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);
+
+ bool complement = false;
+ switch (cond) {
+ case Assembler::Above:
+ case Assembler::BelowOrEqual:
+ complement = cond == Assembler::BelowOrEqual;
+
+ vpmovzxwd(Operand(lhs), tmp1);
+ vpmovzxwd(rhs, tmp2);
+ vpcmpgtd(Operand(tmp2), tmp1, tmp1);
+
+ vpalignr(rhs, tmp2, 8);
+ vpmovzxwd(Operand(tmp2), tmp2);
+ vpalignr(Operand(lhs), output, 8);
+ vpmovzxwd(Operand(output), output);
+ vpcmpgtd(Operand(tmp2), output, output);
+
+ break;
+ case Assembler::Below:
+ case Assembler::AboveOrEqual:
+ complement = cond == Assembler::AboveOrEqual;
+
+ vpmovzxwd(Operand(lhs), tmp2);
+ vpmovzxwd(rhs, tmp1);
+ vpcmpgtd(Operand(tmp2), tmp1, tmp1);
+
+ vpalignr(Operand(lhs), tmp2, 8);
+ vpmovzxwd(Operand(tmp2), tmp2);
+ vpalignr(rhs, output, 8);
+ vpmovzxwd(Operand(output), output);
+ vpcmpgtd(Operand(tmp2), output, output);
+
+ break;
+ default:
+ MOZ_CRASH();
+ }
+
+ vpsrld(Imm32(16), tmp1, tmp1);
+ vpackusdw(Operand(tmp1), tmp1, tmp1);
+ vpsrld(Imm32(16), output, output);
+ vpackusdw(Operand(output), output, output);
+ vpalignr(Operand(tmp1), output, 8);
+
+ if (complement) {
+ vpcmpeqd(Operand(tmp1), tmp1, tmp1);
+ vpxor(Operand(tmp1), output, output);
+ }
+}
+
+void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
+ Assembler::Condition cond,
+ FloatRegister output) {
+ static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
+ switch (cond) {
+ case Assembler::Condition::GreaterThan:
+ vpcmpgtd(rhs, lhs, lhs);
+ break;
+ case Assembler::Condition::Equal:
+ vpcmpeqd(rhs, lhs, lhs);
+ break;
+ case Assembler::Condition::LessThan: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ // src := src > lhs (i.e. lhs < rhs)
+ vpcmpgtd(Operand(lhs), scratch, scratch);
+ moveSimd128Int(scratch, lhs);
+ break;
+ }
+ case Assembler::Condition::NotEqual:
+ vpcmpeqd(rhs, lhs, lhs);
+ asMasm().bitwiseXorSimd128(allOnes, lhs);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual: {
+ ScratchSimd128Scope scratch(asMasm());
+ // This is bad, but Ion does not use it.
+ // src := rhs
+ if (rhs.kind() == Operand::FPREG) {
+ moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
+ } else {
+ loadAlignedSimd128Int(rhs, scratch);
+ }
+ vpcmpgtd(Operand(lhs), scratch, scratch);
+ asMasm().loadConstantSimd128Int(allOnes, lhs);
+ vpxor(Operand(scratch), lhs, lhs);
+ break;
+ }
+ case Assembler::Condition::LessThanOrEqual:
+ // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
+ vpcmpgtd(rhs, lhs, lhs);
+ asMasm().bitwiseXorSimd128(allOnes, lhs);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
+ const SimdConstant& rhs,
+ FloatRegister lhsDest) {
+ bool complement = false;
+ switch (cond) {
+ case Assembler::Condition::NotEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::Equal:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqd,
+ &MacroAssembler::vpcmpeqdSimd128);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ complement = true;
+ [[fallthrough]];
+ case Assembler::Condition::GreaterThan:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtd,
+ &MacroAssembler::vpcmpgtdSimd128);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+ if (complement) {
+ asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
+ }
+}
+
+void MacroAssemblerX86Shared::unsignedCompareInt32x4(
+ FloatRegister lhs, Operand rhs, Assembler::Condition cond,
+ FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
+  // See comments at unsignedCompareInt8x16; the logic is similar. However we
+ // only have PCMPGTQ on SSE4.2 or later, so for SSE4.1 we need to use subtract
+ // to compute the flags.
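+  //
+  // Zero-extending the 32-bit lanes to 64 bits and subtracting produces the
+  // flag we need in the high doubleword: the high dword of zext(a) - zext(b)
+  // is 0 when a >= b (unsigned) and 0xFFFFFFFF when a < b.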
+
+ MOZ_ASSERT(lhs == output);
+ MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
+ MOZ_ASSERT_IF(
+ rhs.kind() == Operand::FPREG,
+ ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);
+
+ bool complement = false;
+ switch (cond) {
+ case Assembler::Below:
+ case Assembler::AboveOrEqual:
+ complement = cond == Assembler::AboveOrEqual;
+
+ // The effect of the subtract is that the high doubleword of each quadword
+ // becomes either 0 (ge) or -1 (lt).
+
+ vpmovzxdq(Operand(lhs), tmp1);
+ vpmovzxdq(rhs, tmp2);
+ vpsubq(Operand(tmp2), tmp1, tmp1); // flag1 junk flag0 junk
+ vpsrlq(Imm32(32), tmp1, tmp1); // zero flag1 zero flag0
+ vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
+ tmp1); // zero zero flag1 flag0
+
+ vpalignr(rhs, tmp2, 8);
+ vpmovzxdq(Operand(tmp2), tmp2);
+ vpalignr(Operand(lhs), output, 8);
+ vpmovzxdq(Operand(output), output);
+ vpsubq(Operand(tmp2), output, output); // flag3 junk flag2 junk
+ vpsrlq(Imm32(32), output, output); // zero flag3 zero flag2
+ vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
+ output); // flag3 flag2 zero zero
+
+ vpor(Operand(tmp1), output, output);
+ break;
+
+ case Assembler::Above:
+ case Assembler::BelowOrEqual:
+ complement = cond == Assembler::BelowOrEqual;
+
+ // The effect of the subtract is that the high doubleword of each quadword
+ // becomes either 0 (le) or -1 (gt).
+
+ vpmovzxdq(Operand(lhs), tmp2);
+ vpmovzxdq(rhs, tmp1);
+ vpsubq(Operand(tmp2), tmp1, tmp1); // flag1 junk flag0 junk
+ vpsrlq(Imm32(32), tmp1, tmp1); // zero flag1 zero flag0
+ vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
+ tmp1); // zero zero flag1 flag0
+
+ vpalignr(Operand(lhs), tmp2, 8);
+ vpmovzxdq(Operand(tmp2), tmp2);
+ vpalignr(rhs, output, 8);
+ vpmovzxdq(Operand(output), output);
+ vpsubq(Operand(tmp2), output, output); // flag3 junk flag2 junk
+ vpsrlq(Imm32(32), output, output); // zero flag3 zero flag2
+ vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
+ output); // flag3 flag2 zero zero
+
+ vpor(Operand(tmp1), output, output);
+ break;
+
+ default:
+ MOZ_CRASH();
+ }
+
+ if (complement) {
+ vpcmpeqd(Operand(tmp1), tmp1, tmp1);
+ vpxor(Operand(tmp1), output, output);
+ }
+}
+
+void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
+ Assembler::Condition cond,
+ FloatRegister output) {
+ if (HasAVX()) {
+ MOZ_CRASH("Can do better here with three-address compares");
+ }
+
+ // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
+ // This is bad, but Ion does not need this fixup.
+ ScratchSimd128Scope scratch(asMasm());
+ if (!lhs.aliases(output)) {
+ if (rhs.kind() == Operand::FPREG &&
+ output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
+ vmovaps(rhs, scratch);
+ rhs = Operand(scratch);
+ }
+ vmovaps(lhs, output);
+ }
+
+ switch (cond) {
+ case Assembler::Condition::Equal:
+ vcmpeqps(rhs, output);
+ break;
+ case Assembler::Condition::LessThan:
+ vcmpltps(rhs, output);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ vcmpleps(rhs, output);
+ break;
+ case Assembler::Condition::NotEqual:
+ vcmpneqps(rhs, output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ case Assembler::Condition::GreaterThan:
+ // We reverse these operations in the -inl.h file so that we don't have to
+ // copy into and out of temporaries after codegen.
+ MOZ_CRASH("should have reversed this");
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
+ const SimdConstant& rhs,
+ FloatRegister lhsDest) {
+ switch (cond) {
+ case Assembler::Condition::Equal:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpeqps,
+ &MacroAssembler::vcmpeqpsSimd128);
+ break;
+ case Assembler::Condition::LessThan:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpltps,
+ &MacroAssembler::vcmpltpsSimd128);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpleps,
+ &MacroAssembler::vcmplepsSimd128);
+ break;
+ case Assembler::Condition::NotEqual:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpneqps,
+ &MacroAssembler::vcmpneqpsSimd128);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
+ Assembler::Condition cond,
+ FloatRegister output) {
+ if (HasAVX()) {
+ MOZ_CRASH("Can do better here with three-address compares");
+ }
+
+ // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
+ // This is bad, but Ion does not need this fixup.
+ ScratchSimd128Scope scratch(asMasm());
+ if (!lhs.aliases(output)) {
+ if (rhs.kind() == Operand::FPREG &&
+ output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
+ vmovapd(rhs, scratch);
+ rhs = Operand(scratch);
+ }
+ vmovapd(lhs, output);
+ }
+
+ switch (cond) {
+ case Assembler::Condition::Equal:
+ vcmpeqpd(rhs, output);
+ break;
+ case Assembler::Condition::LessThan:
+ vcmpltpd(rhs, output);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ vcmplepd(rhs, output);
+ break;
+ case Assembler::Condition::NotEqual:
+ vcmpneqpd(rhs, output);
+ break;
+ case Assembler::Condition::GreaterThanOrEqual:
+ case Assembler::Condition::GreaterThan:
+ // We reverse these operations in the -inl.h file so that we don't have to
+ // copy into and out of temporaries after codegen.
+ MOZ_CRASH("should have reversed this");
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
+ const SimdConstant& rhs,
+ FloatRegister lhsDest) {
+ switch (cond) {
+ case Assembler::Condition::Equal:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpeqpd,
+ &MacroAssembler::vcmpeqpdSimd128);
+ break;
+ case Assembler::Condition::LessThan:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpltpd,
+ &MacroAssembler::vcmpltpdSimd128);
+ break;
+ case Assembler::Condition::LessThanOrEqual:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmplepd,
+ &MacroAssembler::vcmplepdSimd128);
+ break;
+ case Assembler::Condition::NotEqual:
+ binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpneqpd,
+ &MacroAssembler::vcmpneqpdSimd128);
+ break;
+ default:
+ MOZ_CRASH("unexpected condition op");
+ }
+}
+
+// Semantics of wasm max and min.
+//
+// * -0 < 0
+// * If one input is NaN then that NaN is the output
+// * If both inputs are NaN then the output is selected nondeterministically
+// * Any returned NaN is always made quiet
+// * The MVP spec 2.2.3 says "No distinction is made between signalling and
+// quiet NaNs", suggesting SNaN inputs are allowed and should not fault
+//
+// Semantics of maxps/minps/maxpd/minpd:
+//
+// * If the values are both +/-0 the rhs is returned
+// * If the rhs is SNaN then the rhs is returned
+// * If either value is NaN then the rhs is returned
+// * An SNaN operand does not appear to give rise to an exception, at least
+// not in the JS shell on Linux, though the Intel spec lists Invalid
+// as one of the possible exceptions
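+//
+// The differences above are what minMaxFloat32x4/minMaxFloat64x2 compensate
+// for: the operation is computed in both operand orders and the results are
+// OR-ed (for min) or AND-ed (for max) so that the -0/+0 cases come out right,
+// and lanes where either input was NaN are then overwritten with a quieted
+// NaN taken from the inputs.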
+
+// Various unaddressed considerations:
+//
+// It's pretty insane for this to take an Operand rhs - it really needs to be
+// a register, given the number of times we access it.
+//
+// Constant load can be folded into the ANDPS. Do we care? It won't save us
+// any registers, since output/temp1/temp2/scratch are all live at the same time
+// after the first instruction of the slow path.
+//
+// Can we use blend for the NaN extraction/insertion? We'd need xmm0 for the
+// mask, which is no fun. But it would be lhs UNORD lhs -> mask, blend;
+// rhs UNORD rhs -> mask; blend. Better than the mess we have below. But
+// we'd still need to setup the QNaN bits, unless we can blend those too
+// with the lhs UNORD rhs mask?
+//
+// If we could determine that both input lanes are NaN then the result of the
+// fast path should be fine modulo the QNaN bits, but it's not obvious this is
+// much of an advantage.
+
+void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs_,
+ Operand rhs, FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ ScratchSimd128Scope scratch(asMasm());
+ Label l;
+ SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));
+
+ /* clang-format off */ /* leave my comments alone */
+ FloatRegister lhs = reusedInputSimd128Float(lhs_, scratch);
+ if (isMin) {
+ vmovaps(lhs, output); // compute
+ vminps(rhs, output, output); // min lhs, rhs
+ vmovaps(rhs, temp1); // compute
+ vminps(Operand(lhs), temp1, temp1); // min rhs, lhs
+ vorps(temp1, output, output); // fix min(-0, 0) with OR
+ } else {
+ vmovaps(lhs, output); // compute
+ vmaxps(rhs, output, output); // max lhs, rhs
+ vmovaps(rhs, temp1); // compute
+ vmaxps(Operand(lhs), temp1, temp1); // max rhs, lhs
+ vandps(temp1, output, output); // fix max(-0, 0) with AND
+ }
+ vmovaps(lhs, temp1); // compute
+ vcmpunordps(rhs, temp1); // lhs UNORD rhs
+ vptest(temp1, temp1); // check if any unordered
+ j(Assembler::Equal, &l); // and exit if not
+
+ // Slow path.
+ // output has result for non-NaN lanes, garbage in NaN lanes.
+ // temp1 has lhs UNORD rhs.
+ // temp2 is dead.
+
+ vmovaps(temp1, temp2); // clear NaN lanes of result
+ vpandn(output, temp2, temp2); // result now in temp2
+ asMasm().vpandSimd128(quietBits, temp1); // setup QNaN bits in NaN lanes
+ vorps(temp1, temp2, temp2); // and OR into result
+ vmovaps(lhs, temp1); // find NaN lanes
+ vcmpunordps(Operand(temp1), temp1); // in lhs
+ vmovaps(temp1, output); // (and save them for later)
+ vandps(lhs, temp1, temp1); // and extract the NaNs
+ vorps(temp1, temp2, temp2); // and add to the result
+ vmovaps(rhs, temp1); // find NaN lanes
+ vcmpunordps(Operand(temp1), temp1); // in rhs
+ vpandn(temp1, output, output); // except if they were in lhs
+ vandps(rhs, output, output); // and extract the NaNs
+ vorps(temp2, output, output); // and add to the result
+
+ bind(&l);
+ /* clang-format on */
+}
+
+// Exactly as above.
+void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs_,
+ Operand rhs, FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ ScratchSimd128Scope scratch(asMasm());
+ Label l;
+ SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));
+
+ /* clang-format off */ /* leave my comments alone */
+ FloatRegister lhs = reusedInputSimd128Float(lhs_, scratch);
+ if (isMin) {
+ vmovapd(lhs, output); // compute
+ vminpd(rhs, output, output); // min lhs, rhs
+ vmovapd(rhs, temp1); // compute
+ vminpd(Operand(lhs), temp1, temp1); // min rhs, lhs
+ vorpd(temp1, output, output); // fix min(-0, 0) with OR
+ } else {
+ vmovapd(lhs, output); // compute
+ vmaxpd(rhs, output, output); // max lhs, rhs
+ vmovapd(rhs, temp1); // compute
+ vmaxpd(Operand(lhs), temp1, temp1); // max rhs, lhs
+ vandpd(temp1, output, output); // fix max(-0, 0) with AND
+ }
+ vmovapd(lhs, temp1); // compute
+ vcmpunordpd(rhs, temp1); // lhs UNORD rhs
+ vptest(temp1, temp1); // check if any unordered
+ j(Assembler::Equal, &l); // and exit if not
+
+ // Slow path.
+ // output has result for non-NaN lanes, garbage in NaN lanes.
+ // temp1 has lhs UNORD rhs.
+ // temp2 is dead.
+
+ vmovapd(temp1, temp2); // clear NaN lanes of result
+ vpandn(output, temp2, temp2); // result now in temp2
+ asMasm().vpandSimd128(quietBits, temp1); // setup QNaN bits in NaN lanes
+ vorpd(temp1, temp2, temp2); // and OR into result
+ vmovapd(lhs, temp1); // find NaN lanes
+ vcmpunordpd(Operand(temp1), temp1); // in lhs
+ vmovapd(temp1, output); // (and save them for later)
+ vandpd(lhs, temp1, temp1); // and extract the NaNs
+ vorpd(temp1, temp2, temp2); // and add to the result
+ vmovapd(rhs, temp1); // find NaN lanes
+ vcmpunordpd(Operand(temp1), temp1); // in rhs
+ vpandn(temp1, output, output); // except if they were in lhs
+ vandpd(rhs, output, output); // and extract the NaNs
+ vorpd(temp2, output, output); // and add to the result
+
+ bind(&l);
+ /* clang-format on */
+}
+
+void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs,
+ FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ minMaxFloat32x4(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
+}
+
+void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs,
+ FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ minMaxFloat32x4(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
+}
+
+void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, Operand rhs,
+ FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ minMaxFloat64x2(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
+}
+
+void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
+ FloatRegister temp1,
+ FloatRegister temp2,
+ FloatRegister output) {
+ minMaxFloat64x2(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
+}
+
+static inline void MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask,
+ Register count, Register temp,
+ FloatRegister dest) {
+ masm.mov(count, temp);
+ masm.andl(Imm32(shiftmask), temp);
+ masm.vmovd(temp, dest);
+}
+
+void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
+ FloatRegister in, Register count, Register temp, FloatRegister xtmp,
+ FloatRegister dest,
+ void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
+ FloatRegister),
+ void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
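+  // There is no 8-bit SIMD shift, so widen each eight-byte half of the input
+  // to 16-bit lanes, shift as words, mask each word down to its low byte so
+  // the final unsigned pack cannot saturate, and pack the halves back into
+  // bytes.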
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 7, count, temp, scratch);
+
+ // High bytes
+ vpalignr(Operand(in), xtmp, 8);
+ (this->*extend)(Operand(xtmp), xtmp);
+ (this->*shift)(scratch, xtmp, xtmp);
+
+ // Low bytes
+ (this->*extend)(Operand(dest), dest);
+ (this->*shift)(scratch, dest, dest);
+
+ // Mask off garbage to avoid saturation during packing
+ asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
+ scratch);
+ vpand(Operand(scratch), xtmp, xtmp);
+ vpand(Operand(scratch), dest, dest);
+
+ vpackuswb(Operand(xtmp), dest, dest);
+}
+
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
+ FloatRegister in, Register count, Register temp, FloatRegister xtmp,
+ FloatRegister dest) {
+ packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
+ &MacroAssemblerX86Shared::vpsllw,
+ &MacroAssemblerX86Shared::vpmovzxbw);
+}
+
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
+ Imm32 count, FloatRegister src, FloatRegister dest) {
+ MOZ_ASSERT(count.value <= 7);
+ asMasm().moveSimd128(src, dest);
+  // Use the doubling trick for low shift counts; otherwise mask off the bits
+  // that are shifted out of the low byte of each word and use word shifts. The
+  // optimal cutoff remains to be explored.
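+  // For example, a shift by 2 is two byte-wise self-additions, while a shift
+  // by 5 masks each byte with 0xFF >> 5 = 0x07 and then shifts whole words
+  // by 5.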
+ if (count.value <= 3) {
+ for (int32_t shift = count.value; shift > 0; --shift) {
+ asMasm().addInt8x16(dest, dest);
+ }
+ } else {
+ asMasm().bitwiseAndSimd128(SimdConstant::SplatX16(0xFF >> count.value),
+ dest);
+ vpsllw(count, dest, dest);
+ }
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
+ FloatRegister in, Register count, Register temp, FloatRegister xtmp,
+ FloatRegister dest) {
+ packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
+ &MacroAssemblerX86Shared::vpsraw,
+ &MacroAssemblerX86Shared::vpmovsxbw);
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
+ Imm32 count, FloatRegister src, FloatRegister temp, FloatRegister dest) {
+ MOZ_ASSERT(count.value <= 7);
+ ScratchSimd128Scope scratch(asMasm());
+
+ asMasm().moveSimd128(src, scratch);
+ vpslldq(Imm32(1), scratch, scratch); // Low bytes -> high bytes
+ vpsraw(Imm32(count.value + 8), scratch, scratch); // Shift low bytes
+ asMasm().moveSimd128(src, dest);
+ vpsraw(count, dest, dest); // Shift high bytes
+ asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
+ vpand(Operand(temp), dest, dest); // Keep high bytes
+ vpandn(Operand(scratch), temp, temp); // Keep low bytes
+ vpor(Operand(temp), dest, dest); // Combine
+}
+
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
+ FloatRegister in, Register count, Register temp, FloatRegister xtmp,
+ FloatRegister dest) {
+ packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
+ &MacroAssemblerX86Shared::vpsrlw,
+ &MacroAssemblerX86Shared::vpmovzxbw);
+}
+
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
+ Imm32 count, FloatRegister src, FloatRegister dest) {
+ MOZ_ASSERT(count.value <= 7);
+ asMasm().moveSimd128(src, dest);
+ asMasm().bitwiseAndSimd128(
+ SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
+ vpsrlw(count, dest, dest);
+}
+
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsllw(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsraw(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
+ vpsrlw(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpslld(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpsrad(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
+ vpsrld(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
+ vpsllq(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
+ FloatRegister in, Register count, Register temp1, FloatRegister temp2,
+ FloatRegister dest) {
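+  // There is no 64-bit arithmetic right shift before AVX-512, so perform a
+  // logical shift and then OR in the sign bits, which are obtained by
+  // comparing the input against zero and shifting the resulting mask into the
+  // vacated high positions.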
+ ScratchSimd128Scope scratch(asMasm());
+ movl(count, temp1); // temp1 is zero-extended shift count
+ andl(Imm32(63), temp1); // temp1 is masked shift count
+ vmovd(temp1, scratch); // and scratch 64-bit ditto
+ vpxor(Operand(temp2), temp2, temp2); // temp2=0
+ vpcmpgtq(Operand(in), temp2, temp2); // temp2=~0 where `in` negative
+ vpsrlq(scratch, in, dest); // dest shifted, maybe wrong sign
+ negl(temp1); // temp1 is - masked count
+ addl(Imm32(63), temp1); // temp1 is 63 - masked count
+ vmovd(temp1, scratch); // and scratch ditto
+ vpsllq(scratch, temp2, temp2); // temp2 has the sign bits
+ vpor(Operand(temp2), dest, dest); // dest has right sign
+}
+
+void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
+ FloatRegister in, Register count, Register temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
+ vpsrlq(scratch, in, dest);
+}
+
+void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
+ Imm32 count, FloatRegister src, FloatRegister dest) {
+ MOZ_ASSERT(count.value < 32);
+#ifdef ENABLE_WASM_SIMD
+ MOZ_ASSERT(!MacroAssembler::MustScalarizeShiftSimd128(wasm::SimdOp::I64x2ShrS,
+ count));
+#endif
+
+ ScratchSimd128Scope scratch(asMasm());
+ // Compute high dwords and mask low dwords
+ asMasm().moveSimd128(src, scratch);
+ vpsrad(count, scratch, scratch);
+ asMasm().vpandSimd128(SimdConstant::SplatX2(int64_t(0xFFFFFFFF00000000LL)),
+ scratch);
+  // Compute low dwords via a logical 64-bit shift. (Its high dwords differ
+  // from the final high dwords only in their topmost bits, which are zero here
+  // and are supplied by the masked arithmetic shift above.)
+ asMasm().moveSimd128(src, dest);
+ vpsrlq(count, dest, dest);
+ // Merge the parts
+ vpor(scratch, dest, dest);
+}
+
+void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
+ FloatRegister onTrue,
+ FloatRegister onFalse,
+ FloatRegister temp,
+ FloatRegister output) {
+ // Normally the codegen will attempt to enforce these register assignments so
+ // that the moves are avoided.
+
+ asMasm().moveSimd128Int(onTrue, output);
+ asMasm().moveSimd128Int(mask, temp);
+
+ // SSE4.1 has plain blendvps which can do this, but it is awkward
+ // to use because it requires the mask to be in xmm0.
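+  //
+  // The fallback below is the usual bitwise select:
+  //   output = (onTrue & mask) | (onFalse & ~mask).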
+
+ vpand(Operand(temp), output, output);
+ vpandn(Operand(onFalse), temp, temp);
+ vpor(Operand(temp), output, output);
+}
+
+// Code sequences for int32x4<->float32x4 culled from v8; commentary added.
+
+void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
+ FloatRegister src, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().moveSimd128Int(src, dest);
+ vpxor(Operand(scratch), scratch, scratch); // extract low bits
+ vpblendw(0x55, dest, scratch, scratch); // into scratch
+ vpsubd(Operand(scratch), dest, dest); // and high bits into dest
+ vcvtdq2ps(scratch, scratch); // convert low bits
+ vpsrld(Imm32(1), dest, dest); // get high into unsigned range
+ vcvtdq2ps(dest, dest); // convert
+ vaddps(Operand(dest), dest, dest); // and back into signed
+ vaddps(Operand(scratch), dest, dest); // combine high+low: may round
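+  // For example, 0xFFFFFFFF splits into 65535 (low) and 0xFFFF0000 (high);
+  // each part converts exactly, but their float sum rounds up to
+  // 4294967296.0f.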
+}
+
+void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
+ FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().moveSimd128Float(src, dest);
+
+ // The cvttps2dq instruction is the workhorse but does not handle NaN or out
+ // of range values as we need it to. We want to saturate too-large positive
+ // values to 7FFFFFFFh and too-large negative values to 80000000h. NaN and -0
+ // become 0.
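+  //
+  // For example, cvttps2dq converts an input lane of 3e9f (too large for
+  // int32) to 80000000h; the fixup below flips such lanes to 7FFFFFFFh, while
+  // a too-large negative lane keeps 80000000h.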
+
+ // Convert NaN to 0 by masking away values that compare unordered to itself.
+ vmovaps(dest, scratch);
+ vcmpeqps(Operand(scratch), scratch);
+ vpand(Operand(scratch), dest, dest);
+
+  // Compute the complement of each non-NaN lane's sign bit; we'll need this to
+  // correct the result of cvttps2dq. All other output bits are garbage.
+ vpxor(Operand(dest), scratch, scratch);
+
+ // Convert. This will make the output 80000000h if the input is out of range.
+ vcvttps2dq(dest, dest);
+
+ // Preserve the computed complemented sign bit if the output was 80000000h.
+ // The sign bit will be 1 precisely for nonnegative values that overflowed.
+ vpand(Operand(dest), scratch, scratch);
+
+  // Create a mask with that sign bit. Now a lane is FFFFFFFFh if there was a
+  // positive overflow, and zero otherwise.
+ vpsrad(Imm32(31), scratch, scratch);
+
+ // Convert overflow lanes to 0x7FFFFFFF.
+ vpxor(Operand(scratch), dest, dest);
+}
+
+void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
+ FloatRegister src, FloatRegister temp, FloatRegister dest) {
+ ScratchSimd128Scope scratch(asMasm());
+ asMasm().moveSimd128Float(src, dest);
+
+ // The cvttps2dq instruction is the workhorse but does not handle NaN or out
+ // of range values as we need it to. We want to saturate too-large positive
+ // values to FFFFFFFFh and negative values to zero. NaN and -0 become 0.
+
+ // Convert NaN and negative values to zeroes in dest.
+ vpxor(Operand(scratch), scratch, scratch);
+ vmaxps(Operand(scratch), dest, dest);
+
+ // Place the largest positive signed integer in all lanes in scratch.
+ // We use it to bias the conversion to handle edge cases.
+ asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f),
+ scratch);
+
+  // temp = dest - 7FFFFFFFh (as floating). This brings integers in the unsigned
+  // range but above the signed range into the signed range; 0 => -7FFFFFFFh.
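+  //
+  // For example, an input lane of 3e9 (above INT32_MAX) is biased to roughly
+  // 8.5e8, which converts cleanly into temp; the direct conversion of dest
+  // below overflows to 80000000h, and the final vpaddd of the two recovers
+  // the unsigned result 3000000000.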
+ vmovaps(dest, temp);
+ vsubps(Operand(scratch), temp, temp);
+
+ // scratch = mask of biased values that are greater than 7FFFFFFFh.
+ vcmpleps(Operand(temp), scratch);
+
+ // Convert the biased values to integer. Positive values above 7FFFFFFFh will
+ // have been converted to 80000000h, all others become the expected integer.
+ vcvttps2dq(temp, temp);
+
+ // As lanes of scratch are ~0 where the result overflows, this computes
+ // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes
+ // untouched as the biased integer.
+ vpxor(Operand(scratch), temp, temp);
+
+ // Convert negative biased lanes in temp to zero. After this, temp will be
+ // zero where the result should be zero or is less than 80000000h, 7FFFFFFF
+ // where the result overflows, and will have the converted biased result in
+ // other lanes (for input values >= 80000000h).
+ vpxor(Operand(scratch), scratch, scratch);
+ vpmaxsd(Operand(scratch), temp, temp);
+
+ // Convert. Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes will
+ // be what they should be.
+ vcvttps2dq(dest, dest);
+
+  // Add temp to the result. Overflow lanes with 80000000h become FFFFFFFFh,
+ // biased high-value unsigned lanes become unbiased, everything else is left
+ // unchanged.
+ vpaddd(Operand(temp), dest, dest);
+}