/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(input, output);
  zeroSimd128Int(scratch);
  vpshufb(scratch, output, output);
}

void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  vpshuflw(0, output, output);
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isSingle() && output.isSimd128());
  asMasm().moveSimd128Float(input.asSimd128(), output);
  vshufps(0, output, output, output);
}

void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isDouble() && output.isSimd128());
  asMasm().moveSimd128Float(input.asSimd128(), output);
  vshufpd(0, output, output, output);
}

void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
                                                 Register output,
                                                 unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    moveLowInt32(input, output);
  } else {
    vpextrd(lane, input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isSingle());
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    if (input.asSingle() != output) {
      moveFloat32(input, output);
    }
  } else if (lane == 2) {
    moveHighPairToLowPairFloat32(input, output);
  } else {
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
    shuffleFloat32(mask, input, output.asSimd128());
  }
}

void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isDouble());
  if (lane == 0) {
    // The value we want to extract is in the low quadword
    if (input.asDouble() != output) {
      moveDouble(input, output);
    }
  } else {
    vpalignr(Operand(input), output, 8);
  }
}

void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
                                                 Register output,
                                                 unsigned lane, SimdSign sign) {
  vpextrw(lane, input, output);
  if (sign == SimdSign::Signed) {
    movswl(output, output);
  }
}

void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
                                                 Register output,
                                                 unsigned lane, SimdSign sign) {
  vpextrb(lane, input, output);
  if (sign == SimdSign::Signed) {
    movsbl(output, output);
  }
}

void MacroAssemblerX86Shared::replaceLaneFloat32x4(FloatRegister rhs,
                                                   FloatRegister lhsDest,
                                                   unsigned lane) {
  MOZ_ASSERT(lhsDest.isSimd128() && rhs.isSingle());
  if (lane == 0) {
    if (rhs.asSimd128() == lhsDest) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
    } else {
      // move low dword of value into low dword of output
      vmovss(rhs, lhsDest, lhsDest);
    }
  } else {
    vinsertps(vinsertpsMask(0, lane), rhs, lhsDest, lhsDest);
  }
}

void MacroAssemblerX86Shared::replaceLaneFloat64x2(FloatRegister rhs,
                                                   FloatRegister lhsDest,
                                                   unsigned lane) {
  MOZ_ASSERT(lhsDest.isSimd128() && rhs.isDouble());
  if (lane == 0) {
    if (rhs.asSimd128() == lhsDest) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
    } else {
      // move low qword of value into low qword of output
      vmovsd(rhs, lhsDest, lhsDest);
    }
  } else {
    // move low qword of value into high qword of output
    vshufpd(0, rhs, lhsDest, lhsDest);
  }
}

void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           FloatRegister temp,
                                           const uint8_t lanes[16]) {
  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs == rhs || !temp.isInvalid());

  // TODO: Consider whether PBLENDVB would not be better, even if it is
  // variable and requires xmm0 to be free and the loading of a mask.

  // Set scratch = lanes to select from lhs.
  int8_t mask[16];
  for (unsigned i = 0; i < 16; i++) {
    mask[i] = ~lanes[i];
  }
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
  if (lhs == rhs) {
    asMasm().moveSimd128Int(rhs, temp);
    rhs = temp;
  }
  vpand(Operand(scratch), lhs, lhs);
  vpandn(Operand(rhs), scratch, scratch);
  vpor(scratch, lhs, lhs);
}

void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           const uint16_t lanes[8]) {
  MOZ_ASSERT(lhs == output);

  uint32_t mask = 0;
  for (unsigned i = 0; i < 8; i++) {
    if (lanes[i]) {
      mask |= (1 << i);
    }
  }
  vpblendw(mask, rhs, lhs, lhs);
}

void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
                                             FloatRegister rhs,
                                             FloatRegister output,
                                             const uint8_t lanes[16]) {
  ScratchSimd128Scope scratch(asMasm());

  // Use pshufb instructions to gather the lanes from each source vector.
  // A negative index creates a zero lane, so the two vectors can be combined.
  // Register preference: lhs == output.

  // Set scratch = lanes from rhs.
  int8_t idx[16];
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
  }
  moveSimd128Int(rhs, scratch);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), scratch);

  // Set output = lanes from lhs.
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] < 16 ? lanes[i] : -1;
  }
  moveSimd128Int(lhs, output);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), output);

  // Combine.
  vpor(scratch, output, output);
}

static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
  return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}

void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtb(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqb(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      // src := src > lhs (i.e. lhs < rhs)
      vpcmpgtb(Operand(lhs), scratch, scratch);
      moveSimd128Int(scratch, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqb(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(allOnes, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      vpcmpgtb(Operand(lhs), scratch, scratch);
      asMasm().loadConstantSimd128Int(allOnes, output);
      vpxor(Operand(scratch), output, output);
      break;
    }
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtb(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
                                             const SimdConstant& rhs,
                                             FloatRegister lhsDest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqb,
                    &MacroAssembler::vpcmpeqbSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtb,
                    &MacroAssembler::vpcmpgtbSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt8x16(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // We widen the inputs to 16 bits, transforming them to nonnegative values;
  // then compare them as signed using the logic from compareInt8x16(); then
  // merge the results (which is surprisingly complicated).  rhs is left
  // untouched.  The logic is open-coded to streamline it.
  //
  // TODO? Rhs could be in memory (for Ion, anyway), in which case loading it
  // into scratch first would be better than loading it twice from memory.
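  //
  // Illustrative scalar model (a sketch, not the emitted code): per lane, an
  // unsigned 8-bit compare becomes a signed compare once both operands are
  // zero-extended, which is what the widening below relies on.
  //
  //   bool aboveU8(uint8_t a, uint8_t b) {
  //     return int16_t(a) > int16_t(b);  // zero-extend, then compare signed
  //   }
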
  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
  MOZ_ASSERT_IF(
      rhs.kind() == Operand::FPREG,
      ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);

  bool complement = false;
  switch (cond) {
    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;
      // Low eight bytes of inputs widened to words
      vpmovzxbw(Operand(lhs), tmp1);
      vpmovzxbw(rhs, tmp2);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // lhs > rhs in tmp1
      // High eight bytes of inputs widened to words
      vpalignr(rhs, tmp2, 8);
      vpmovzxbw(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxbw(Operand(output), output);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), output, output);  // lhs > rhs in output
      break;
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;
      // Same as above but with operands reversed
      // Low eight bytes of inputs widened to words
      vpmovzxbw(Operand(lhs), tmp2);
      vpmovzxbw(rhs, tmp1);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), tmp1, tmp1);  // rhs > lhs in tmp1
      // High eight bytes of inputs widened to words
      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxbw(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxbw(Operand(output), output);
      // Compare leaving 16-bit results
      vpcmpgtw(Operand(tmp2), output, output);  // rhs > lhs in output
      break;
    default:
      MOZ_CRASH("Unsupported condition code");
  }

  // Merge output (results of high byte compares) and tmp1 (results of low
  // byte compares) by truncating word results to bytes (to avoid signed
  // saturation), packing, and then concatenating and shifting.
  vpsrlw(Imm32(8), tmp1, tmp1);
  vpackuswb(Operand(tmp1), tmp1, tmp1);
  vpsrlw(Imm32(8), output, output);
  vpackuswb(Operand(output), output, output);
  vpalignr(Operand(tmp1), output, 8);

  // Complement when needed for opposite sense of the operator.
  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtw(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqw(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      // src := src > lhs (i.e. lhs < rhs)
      vpcmpgtw(Operand(lhs), scratch, scratch);
      moveSimd128Int(scratch, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqw(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(allOnes, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      vpcmpgtw(Operand(lhs), scratch, scratch);
      asMasm().loadConstantSimd128Int(allOnes, output);
      vpxor(Operand(scratch), output, output);
      break;
    }
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtw(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
                                             const SimdConstant& rhs,
                                             FloatRegister lhsDest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqw,
                    &MacroAssembler::vpcmpeqwSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtw,
                    &MacroAssembler::vpcmpgtwSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt16x8(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // See comments at unsignedCompareInt8x16.

  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
  MOZ_ASSERT_IF(
      rhs.kind() == Operand::FPREG,
      ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);

  bool complement = false;
  switch (cond) {
    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;
      vpmovzxwd(Operand(lhs), tmp1);
      vpmovzxwd(rhs, tmp2);
      vpcmpgtd(Operand(tmp2), tmp1, tmp1);
      vpalignr(rhs, tmp2, 8);
      vpmovzxwd(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxwd(Operand(output), output);
      vpcmpgtd(Operand(tmp2), output, output);
      break;
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;
      vpmovzxwd(Operand(lhs), tmp2);
      vpmovzxwd(rhs, tmp1);
      vpcmpgtd(Operand(tmp2), tmp1, tmp1);
      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxwd(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxwd(Operand(output), output);
      vpcmpgtd(Operand(tmp2), output, output);
      break;
    default:
      MOZ_CRASH();
  }

  vpsrld(Imm32(16), tmp1, tmp1);
  vpackusdw(Operand(tmp1), tmp1, tmp1);
  vpsrld(Imm32(16), output, output);
  vpackusdw(Operand(output), output, output);
  vpalignr(Operand(tmp1), output, 8);

  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtd(rhs, lhs, lhs);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqd(rhs, lhs, lhs);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      // src := src > lhs (i.e. lhs < rhs)
      vpcmpgtd(Operand(lhs), scratch, scratch);
      moveSimd128Int(scratch, lhs);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqd(rhs, lhs, lhs);
      asMasm().bitwiseXorSimd128(allOnes, lhs);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // This is bad, but Ion does not use it.
      // src := rhs
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
      } else {
        loadAlignedSimd128Int(rhs, scratch);
      }
      vpcmpgtd(Operand(lhs), scratch, scratch);
      asMasm().loadConstantSimd128Int(allOnes, lhs);
      vpxor(Operand(scratch), lhs, lhs);
      break;
    }
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtd(rhs, lhs, lhs);
      asMasm().bitwiseXorSimd128(allOnes, lhs);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
                                             const SimdConstant& rhs,
                                             FloatRegister lhsDest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpeqd,
                    &MacroAssembler::vpcmpeqdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vpcmpgtd,
                    &MacroAssembler::vpcmpgtdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(SimdConstant::SplatX16(-1), lhsDest);
  }
}

void MacroAssemblerX86Shared::unsignedCompareInt32x4(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
  // See comments at unsignedCompareInt8x16, the logic is similar.  However we
  // only have PCMPGTQ on SSE4.2 or later, so for SSE4.1 we need to use
  // subtract to compute the flags.

  MOZ_ASSERT(lhs == output);
  MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
  MOZ_ASSERT_IF(
      rhs.kind() == Operand::FPREG,
      ToSimdFloatRegister(rhs) != tmp1 && ToSimdFloatRegister(rhs) != tmp2);

  bool complement = false;
  switch (cond) {
    case Assembler::Below:
    case Assembler::AboveOrEqual:
      complement = cond == Assembler::AboveOrEqual;
      // The effect of the subtract is that the high doubleword of each
      // quadword becomes either 0 (ge) or -1 (lt).
      vpmovzxdq(Operand(lhs), tmp1);
      vpmovzxdq(rhs, tmp2);
      vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
      vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
      vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
              tmp1);  // zero zero flag1 flag0
      vpalignr(rhs, tmp2, 8);
      vpmovzxdq(Operand(tmp2), tmp2);
      vpalignr(Operand(lhs), output, 8);
      vpmovzxdq(Operand(output), output);
      vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
      vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
      vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
              output);  // flag3 flag2 zero zero
      vpor(Operand(tmp1), output, output);
      break;
    case Assembler::Above:
    case Assembler::BelowOrEqual:
      complement = cond == Assembler::BelowOrEqual;
      // The effect of the subtract is that the high doubleword of each
      // quadword becomes either 0 (le) or -1 (gt).
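      //
      // Illustrative scalar model of this trick (a sketch, not the emitted
      // code): with both operands zero-extended to 64 bits, the high
      // doubleword of the difference is the comparison flag.
      //
      //   bool aboveU32(uint32_t lhs, uint32_t rhs) {
      //     uint64_t diff = uint64_t(rhs) - uint64_t(lhs);
      //     return int32_t(diff >> 32) == -1;  // -1 (gt), 0 (le)
      //   }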
      vpmovzxdq(Operand(lhs), tmp2);
      vpmovzxdq(rhs, tmp1);
      vpsubq(Operand(tmp2), tmp1, tmp1);  // flag1 junk flag0 junk
      vpsrlq(Imm32(32), tmp1, tmp1);      // zero flag1 zero flag0
      vpshufd(MacroAssembler::ComputeShuffleMask(0, 2, 3, 3), tmp1,
              tmp1);  // zero zero flag1 flag0
      vpalignr(Operand(lhs), tmp2, 8);
      vpmovzxdq(Operand(tmp2), tmp2);
      vpalignr(rhs, output, 8);
      vpmovzxdq(Operand(output), output);
      vpsubq(Operand(tmp2), output, output);  // flag3 junk flag2 junk
      vpsrlq(Imm32(32), output, output);      // zero flag3 zero flag2
      vpshufd(MacroAssembler::ComputeShuffleMask(3, 3, 0, 2), output,
              output);  // flag3 flag2 zero zero
      vpor(Operand(tmp1), output, output);
      break;
    default:
      MOZ_CRASH();
  }

  if (complement) {
    vpcmpeqd(Operand(tmp1), tmp1, tmp1);
    vpxor(Operand(tmp1), output, output);
  }
}

void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  if (HasAVX()) {
    MOZ_CRASH("Can do better here with three-address compares");
  }

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovaps(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovaps(lhs, output);
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqps(rhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltps(rhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmpleps(rhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqps(rhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have
      // to copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
                                               const SimdConstant& rhs,
                                               FloatRegister lhsDest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpeqps,
                    &MacroAssembler::vcmpeqpsSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpltps,
                    &MacroAssembler::vcmpltpsSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpleps,
                    &MacroAssembler::vcmplepsSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpneqps,
                    &MacroAssembler::vcmpneqpsSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  if (HasAVX()) {
    MOZ_CRASH("Can do better here with three-address compares");
  }

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovapd(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovapd(lhs, output);
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqpd(rhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltpd(rhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmplepd(rhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqpd(rhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have
      // to copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
                                               const SimdConstant& rhs,
                                               FloatRegister lhsDest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpeqpd,
                    &MacroAssembler::vcmpeqpdSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpltpd,
                    &MacroAssembler::vcmpltpdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmplepd,
                    &MacroAssembler::vcmplepdSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(rhs, lhsDest, &MacroAssembler::vcmpneqpd,
                    &MacroAssembler::vcmpneqpdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Semantics of wasm max and min.
//
//  * -0 < 0
//  * If one input is NaN then that NaN is the output
//  * If both inputs are NaN then the output is selected nondeterministically
//  * Any returned NaN is always made quiet
//  * The MVP spec 2.2.3 says "No distinction is made between signalling and
//    quiet NaNs", suggesting SNaN inputs are allowed and should not fault
//
// Semantics of maxps/minps/maxpd/minpd:
//
//  * If the values are both +/-0 the rhs is returned
//  * If the rhs is SNaN then the rhs is returned
//  * If either value is NaN then the rhs is returned
//  * An SNaN operand does not appear to give rise to an exception, at least
//    not in the JS shell on Linux, though the Intel spec lists Invalid
//    as one of the possible exceptions

// Various unaddressed considerations:
//
// It's pretty insane for this to take an Operand rhs - it really needs to be
// a register, given the number of times we access it.
//
// Constant load can be folded into the ANDPS.  Do we care?  It won't save us
// any registers, since output/temp1/temp2/scratch are all live at the same
// time after the first instruction of the slow path.
//
// Can we use blend for the NaN extraction/insertion?  We'd need xmm0 for the
// mask, which is no fun.  But it would be lhs UNORD lhs -> mask, blend;
// rhs UNORD rhs -> mask; blend.  Better than the mess we have below.  But
// we'd still need to setup the QNaN bits, unless we can blend those too
// with the lhs UNORD rhs mask?
//
// If we could determine that both input lanes are NaN then the result of the
// fast path should be fine modulo the QNaN bits, but it's not obvious this is
// much of an advantage.
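
// Illustrative scalar model of the per-lane wasm min implemented below (a
// sketch of the semantics above, not the emitted code; f32 shown, f64 is
// analogous, and max mirrors it):
//
//   float wasmMin(float a, float b) {
//     if (std::isnan(a) || std::isnan(b)) {
//       return makeQuiet(std::isnan(a) ? a : b);  // propagate a quieted NaN
//     }
//     if (a == 0.0f && b == 0.0f) {
//       return (std::signbit(a) || std::signbit(b)) ? -0.0f : 0.0f;  // -0 < 0
//     }
//     return a < b ? a : b;
//   }
//
// makeQuiet() stands for ORing in the quiet bit (0x00400000 for f32, the
// quietBits constant below); it is not a real library function.
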
void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs_,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhs = reusedInputSimd128Float(lhs_, scratch);
  if (isMin) {
    vmovaps(lhs, output);                    // compute
    vminps(rhs, output, output);             //   min lhs, rhs
    vmovaps(rhs, temp1);                     // compute
    vminps(Operand(lhs), temp1, temp1);      //   min rhs, lhs
    vorps(temp1, output, output);            // fix min(-0, 0) with OR
  } else {
    vmovaps(lhs, output);                    // compute
    vmaxps(rhs, output, output);             //   max lhs, rhs
    vmovaps(rhs, temp1);                     // compute
    vmaxps(Operand(lhs), temp1, temp1);      //   max rhs, lhs
    vandps(temp1, output, output);           // fix max(-0, 0) with AND
  }
  vmovaps(lhs, temp1);                       // compute
  vcmpunordps(rhs, temp1);                   //   lhs UNORD rhs
  vptest(temp1, temp1);                      // check if any unordered
  j(Assembler::Equal, &l);                   //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovaps(temp1, temp2);                     // clear NaN lanes of result
  vpandn(output, temp2, temp2);              //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1);   // setup QNaN bits in NaN lanes
  vorps(temp1, temp2, temp2);                //   and OR into result
  vmovaps(lhs, temp1);                       // find NaN lanes
  vcmpunordps(Operand(temp1), temp1);        //   in lhs
  vmovaps(temp1, output);                    //   (and save them for later)
  vandps(lhs, temp1, temp1);                 //   and extract the NaNs
  vorps(temp1, temp2, temp2);                //   and add to the result
  vmovaps(rhs, temp1);                       // find NaN lanes
  vcmpunordps(Operand(temp1), temp1);        //   in rhs
  vpandn(temp1, output, output);             //   except if they were in lhs
  vandps(rhs, output, output);               //   and extract the NaNs
  vorps(temp2, output, output);              //   and add to the result

  bind(&l);
  /* clang-format on */
}

// Exactly as above.
void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs_,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhs = reusedInputSimd128Float(lhs_, scratch);
  if (isMin) {
    vmovapd(lhs, output);                    // compute
    vminpd(rhs, output, output);             //   min lhs, rhs
    vmovapd(rhs, temp1);                     // compute
    vminpd(Operand(lhs), temp1, temp1);      //   min rhs, lhs
    vorpd(temp1, output, output);            // fix min(-0, 0) with OR
  } else {
    vmovapd(lhs, output);                    // compute
    vmaxpd(rhs, output, output);             //   max lhs, rhs
    vmovapd(rhs, temp1);                     // compute
    vmaxpd(Operand(lhs), temp1, temp1);      //   max rhs, lhs
    vandpd(temp1, output, output);           // fix max(-0, 0) with AND
  }
  vmovapd(lhs, temp1);                       // compute
  vcmpunordpd(rhs, temp1);                   //   lhs UNORD rhs
  vptest(temp1, temp1);                      // check if any unordered
  j(Assembler::Equal, &l);                   //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovapd(temp1, temp2);                     // clear NaN lanes of result
  vpandn(output, temp2, temp2);              //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1);   // setup QNaN bits in NaN lanes
  vorpd(temp1, temp2, temp2);                //   and OR into result
  vmovapd(lhs, temp1);                       // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1);        //   in lhs
  vmovapd(temp1, output);                    //   (and save them for later)
  vandpd(lhs, temp1, temp1);                 //   and extract the NaNs
  vorpd(temp1, temp2, temp2);                //   and add to the result
  vmovapd(rhs, temp1);                       // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1);        //   in rhs
  vpandn(temp1, output, output);             //   except if they were in lhs
  vandpd(rhs, output, output);               //   and extract the NaNs
  vorpd(temp2, output, output);              //   and add to the result

  bind(&l);
  /* clang-format on */
}

void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  minMaxFloat32x4(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
}

void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  minMaxFloat32x4(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
}

void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  minMaxFloat64x2(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
}

void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  minMaxFloat64x2(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
}

static inline void MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask,
                                      Register count, Register temp,
                                      FloatRegister dest) {
  masm.mov(count, temp);
  masm.andl(Imm32(shiftmask), temp);
  masm.vmovd(temp, dest);
}

void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest,
    void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                           FloatRegister),
    void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 7, count, temp, scratch);

  // High bytes
  vpalignr(Operand(in), xtmp, 8);
  (this->*extend)(Operand(xtmp), xtmp);
  (this->*shift)(scratch, xtmp, xtmp);

  // Low bytes
  (this->*extend)(Operand(dest), dest);
  (this->*shift)(scratch, dest, dest);

  // Mask off garbage to avoid saturation during packing
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
                                  scratch);
  vpand(Operand(scratch), xtmp, xtmp);
  vpand(Operand(scratch), dest, dest);

  vpackuswb(Operand(xtmp), dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsllw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  asMasm().moveSimd128(src, dest);

  // Use the doubling trick for low shift counts, otherwise mask off the bits
  // that are shifted out of the low byte of each word and use word shifts.
  // The optimal cutoff remains to be explored.
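  //
  // Illustrative scalar model of the two strategies (a sketch, not the
  // emitted code), per byte lane:
  //
  //   uint8_t shl8(uint8_t x, int n) {
  //     // doubling: x << n is x added to itself n times
  //     // masking:  (x & (0xFF >> n)) << n cannot carry into the next byte,
  //     //           so a 16-bit (word) shift gives the same result
  //     return uint8_t((x & (0xFF >> n)) << n);
  //   }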
  if (count.value <= 3) {
    for (int32_t shift = count.value; shift > 0; --shift) {
      asMasm().addInt8x16(dest, dest);
    }
  } else {
    asMasm().bitwiseAndSimd128(SimdConstant::SplatX16(0xFF >> count.value),
                               dest);
    vpsllw(count, dest, dest);
  }
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsraw,
                             &MacroAssemblerX86Shared::vpmovsxbw);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister temp, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);

  ScratchSimd128Scope scratch(asMasm());
  asMasm().moveSimd128(src, scratch);
  vpslldq(Imm32(1), scratch, scratch);               // Low bytes -> high bytes
  vpsraw(Imm32(count.value + 8), scratch, scratch);  // Shift low bytes
  asMasm().moveSimd128(src, dest);
  vpsraw(count, dest, dest);                         // Shift high bytes
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
  vpand(Operand(temp), dest, dest);                  // Keep high bytes
  vpandn(Operand(scratch), temp, temp);              // Keep low bytes
  vpor(Operand(temp), dest, dest);                   // Combine
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, Register temp, FloatRegister xtmp,
    FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, temp, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsrlw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);

  asMasm().moveSimd128(src, dest);
  asMasm().bitwiseAndSimd128(
      SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
  vpsrlw(count, dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsllw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsraw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 15, count, temp, scratch);
  vpsrlw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpslld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrad(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 31, count, temp, scratch);
  vpsrld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsllq(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp1, FloatRegister temp2,
    FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  movl(count, temp1);                   // temp1 is zero-extended shift count
  andl(Imm32(63), temp1);               //   temp1 is masked shift count
  vmovd(temp1, scratch);                //     and scratch 64-bit ditto
  vpxor(Operand(temp2), temp2, temp2);  // temp2=0
  vpcmpgtq(Operand(in), temp2, temp2);  // temp2=~0 where `in` negative
  vpsrlq(scratch, in, dest);            // dest shifted, maybe wrong sign
  negl(temp1);                          // temp1 is - masked count
  addl(Imm32(63), temp1);               // temp1 is 63 - masked count
  vmovd(temp1, scratch);                //   and scratch ditto
  vpsllq(scratch, temp2, temp2);        // temp2 has the sign bits
  vpor(Operand(temp2), dest, dest);     // dest has right sign
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, Register temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  MaskSimdShiftCount(asMasm(), 63, count, temp, scratch);
  vpsrlq(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value < 32);
#ifdef ENABLE_WASM_SIMD
  MOZ_ASSERT(!MacroAssembler::MustScalarizeShiftSimd128(wasm::SimdOp::I64x2ShrS,
                                                        count));
#endif
  ScratchSimd128Scope scratch(asMasm());

  // Compute high dwords and mask low dwords
  asMasm().moveSimd128(src, scratch);
  vpsrad(count, scratch, scratch);
  asMasm().vpandSimd128(SimdConstant::SplatX2(int64_t(0xFFFFFFFF00000000LL)),
                        scratch);

  // Compute low dwords (high dwords at most have clear high bits where the
  // result will have set low high bits)
  asMasm().moveSimd128(src, dest);
  vpsrlq(count, dest, dest);

  // Merge the parts
  vpor(scratch, dest, dest);
}

void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
                                            FloatRegister onTrue,
                                            FloatRegister onFalse,
                                            FloatRegister temp,
                                            FloatRegister output) {
  // Normally the codegen will attempt to enforce these register assignments
  // so that the moves are avoided.
  asMasm().moveSimd128Int(onTrue, output);
  asMasm().moveSimd128Int(mask, temp);

  // SSE4.1 has plain blendvps which can do this, but it is awkward
  // to use because it requires the mask to be in xmm0.

  vpand(Operand(temp), output, output);
  vpandn(Operand(onFalse), temp, temp);
  vpor(Operand(temp), output, output);
}

// Code sequences for int32x4<->float32x4 culled from v8; commentary added.

void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().moveSimd128Int(src, dest);
  vpxor(Operand(scratch), scratch, scratch);  // extract low bits
  vpblendw(0x55, dest, scratch, scratch);     //   into scratch
  vpsubd(Operand(scratch), dest, dest);       //     and high bits into dest
  vcvtdq2ps(scratch, scratch);                // convert low bits
  vpsrld(Imm32(1), dest, dest);               // get high into unsigned range
  vcvtdq2ps(dest, dest);                      //   convert
  vaddps(Operand(dest), dest, dest);          //     and back into signed
  vaddps(Operand(scratch), dest, dest);       // combine high+low: may round
}

void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
                                                         FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().moveSimd128Float(src, dest);

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to 7FFFFFFFh and too-large negative values to 80000000h.  NaN and
  // -0 become 0.
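  //
  // Illustrative scalar model of the intended per-lane result (a sketch of
  // the semantics just described, not the emitted code):
  //
  //   int32_t truncSat(float x) {
  //     if (std::isnan(x)) return 0;
  //     if (x >= 2147483648.0f) return INT32_MAX;   // 7FFFFFFFh
  //     if (x <= -2147483648.0f) return INT32_MIN;  // 80000000h
  //     return int32_t(x);
  //   }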

  // Convert NaN to 0 by masking away values that compare unordered to itself.
  vmovaps(dest, scratch);
  vcmpeqps(Operand(scratch), scratch);
  vpand(Operand(scratch), dest, dest);

  // Compute the complement of each non-NaN lane's sign bit; we'll need this
  // to correct the result of cvttps2dq.  All other output bits are garbage.
  vpxor(Operand(dest), scratch, scratch);

  // Convert.  This will make the output 80000000h if the input is out of
  // range.
  vcvttps2dq(dest, dest);

  // Preserve the computed complemented sign bit if the output was 80000000h.
  // The sign bit will be 1 precisely for nonnegative values that overflowed.
  vpand(Operand(dest), scratch, scratch);

  // Create a mask with that sign bit.  Now a lane is either FFFFFFFFh if
  // there was a positive overflow, otherwise zero.
  vpsrad(Imm32(31), scratch, scratch);

  // Convert overflow lanes to 0x7FFFFFFF.
  vpxor(Operand(scratch), dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().moveSimd128Float(src, dest);

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to FFFFFFFFh and negative values to zero.  NaN and -0 become 0.

  // Convert NaN and negative values to zeroes in dest.
  vpxor(Operand(scratch), scratch, scratch);
  vmaxps(Operand(scratch), dest, dest);

  // Place the largest positive signed integer in all lanes in scratch.
  // We use it to bias the conversion to handle edge cases.
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f),
                                    scratch);

  // temp = dest - 7FFFFFFFh (as floating), this brings integers in the
  // unsigned range but above the signed range into the signed range;
  // 0 => -7FFFFFFFh.
  vmovaps(dest, temp);
  vsubps(Operand(scratch), temp, temp);

  // scratch = mask of biased values that are greater than 7FFFFFFFh.
  vcmpleps(Operand(temp), scratch);

  // Convert the biased values to integer.  Positive values above 7FFFFFFFh
  // will have been converted to 80000000h, all others become the expected
  // integer.
  vcvttps2dq(temp, temp);

  // As lanes of scratch are ~0 where the result overflows, this computes
  // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes
  // untouched as the biased integer.
  vpxor(Operand(scratch), temp, temp);

  // Convert negative biased lanes in temp to zero.  After this, temp will be
  // zero where the result should be zero or is less than 80000000h, 7FFFFFFF
  // where the result overflows, and will have the converted biased result in
  // other lanes (for input values >= 80000000h).
  vpxor(Operand(scratch), scratch, scratch);
  vpmaxsd(Operand(scratch), temp, temp);

  // Convert.  Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes
  // will be what they should be.
  vcvttps2dq(dest, dest);

  // Add temp to the result.  Overflow lanes with 80000000h become FFFFFFFFh,
  // biased high-value unsigned lanes become unbiased, everything else is left
  // unchanged.
  vpaddd(Operand(temp), dest, dest);
}
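
// Illustrative scalar model of the per-lane result computed by
// unsignedTruncSatFloat32x4ToInt32x4 above (a sketch, not the emitted code):
//
//   uint32_t truncSatU(float x) {
//     if (std::isnan(x) || x <= 0.0f) return 0;   // NaN, -0, negatives
//     if (x >= 4294967296.0f) return UINT32_MAX;  // FFFFFFFFh
//     return uint32_t(x);
//   }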