author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000
commit | 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d (patch)
tree | a31f07c9bcca9d56ce61e9a1ffd30ef350d513aa /js/src/jit/x86-shared/Lowering-x86-shared.cpp
parent | Initial commit. (diff)
download | firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.tar.xz, firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.zip
Adding upstream version 115.8.0esr. (upstream/115.8.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'js/src/jit/x86-shared/Lowering-x86-shared.cpp')
-rw-r--r-- | js/src/jit/x86-shared/Lowering-x86-shared.cpp | 1863
1 file changed, 1863 insertions, 0 deletions
diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp new file mode 100644 index 0000000000..bef178b2f5 --- /dev/null +++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp @@ -0,0 +1,1863 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "jit/x86-shared/Lowering-x86-shared.h" + +#include "mozilla/MathAlgorithms.h" + +#include "jit/Lowering.h" +#include "jit/MIR.h" + +#include "jit/shared/Lowering-shared-inl.h" + +using namespace js; +using namespace js::jit; + +using mozilla::Abs; +using mozilla::FloorLog2; +using mozilla::Maybe; +using mozilla::Nothing; +using mozilla::Some; + +LTableSwitch* LIRGeneratorX86Shared::newLTableSwitch( + const LAllocation& in, const LDefinition& inputCopy, + MTableSwitch* tableswitch) { + return new (alloc()) LTableSwitch(in, inputCopy, temp(), tableswitch); +} + +LTableSwitchV* LIRGeneratorX86Shared::newLTableSwitchV( + MTableSwitch* tableswitch) { + return new (alloc()) LTableSwitchV(useBox(tableswitch->getOperand(0)), temp(), + tempDouble(), temp(), tableswitch); +} + +void LIRGenerator::visitPowHalf(MPowHalf* ins) { + MDefinition* input = ins->input(); + MOZ_ASSERT(input->type() == MIRType::Double); + LPowHalfD* lir = new (alloc()) LPowHalfD(useRegisterAtStart(input)); + define(lir, ins); +} + +void LIRGeneratorX86Shared::lowerForShift(LInstructionHelper<1, 2, 0>* ins, + MDefinition* mir, MDefinition* lhs, + MDefinition* rhs) { + ins->setOperand(0, useRegisterAtStart(lhs)); + + // Shift operand should be constant or, unless BMI2 is available, in register + // ecx. x86 can't shift a non-ecx register. + if (rhs->isConstant()) { + ins->setOperand(1, useOrConstantAtStart(rhs)); + } else if (Assembler::HasBMI2() && !mir->isRotate()) { + ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs) + ? useRegister(rhs) + : useRegisterAtStart(rhs)); + } else { + ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs) + ? useFixed(rhs, ecx) + : useFixedAtStart(rhs, ecx)); + } + + defineReuseInput(ins, mir, 0); +} + +template <size_t Temps> +void LIRGeneratorX86Shared::lowerForShiftInt64( + LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, Temps>* ins, + MDefinition* mir, MDefinition* lhs, MDefinition* rhs) { + ins->setInt64Operand(0, useInt64RegisterAtStart(lhs)); +#if defined(JS_NUNBOX32) + if (mir->isRotate()) { + ins->setTemp(0, temp()); + } +#endif + + static_assert(LShiftI64::Rhs == INT64_PIECES, + "Assume Rhs is located at INT64_PIECES."); + static_assert(LRotateI64::Count == INT64_PIECES, + "Assume Count is located at INT64_PIECES."); + + // Shift operand should be constant or, unless BMI2 is available, in register + // ecx. x86 can't shift a non-ecx register. + if (rhs->isConstant()) { + ins->setOperand(INT64_PIECES, useOrConstantAtStart(rhs)); +#ifdef JS_CODEGEN_X64 + } else if (Assembler::HasBMI2() && !mir->isRotate()) { + ins->setOperand(INT64_PIECES, useRegister(rhs)); +#endif + } else { + // The operands are int64, but we only care about the lower 32 bits of + // the RHS. On 32-bit, the code below will load that part in ecx and + // will discard the upper half. 
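    // [Editor's note, not part of the upstream diff] A minimal sketch of the
    // machine constraint being modeled here, assuming the standard x86
    // encodings: the legacy variable-count shifts (SHL/SHR/SAR r/m, CL) take
    // the count only in cl, while the BMI2 forms (SHLX/SHRX/SARX) accept any
    // count register. Either way only the low bits of the count are used,
    // which is why the upper half of a 64-bit rhs can be discarded:
    //
    //   uint64_t shl64(uint64_t lhs, uint32_t rhs) {
    //     return lhs << (rhs & 63);  // a 64-bit shift count is masked to 6 bits
    //   }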
+ ensureDefined(rhs); + LUse use(ecx); + use.setVirtualRegister(rhs->virtualRegister()); + ins->setOperand(INT64_PIECES, use); + } + + defineInt64ReuseInput(ins, mir, 0); +} + +template void LIRGeneratorX86Shared::lowerForShiftInt64( + LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 0>* ins, + MDefinition* mir, MDefinition* lhs, MDefinition* rhs); +template void LIRGeneratorX86Shared::lowerForShiftInt64( + LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 1>* ins, + MDefinition* mir, MDefinition* lhs, MDefinition* rhs); + +void LIRGeneratorX86Shared::lowerForCompareI64AndBranch( + MTest* mir, MCompare* comp, JSOp op, MDefinition* left, MDefinition* right, + MBasicBlock* ifTrue, MBasicBlock* ifFalse) { + auto* lir = new (alloc()) + LCompareI64AndBranch(comp, op, useInt64Register(left), + useInt64OrConstant(right), ifTrue, ifFalse); + add(lir, mir); +} + +void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 1, 0>* ins, + MDefinition* mir, MDefinition* input) { + ins->setOperand(0, useRegisterAtStart(input)); + defineReuseInput(ins, mir, 0); +} + +void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 2, 0>* ins, + MDefinition* mir, MDefinition* lhs, + MDefinition* rhs) { + ins->setOperand(0, useRegisterAtStart(lhs)); + ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs) + ? useOrConstant(rhs) + : useOrConstantAtStart(rhs)); + defineReuseInput(ins, mir, 0); +} + +template <size_t Temps> +void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, Temps>* ins, + MDefinition* mir, MDefinition* lhs, + MDefinition* rhs) { + // Without AVX, we'll need to use the x86 encodings where one of the + // inputs must be the same location as the output. + if (!Assembler::HasAVX()) { + ins->setOperand(0, useRegisterAtStart(lhs)); + ins->setOperand( + 1, willHaveDifferentLIRNodes(lhs, rhs) ? use(rhs) : useAtStart(rhs)); + defineReuseInput(ins, mir, 0); + } else { + ins->setOperand(0, useRegisterAtStart(lhs)); + ins->setOperand(1, useAtStart(rhs)); + define(ins, mir); + } +} + +template void LIRGeneratorX86Shared::lowerForFPU( + LInstructionHelper<1, 2, 0>* ins, MDefinition* mir, MDefinition* lhs, + MDefinition* rhs); +template void LIRGeneratorX86Shared::lowerForFPU( + LInstructionHelper<1, 2, 1>* ins, MDefinition* mir, MDefinition* lhs, + MDefinition* rhs); + +void LIRGeneratorX86Shared::lowerForBitAndAndBranch(LBitAndAndBranch* baab, + MInstruction* mir, + MDefinition* lhs, + MDefinition* rhs) { + baab->setOperand(0, useRegisterAtStart(lhs)); + baab->setOperand(1, useRegisterOrConstantAtStart(rhs)); + add(baab, mir); +} + +void LIRGeneratorX86Shared::lowerNegI(MInstruction* ins, MDefinition* input) { + defineReuseInput(new (alloc()) LNegI(useRegisterAtStart(input)), ins, 0); +} + +void LIRGeneratorX86Shared::lowerNegI64(MInstruction* ins, MDefinition* input) { + defineInt64ReuseInput(new (alloc()) LNegI64(useInt64RegisterAtStart(input)), + ins, 0); +} + +void LIRGenerator::visitAbs(MAbs* ins) { + defineReuseInput(allocateAbs(ins, useRegisterAtStart(ins->input())), ins, 0); +} + +void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs, + MDefinition* rhs) { + // Note: If we need a negative zero check, lhs is used twice. + LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation(); + LMulI* lir = new (alloc()) + LMulI(useRegisterAtStart(lhs), + willHaveDifferentLIRNodes(lhs, rhs) ? 
useOrConstant(rhs) + : useOrConstantAtStart(rhs), + lhsCopy); + if (mul->fallible()) { + assignSnapshot(lir, mul->bailoutKind()); + } + defineReuseInput(lir, mul, 0); +} + +void LIRGeneratorX86Shared::lowerDivI(MDiv* div) { + if (div->isUnsigned()) { + lowerUDiv(div); + return; + } + + // Division instructions are slow. Division by constant denominators can be + // rewritten to use other instructions. + if (div->rhs()->isConstant()) { + int32_t rhs = div->rhs()->toConstant()->toInt32(); + + // Division by powers of two can be done by shifting, and division by + // other numbers can be done by a reciprocal multiplication technique. + int32_t shift = FloorLog2(Abs(rhs)); + if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) { + LAllocation lhs = useRegisterAtStart(div->lhs()); + LDivPowTwoI* lir; + // When truncated with maybe a non-zero remainder, we have to round the + // result toward 0. This requires an extra register to round up/down + // whether the left-hand-side is signed. + bool needRoundNeg = div->canBeNegativeDividend() && div->isTruncated(); + if (!needRoundNeg) { + // Numerator is unsigned, so does not need adjusting. + lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, rhs < 0); + } else { + // Numerator might be signed, and needs adjusting, and an extra lhs copy + // is needed to round the result of the integer division towards zero. + lir = new (alloc()) + LDivPowTwoI(lhs, useRegister(div->lhs()), shift, rhs < 0); + } + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineReuseInput(lir, div, 0); + return; + } + if (rhs != 0) { + LDivOrModConstantI* lir; + lir = new (alloc()) + LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax)); + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineFixed(lir, div, LAllocation(AnyRegister(edx))); + return; + } + } + + LDivI* lir = new (alloc()) + LDivI(useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx)); + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineFixed(lir, div, LAllocation(AnyRegister(eax))); +} + +void LIRGeneratorX86Shared::lowerModI(MMod* mod) { + if (mod->isUnsigned()) { + lowerUMod(mod); + return; + } + + if (mod->rhs()->isConstant()) { + int32_t rhs = mod->rhs()->toConstant()->toInt32(); + int32_t shift = FloorLog2(Abs(rhs)); + if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) { + LModPowTwoI* lir = + new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineReuseInput(lir, mod, 0); + return; + } + if (rhs != 0) { + LDivOrModConstantI* lir; + lir = new (alloc()) + LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx)); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineFixed(lir, mod, LAllocation(AnyRegister(eax))); + return; + } + } + + LModI* lir = new (alloc()) + LModI(useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax)); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineFixed(lir, mod, LAllocation(AnyRegister(edx))); +} + +void LIRGenerator::visitWasmNeg(MWasmNeg* ins) { + switch (ins->type()) { + case MIRType::Int32: + defineReuseInput(new (alloc()) LNegI(useRegisterAtStart(ins->input())), + ins, 0); + break; + case MIRType::Float32: + defineReuseInput(new (alloc()) LNegF(useRegisterAtStart(ins->input())), + ins, 0); + break; + case MIRType::Double: + defineReuseInput(new (alloc()) LNegD(useRegisterAtStart(ins->input())), + ins, 0); + break; + default: + 
MOZ_CRASH(); + } +} + +void LIRGeneratorX86Shared::lowerWasmSelectI(MWasmSelect* select) { + auto* lir = new (alloc()) + LWasmSelect(useRegisterAtStart(select->trueExpr()), + useAny(select->falseExpr()), useRegister(select->condExpr())); + defineReuseInput(lir, select, LWasmSelect::TrueExprIndex); +} + +void LIRGeneratorX86Shared::lowerWasmSelectI64(MWasmSelect* select) { + auto* lir = new (alloc()) LWasmSelectI64( + useInt64RegisterAtStart(select->trueExpr()), + useInt64(select->falseExpr()), useRegister(select->condExpr())); + defineInt64ReuseInput(lir, select, LWasmSelectI64::TrueExprIndex); +} + +void LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) { + MDefinition* base = ins->base(); + MOZ_ASSERT(base->type() == MIRType::Int32); + + MDefinition* boundsCheckLimit = ins->boundsCheckLimit(); + MOZ_ASSERT_IF(ins->needsBoundsCheck(), + boundsCheckLimit->type() == MIRType::Int32); + + // For simplicity, require a register if we're going to emit a bounds-check + // branch, so that we don't have special cases for constants. This should + // only happen in rare constant-folding cases since asm.js sets the minimum + // heap size based when accessed via constant. + LAllocation baseAlloc = ins->needsBoundsCheck() + ? useRegisterAtStart(base) + : useRegisterOrZeroAtStart(base); + + LAllocation limitAlloc = ins->needsBoundsCheck() + ? useRegisterAtStart(boundsCheckLimit) + : LAllocation(); + LAllocation memoryBaseAlloc = ins->hasMemoryBase() + ? useRegisterAtStart(ins->memoryBase()) + : LAllocation(); + + auto* lir = + new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, memoryBaseAlloc); + define(lir, ins); +} + +void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) { + MDefinition* base = ins->base(); + MOZ_ASSERT(base->type() == MIRType::Int32); + + MDefinition* boundsCheckLimit = ins->boundsCheckLimit(); + MOZ_ASSERT_IF(ins->needsBoundsCheck(), + boundsCheckLimit->type() == MIRType::Int32); + + // For simplicity, require a register if we're going to emit a bounds-check + // branch, so that we don't have special cases for constants. This should + // only happen in rare constant-folding cases since asm.js sets the minimum + // heap size based when accessed via constant. + LAllocation baseAlloc = ins->needsBoundsCheck() + ? useRegisterAtStart(base) + : useRegisterOrZeroAtStart(base); + + LAllocation limitAlloc = ins->needsBoundsCheck() + ? useRegisterAtStart(boundsCheckLimit) + : LAllocation(); + LAllocation memoryBaseAlloc = ins->hasMemoryBase() + ? useRegisterAtStart(ins->memoryBase()) + : LAllocation(); + + LAsmJSStoreHeap* lir = nullptr; + switch (ins->access().type()) { + case Scalar::Int8: + case Scalar::Uint8: +#ifdef JS_CODEGEN_X86 + // See comment for LIRGeneratorX86::useByteOpRegister. + lir = new (alloc()) LAsmJSStoreHeap( + baseAlloc, useFixed(ins->value(), eax), limitAlloc, memoryBaseAlloc); + break; +#endif + case Scalar::Int16: + case Scalar::Uint16: + case Scalar::Int32: + case Scalar::Uint32: + case Scalar::Float32: + case Scalar::Float64: + // For now, don't allow constant values. The immediate operand affects + // instruction layout which affects patching. 
+ lir = new (alloc()) + LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()), + limitAlloc, memoryBaseAlloc); + break; + case Scalar::Int64: + case Scalar::Simd128: + MOZ_CRASH("NYI"); + case Scalar::Uint8Clamped: + case Scalar::BigInt64: + case Scalar::BigUint64: + case Scalar::MaxTypedArrayViewType: + MOZ_CRASH("unexpected array type"); + } + add(lir, ins); +} + +void LIRGeneratorX86Shared::lowerUDiv(MDiv* div) { + if (div->rhs()->isConstant()) { + // NOTE: the result of toInt32 is coerced to uint32_t. + uint32_t rhs = div->rhs()->toConstant()->toInt32(); + int32_t shift = FloorLog2(rhs); + + LAllocation lhs = useRegisterAtStart(div->lhs()); + if (rhs != 0 && uint32_t(1) << shift == rhs) { + LDivPowTwoI* lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, false); + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineReuseInput(lir, div, 0); + } else { + LUDivOrModConstant* lir = new (alloc()) + LUDivOrModConstant(useRegister(div->lhs()), rhs, tempFixed(eax)); + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineFixed(lir, div, LAllocation(AnyRegister(edx))); + } + return; + } + + LUDivOrMod* lir = new (alloc()) LUDivOrMod( + useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx)); + if (div->fallible()) { + assignSnapshot(lir, div->bailoutKind()); + } + defineFixed(lir, div, LAllocation(AnyRegister(eax))); +} + +void LIRGeneratorX86Shared::lowerUMod(MMod* mod) { + if (mod->rhs()->isConstant()) { + uint32_t rhs = mod->rhs()->toConstant()->toInt32(); + int32_t shift = FloorLog2(rhs); + + if (rhs != 0 && uint32_t(1) << shift == rhs) { + LModPowTwoI* lir = + new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineReuseInput(lir, mod, 0); + } else { + LUDivOrModConstant* lir = new (alloc()) + LUDivOrModConstant(useRegister(mod->lhs()), rhs, tempFixed(edx)); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineFixed(lir, mod, LAllocation(AnyRegister(eax))); + } + return; + } + + LUDivOrMod* lir = new (alloc()) LUDivOrMod( + useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax)); + if (mod->fallible()) { + assignSnapshot(lir, mod->bailoutKind()); + } + defineFixed(lir, mod, LAllocation(AnyRegister(edx))); +} + +void LIRGeneratorX86Shared::lowerUrshD(MUrsh* mir) { + MDefinition* lhs = mir->lhs(); + MDefinition* rhs = mir->rhs(); + + MOZ_ASSERT(lhs->type() == MIRType::Int32); + MOZ_ASSERT(rhs->type() == MIRType::Int32); + MOZ_ASSERT(mir->type() == MIRType::Double); + +#ifdef JS_CODEGEN_X64 + static_assert(ecx == rcx); +#endif + + // Without BMI2, x86 can only shift by ecx. + LUse lhsUse = useRegisterAtStart(lhs); + LAllocation rhsAlloc; + if (rhs->isConstant()) { + rhsAlloc = useOrConstant(rhs); + } else if (Assembler::HasBMI2()) { + rhsAlloc = useRegister(rhs); + } else { + rhsAlloc = useFixed(rhs, ecx); + } + + LUrshD* lir = new (alloc()) LUrshD(lhsUse, rhsAlloc, tempCopy(lhs, 0)); + define(lir, mir); +} + +void LIRGeneratorX86Shared::lowerPowOfTwoI(MPow* mir) { + int32_t base = mir->input()->toConstant()->toInt32(); + MDefinition* power = mir->power(); + + // Shift operand should be in register ecx, unless BMI2 is available. + // x86 can't shift a non-ecx register. + LAllocation powerAlloc = + Assembler::HasBMI2() ? 
useRegister(power) : useFixed(power, ecx); + auto* lir = new (alloc()) LPowOfTwoI(powerAlloc, base); + assignSnapshot(lir, mir->bailoutKind()); + define(lir, mir); +} + +void LIRGeneratorX86Shared::lowerBigIntLsh(MBigIntLsh* ins) { + // Shift operand should be in register ecx, unless BMI2 is available. + // x86 can't shift a non-ecx register. + LDefinition shiftAlloc = Assembler::HasBMI2() ? temp() : tempFixed(ecx); + auto* lir = + new (alloc()) LBigIntLsh(useRegister(ins->lhs()), useRegister(ins->rhs()), + temp(), shiftAlloc, temp()); + define(lir, ins); + assignSafepoint(lir, ins); +} + +void LIRGeneratorX86Shared::lowerBigIntRsh(MBigIntRsh* ins) { + // Shift operand should be in register ecx, unless BMI2 is available. + // x86 can't shift a non-ecx register. + LDefinition shiftAlloc = Assembler::HasBMI2() ? temp() : tempFixed(ecx); + auto* lir = + new (alloc()) LBigIntRsh(useRegister(ins->lhs()), useRegister(ins->rhs()), + temp(), shiftAlloc, temp()); + define(lir, ins); + assignSafepoint(lir, ins); +} + +void LIRGeneratorX86Shared::lowerWasmBuiltinTruncateToInt32( + MWasmBuiltinTruncateToInt32* ins) { + MDefinition* opd = ins->input(); + MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32); + + LDefinition maybeTemp = + Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble(); + if (opd->type() == MIRType::Double) { + define(new (alloc()) LWasmBuiltinTruncateDToInt32( + useRegister(opd), useFixed(ins->instance(), InstanceReg), + maybeTemp), + ins); + return; + } + + define( + new (alloc()) LWasmBuiltinTruncateFToInt32( + useRegister(opd), useFixed(ins->instance(), InstanceReg), maybeTemp), + ins); +} + +void LIRGeneratorX86Shared::lowerTruncateDToInt32(MTruncateToInt32* ins) { + MDefinition* opd = ins->input(); + MOZ_ASSERT(opd->type() == MIRType::Double); + + LDefinition maybeTemp = + Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble(); + define(new (alloc()) LTruncateDToInt32(useRegister(opd), maybeTemp), ins); +} + +void LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32* ins) { + MDefinition* opd = ins->input(); + MOZ_ASSERT(opd->type() == MIRType::Float32); + + LDefinition maybeTemp = + Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32(); + define(new (alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins); +} + +void LIRGeneratorX86Shared::lowerCompareExchangeTypedArrayElement( + MCompareExchangeTypedArrayElement* ins, bool useI386ByteRegisters) { + MOZ_ASSERT(ins->arrayType() != Scalar::Float32); + MOZ_ASSERT(ins->arrayType() != Scalar::Float64); + + MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); + MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); + + const LUse elements = useRegister(ins->elements()); + const LAllocation index = + useRegisterOrIndexConstant(ins->index(), ins->arrayType()); + + // If the target is a floating register then we need a temp at the + // lower level; that temp must be eax. + // + // Otherwise the target (if used) is an integer register, which + // must be eax. If the target is not used the machine code will + // still clobber eax, so just pretend it's used. + // + // oldval must be in a register. + // + // newval must be in a register. If the source is a byte array + // then newval must be a register that has a byte size: on x86 + // this must be ebx, ecx, or edx (eax is taken for the output). + // + // Bug #1077036 describes some further optimization opportunities. 
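  // [Editor's note, not part of the upstream diff] For reference, the eax
  // constraint above follows from LOCK CMPXCHG's fixed use of eax. A rough
  // scalar model of that instruction (names are illustrative):
  //
  //   bool cmpxchg32(uint32_t* mem, uint32_t& eax, uint32_t newval) {
  //     uint32_t expected = eax;
  //     uint32_t observed = *mem;      // compare and store happen atomically
  //     if (observed == expected) {
  //       *mem = newval;
  //     }
  //     eax = observed;                // eax always ends up holding *mem
  //     return observed == expected;   // reflected in ZF
  //   }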
+ + bool fixedOutput = false; + LDefinition tempDef = LDefinition::BogusTemp(); + LAllocation newval; + if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) { + tempDef = tempFixed(eax); + newval = useRegister(ins->newval()); + } else { + fixedOutput = true; + if (useI386ByteRegisters && ins->isByteArray()) { + newval = useFixed(ins->newval(), ebx); + } else { + newval = useRegister(ins->newval()); + } + } + + const LAllocation oldval = useRegister(ins->oldval()); + + LCompareExchangeTypedArrayElement* lir = + new (alloc()) LCompareExchangeTypedArrayElement(elements, index, oldval, + newval, tempDef); + + if (fixedOutput) { + defineFixed(lir, ins, LAllocation(AnyRegister(eax))); + } else { + define(lir, ins); + } +} + +void LIRGeneratorX86Shared::lowerAtomicExchangeTypedArrayElement( + MAtomicExchangeTypedArrayElement* ins, bool useI386ByteRegisters) { + MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32); + + MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); + MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); + + const LUse elements = useRegister(ins->elements()); + const LAllocation index = + useRegisterOrIndexConstant(ins->index(), ins->arrayType()); + const LAllocation value = useRegister(ins->value()); + + // The underlying instruction is XCHG, which can operate on any + // register. + // + // If the target is a floating register (for Uint32) then we need + // a temp into which to exchange. + // + // If the source is a byte array then we need a register that has + // a byte size; in this case -- on x86 only -- pin the output to + // an appropriate register and use that as a temp in the back-end. + + LDefinition tempDef = LDefinition::BogusTemp(); + if (ins->arrayType() == Scalar::Uint32) { + MOZ_ASSERT(ins->type() == MIRType::Double); + tempDef = temp(); + } + + LAtomicExchangeTypedArrayElement* lir = new (alloc()) + LAtomicExchangeTypedArrayElement(elements, index, value, tempDef); + + if (useI386ByteRegisters && ins->isByteArray()) { + defineFixed(lir, ins, LAllocation(AnyRegister(eax))); + } else { + define(lir, ins); + } +} + +void LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop( + MAtomicTypedArrayElementBinop* ins, bool useI386ByteRegisters) { + MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped); + MOZ_ASSERT(ins->arrayType() != Scalar::Float32); + MOZ_ASSERT(ins->arrayType() != Scalar::Float64); + + MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); + MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); + + const LUse elements = useRegister(ins->elements()); + const LAllocation index = + useRegisterOrIndexConstant(ins->index(), ins->arrayType()); + + // Case 1: the result of the operation is not used. + // + // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND, + // LOCK OR, or LOCK XOR. We can do this even for the Uint32 case. + + if (ins->isForEffect()) { + LAllocation value; + if (useI386ByteRegisters && ins->isByteArray() && + !ins->value()->isConstant()) { + value = useFixed(ins->value(), ebx); + } else { + value = useRegisterOrConstant(ins->value()); + } + + LAtomicTypedArrayElementBinopForEffect* lir = new (alloc()) + LAtomicTypedArrayElementBinopForEffect(elements, index, value); + + add(lir, ins); + return; + } + + // Case 2: the result of the operation is used. + // + // For ADD and SUB we'll use XADD: + // + // movl src, output + // lock xaddl output, mem + // + // For the 8-bit variants XADD needs a byte register for the output. 
+ // + // For AND/OR/XOR we need to use a CMPXCHG loop: + // + // movl *mem, eax + // L: mov eax, temp + // andl src, temp + // lock cmpxchg temp, mem ; reads eax also + // jnz L + // ; result in eax + // + // Note the placement of L, cmpxchg will update eax with *mem if + // *mem does not have the expected value, so reloading it at the + // top of the loop would be redundant. + // + // If the array is not a uint32 array then: + // - eax should be the output (one result of the cmpxchg) + // - there is a temp, which must have a byte register if + // the array has 1-byte elements elements + // + // If the array is a uint32 array then: + // - eax is the first temp + // - we also need a second temp + // + // There are optimization opportunities: + // - better register allocation in the x86 8-bit case, Bug #1077036. + + bool bitOp = !(ins->operation() == AtomicFetchAddOp || + ins->operation() == AtomicFetchSubOp); + bool fixedOutput = true; + bool reuseInput = false; + LDefinition tempDef1 = LDefinition::BogusTemp(); + LDefinition tempDef2 = LDefinition::BogusTemp(); + LAllocation value; + + if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) { + value = useRegisterOrConstant(ins->value()); + fixedOutput = false; + if (bitOp) { + tempDef1 = tempFixed(eax); + tempDef2 = temp(); + } else { + tempDef1 = temp(); + } + } else if (useI386ByteRegisters && ins->isByteArray()) { + if (ins->value()->isConstant()) { + value = useRegisterOrConstant(ins->value()); + } else { + value = useFixed(ins->value(), ebx); + } + if (bitOp) { + tempDef1 = tempFixed(ecx); + } + } else if (bitOp) { + value = useRegisterOrConstant(ins->value()); + tempDef1 = temp(); + } else if (ins->value()->isConstant()) { + fixedOutput = false; + value = useRegisterOrConstant(ins->value()); + } else { + fixedOutput = false; + reuseInput = true; + value = useRegisterAtStart(ins->value()); + } + + LAtomicTypedArrayElementBinop* lir = new (alloc()) + LAtomicTypedArrayElementBinop(elements, index, value, tempDef1, tempDef2); + + if (fixedOutput) { + defineFixed(lir, ins, LAllocation(AnyRegister(eax))); + } else if (reuseInput) { + defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp); + } else { + define(lir, ins); + } +} + +void LIRGenerator::visitCopySign(MCopySign* ins) { + MDefinition* lhs = ins->lhs(); + MDefinition* rhs = ins->rhs(); + + MOZ_ASSERT(IsFloatingPointType(lhs->type())); + MOZ_ASSERT(lhs->type() == rhs->type()); + MOZ_ASSERT(lhs->type() == ins->type()); + + LInstructionHelper<1, 2, 2>* lir; + if (lhs->type() == MIRType::Double) { + lir = new (alloc()) LCopySignD(); + } else { + lir = new (alloc()) LCopySignF(); + } + + // As lowerForFPU, but we want rhs to be in a FP register too. + lir->setOperand(0, useRegisterAtStart(lhs)); + if (!Assembler::HasAVX()) { + lir->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs) + ? useRegister(rhs) + : useRegisterAtStart(rhs)); + defineReuseInput(lir, ins, 0); + } else { + lir->setOperand(1, useRegisterAtStart(rhs)); + define(lir, ins); + } +} + +// These lowerings are really x86-shared but some Masm APIs are not yet +// available on x86. + +// Ternary and binary operators require the dest register to be the same as +// their first input register, leading to a pattern of useRegisterAtStart + +// defineReuseInput. 
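// [Editor's note, not part of the upstream diff] The pattern described above,
// shown as a simplified sketch (cf. lowerForFPU earlier in this file): legacy
// SSE encodings are destructive (e.g. ADDPS xmm_dst, xmm_src overwrites
// xmm_dst), so the LIR output must reuse the first input, whereas the AVX
// three-operand forms (e.g. VADDPS dst, src1, src2) let the output be a fresh
// register.
//
//   if (!Assembler::HasAVX()) {
//     ins->setOperand(0, useRegisterAtStart(lhs));  // lhs register is clobbered
//     ins->setOperand(1, useRegister(rhs));
//     defineReuseInput(ins, mir, 0);                // output == operand 0
//   } else {
//     ins->setOperand(0, useRegisterAtStart(lhs));
//     ins->setOperand(1, useRegisterAtStart(rhs));
//     define(ins, mir);                             // output gets its own register
//   }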
+ +void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MOZ_ASSERT(ins->v0()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->v1()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->v2()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + switch (ins->simdOp()) { + case wasm::SimdOp::V128Bitselect: { + // Enforcing lhs == output avoids one setup move. We would like to also + // enforce merging the control with the temp (with + // usRegisterAtStart(control) and tempCopy()), but the register allocator + // ignores those constraints at present. + auto* lir = new (alloc()) LWasmTernarySimd128( + ins->simdOp(), useRegisterAtStart(ins->v0()), useRegister(ins->v1()), + useRegister(ins->v2()), tempSimd128()); + defineReuseInput(lir, ins, LWasmTernarySimd128::V0); + break; + } + case wasm::SimdOp::F32x4RelaxedFma: + case wasm::SimdOp::F32x4RelaxedFnma: + case wasm::SimdOp::F64x2RelaxedFma: + case wasm::SimdOp::F64x2RelaxedFnma: { + auto* lir = new (alloc()) LWasmTernarySimd128( + ins->simdOp(), useRegister(ins->v0()), useRegister(ins->v1()), + useRegisterAtStart(ins->v2())); + defineReuseInput(lir, ins, LWasmTernarySimd128::V2); + break; + } + case wasm::SimdOp::I32x4DotI8x16I7x16AddS: { + auto* lir = new (alloc()) LWasmTernarySimd128( + ins->simdOp(), useRegister(ins->v0()), useRegister(ins->v1()), + useRegisterAtStart(ins->v2())); + defineReuseInput(lir, ins, LWasmTernarySimd128::V2); + break; + } + case wasm::SimdOp::I8x16RelaxedLaneSelect: + case wasm::SimdOp::I16x8RelaxedLaneSelect: + case wasm::SimdOp::I32x4RelaxedLaneSelect: + case wasm::SimdOp::I64x2RelaxedLaneSelect: { + if (Assembler::HasAVX()) { + auto* lir = new (alloc()) LWasmTernarySimd128( + ins->simdOp(), useRegisterAtStart(ins->v0()), + useRegisterAtStart(ins->v1()), useRegisterAtStart(ins->v2())); + define(lir, ins); + } else { + auto* lir = new (alloc()) LWasmTernarySimd128( + ins->simdOp(), useRegister(ins->v0()), + useRegisterAtStart(ins->v1()), useFixed(ins->v2(), vmm0)); + defineReuseInput(lir, ins, LWasmTernarySimd128::V1); + } + break; + } + default: + MOZ_CRASH("NYI"); + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MDefinition* lhs = ins->lhs(); + MDefinition* rhs = ins->rhs(); + wasm::SimdOp op = ins->simdOp(); + + MOZ_ASSERT(lhs->type() == MIRType::Simd128); + MOZ_ASSERT(rhs->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + // Note MWasmBinarySimd128::foldsTo has already specialized operations that + // have a constant operand, so this takes care of more general cases of + // reordering, see ReorderCommutative. + if (ins->isCommutative()) { + ReorderCommutative(&lhs, &rhs, ins); + } + + // Swap operands and change operation if necessary, these are all x86/x64 + // dependent transformations. Except where noted, this is about avoiding + // unnecessary moves and fixups in the code generator macros. + bool swap = false; + switch (op) { + case wasm::SimdOp::V128AndNot: { + // Code generation requires the operands to be reversed. 
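      // [Editor's note, not part of the upstream diff] The reversal follows
      // from PANDN's semantics: PANDN dst, src computes (~dst) & src, while
      // wasm v128.andnot(a, b) computes a & (~b). A scalar model:
      //
      //   uint64_t wasmAndNot(uint64_t a, uint64_t b) { return a & ~b; }
      //   uint64_t x86Pandn(uint64_t dst, uint64_t src) { return ~dst & src; }
      //   // wasmAndNot(a, b) == x86Pandn(b, a), hence the swapped operands.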
+ swap = true; + break; + } + case wasm::SimdOp::I8x16LtS: { + swap = true; + op = wasm::SimdOp::I8x16GtS; + break; + } + case wasm::SimdOp::I8x16GeS: { + swap = true; + op = wasm::SimdOp::I8x16LeS; + break; + } + case wasm::SimdOp::I16x8LtS: { + swap = true; + op = wasm::SimdOp::I16x8GtS; + break; + } + case wasm::SimdOp::I16x8GeS: { + swap = true; + op = wasm::SimdOp::I16x8LeS; + break; + } + case wasm::SimdOp::I32x4LtS: { + swap = true; + op = wasm::SimdOp::I32x4GtS; + break; + } + case wasm::SimdOp::I32x4GeS: { + swap = true; + op = wasm::SimdOp::I32x4LeS; + break; + } + case wasm::SimdOp::F32x4Gt: { + swap = true; + op = wasm::SimdOp::F32x4Lt; + break; + } + case wasm::SimdOp::F32x4Ge: { + swap = true; + op = wasm::SimdOp::F32x4Le; + break; + } + case wasm::SimdOp::F64x2Gt: { + swap = true; + op = wasm::SimdOp::F64x2Lt; + break; + } + case wasm::SimdOp::F64x2Ge: { + swap = true; + op = wasm::SimdOp::F64x2Le; + break; + } + case wasm::SimdOp::F32x4PMin: + case wasm::SimdOp::F32x4PMax: + case wasm::SimdOp::F64x2PMin: + case wasm::SimdOp::F64x2PMax: { + // Code generation requires the operations to be reversed (the rhs is the + // output register). + swap = true; + break; + } + default: + break; + } + if (swap) { + MDefinition* tmp = lhs; + lhs = rhs; + rhs = tmp; + } + + // Allocate temp registers + LDefinition tempReg0 = LDefinition::BogusTemp(); + LDefinition tempReg1 = LDefinition::BogusTemp(); + switch (op) { + case wasm::SimdOp::I64x2Mul: + tempReg0 = tempSimd128(); + break; + case wasm::SimdOp::F32x4Min: + case wasm::SimdOp::F32x4Max: + case wasm::SimdOp::F64x2Min: + case wasm::SimdOp::F64x2Max: + tempReg0 = tempSimd128(); + tempReg1 = tempSimd128(); + break; + case wasm::SimdOp::I64x2LtS: + case wasm::SimdOp::I64x2GtS: + case wasm::SimdOp::I64x2LeS: + case wasm::SimdOp::I64x2GeS: + // The compareForOrderingInt64x2AVX implementation does not require + // temps but needs SSE4.2 support. Checking if both AVX and SSE4.2 + // are enabled. + if (!(Assembler::HasAVX() && Assembler::HasSSE42())) { + tempReg0 = tempSimd128(); + tempReg1 = tempSimd128(); + } + break; + default: + break; + } + + // For binary ops, without AVX support, the Masm API always is usually + // (rhs, lhsDest) and requires AtStart+ReuseInput for the lhs. + // + // For a few ops, the API is actually (rhsDest, lhs) and the rules are the + // same but the reversed. We swapped operands above; they will be swapped + // again in the code generator to emit the right code. + // + // If AVX support is enabled, some binary ops can use output as destination, + // useRegisterAtStart is applied for both operands and no need for ReuseInput. 
+ + switch (op) { + case wasm::SimdOp::I8x16AvgrU: + case wasm::SimdOp::I16x8AvgrU: + case wasm::SimdOp::I8x16Add: + case wasm::SimdOp::I8x16AddSatS: + case wasm::SimdOp::I8x16AddSatU: + case wasm::SimdOp::I8x16Sub: + case wasm::SimdOp::I8x16SubSatS: + case wasm::SimdOp::I8x16SubSatU: + case wasm::SimdOp::I16x8Mul: + case wasm::SimdOp::I16x8MinS: + case wasm::SimdOp::I16x8MinU: + case wasm::SimdOp::I16x8MaxS: + case wasm::SimdOp::I16x8MaxU: + case wasm::SimdOp::I32x4Add: + case wasm::SimdOp::I32x4Sub: + case wasm::SimdOp::I32x4Mul: + case wasm::SimdOp::I32x4MinS: + case wasm::SimdOp::I32x4MinU: + case wasm::SimdOp::I32x4MaxS: + case wasm::SimdOp::I32x4MaxU: + case wasm::SimdOp::I64x2Add: + case wasm::SimdOp::I64x2Sub: + case wasm::SimdOp::I64x2Mul: + case wasm::SimdOp::F32x4Add: + case wasm::SimdOp::F32x4Sub: + case wasm::SimdOp::F32x4Mul: + case wasm::SimdOp::F32x4Div: + case wasm::SimdOp::F64x2Add: + case wasm::SimdOp::F64x2Sub: + case wasm::SimdOp::F64x2Mul: + case wasm::SimdOp::F64x2Div: + case wasm::SimdOp::F32x4Eq: + case wasm::SimdOp::F32x4Ne: + case wasm::SimdOp::F32x4Lt: + case wasm::SimdOp::F32x4Le: + case wasm::SimdOp::F64x2Eq: + case wasm::SimdOp::F64x2Ne: + case wasm::SimdOp::F64x2Lt: + case wasm::SimdOp::F64x2Le: + case wasm::SimdOp::F32x4PMin: + case wasm::SimdOp::F32x4PMax: + case wasm::SimdOp::F64x2PMin: + case wasm::SimdOp::F64x2PMax: + case wasm::SimdOp::I8x16Swizzle: + case wasm::SimdOp::I8x16RelaxedSwizzle: + case wasm::SimdOp::I8x16Eq: + case wasm::SimdOp::I8x16Ne: + case wasm::SimdOp::I8x16GtS: + case wasm::SimdOp::I8x16LeS: + case wasm::SimdOp::I8x16LtU: + case wasm::SimdOp::I8x16GtU: + case wasm::SimdOp::I8x16LeU: + case wasm::SimdOp::I8x16GeU: + case wasm::SimdOp::I16x8Eq: + case wasm::SimdOp::I16x8Ne: + case wasm::SimdOp::I16x8GtS: + case wasm::SimdOp::I16x8LeS: + case wasm::SimdOp::I16x8LtU: + case wasm::SimdOp::I16x8GtU: + case wasm::SimdOp::I16x8LeU: + case wasm::SimdOp::I16x8GeU: + case wasm::SimdOp::I32x4Eq: + case wasm::SimdOp::I32x4Ne: + case wasm::SimdOp::I32x4GtS: + case wasm::SimdOp::I32x4LeS: + case wasm::SimdOp::I32x4LtU: + case wasm::SimdOp::I32x4GtU: + case wasm::SimdOp::I32x4LeU: + case wasm::SimdOp::I32x4GeU: + case wasm::SimdOp::I64x2Eq: + case wasm::SimdOp::I64x2Ne: + case wasm::SimdOp::I64x2LtS: + case wasm::SimdOp::I64x2GtS: + case wasm::SimdOp::I64x2LeS: + case wasm::SimdOp::I64x2GeS: + case wasm::SimdOp::V128And: + case wasm::SimdOp::V128Or: + case wasm::SimdOp::V128Xor: + case wasm::SimdOp::V128AndNot: + case wasm::SimdOp::F32x4Min: + case wasm::SimdOp::F32x4Max: + case wasm::SimdOp::F64x2Min: + case wasm::SimdOp::F64x2Max: + case wasm::SimdOp::I8x16NarrowI16x8S: + case wasm::SimdOp::I8x16NarrowI16x8U: + case wasm::SimdOp::I16x8NarrowI32x4S: + case wasm::SimdOp::I16x8NarrowI32x4U: + case wasm::SimdOp::I32x4DotI16x8S: + case wasm::SimdOp::I16x8ExtmulLowI8x16S: + case wasm::SimdOp::I16x8ExtmulHighI8x16S: + case wasm::SimdOp::I16x8ExtmulLowI8x16U: + case wasm::SimdOp::I16x8ExtmulHighI8x16U: + case wasm::SimdOp::I32x4ExtmulLowI16x8S: + case wasm::SimdOp::I32x4ExtmulHighI16x8S: + case wasm::SimdOp::I32x4ExtmulLowI16x8U: + case wasm::SimdOp::I32x4ExtmulHighI16x8U: + case wasm::SimdOp::I64x2ExtmulLowI32x4S: + case wasm::SimdOp::I64x2ExtmulHighI32x4S: + case wasm::SimdOp::I64x2ExtmulLowI32x4U: + case wasm::SimdOp::I64x2ExtmulHighI32x4U: + case wasm::SimdOp::I16x8Q15MulrSatS: + case wasm::SimdOp::F32x4RelaxedMin: + case wasm::SimdOp::F32x4RelaxedMax: + case wasm::SimdOp::F64x2RelaxedMin: + case wasm::SimdOp::F64x2RelaxedMax: + case 
wasm::SimdOp::I16x8RelaxedQ15MulrS: + case wasm::SimdOp::I16x8DotI8x16I7x16S: + case wasm::SimdOp::MozPMADDUBSW: + if (isThreeOpAllowed()) { + auto* lir = new (alloc()) + LWasmBinarySimd128(op, useRegisterAtStart(lhs), + useRegisterAtStart(rhs), tempReg0, tempReg1); + define(lir, ins); + break; + } + [[fallthrough]]; + default: { + LAllocation lhsDestAlloc = useRegisterAtStart(lhs); + LAllocation rhsAlloc = willHaveDifferentLIRNodes(lhs, rhs) + ? useRegister(rhs) + : useRegisterAtStart(rhs); + auto* lir = new (alloc()) + LWasmBinarySimd128(op, lhsDestAlloc, rhsAlloc, tempReg0, tempReg1); + defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest); + break; + } + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +#ifdef ENABLE_WASM_SIMD +bool MWasmTernarySimd128::specializeBitselectConstantMaskAsShuffle( + int8_t shuffle[16]) { + if (simdOp() != wasm::SimdOp::V128Bitselect) { + return false; + } + + // Optimization when control vector is a mask with all 0 or all 1 per lane. + // On x86, there is no bitselect, blend operations will be a win, + // e.g. via PBLENDVB or PBLENDW. + SimdConstant constant = static_cast<MWasmFloatConstant*>(v2())->toSimd128(); + const SimdConstant::I8x16& bytes = constant.asInt8x16(); + for (int8_t i = 0; i < 16; i++) { + if (bytes[i] == -1) { + shuffle[i] = i + 16; + } else if (bytes[i] == 0) { + shuffle[i] = i; + } else { + return false; + } + } + return true; +} +bool MWasmTernarySimd128::canRelaxBitselect() { + wasm::SimdOp simdOp; + if (v2()->isWasmBinarySimd128()) { + simdOp = v2()->toWasmBinarySimd128()->simdOp(); + } else if (v2()->isWasmBinarySimd128WithConstant()) { + simdOp = v2()->toWasmBinarySimd128WithConstant()->simdOp(); + } else { + return false; + } + switch (simdOp) { + case wasm::SimdOp::I8x16Eq: + case wasm::SimdOp::I8x16Ne: + case wasm::SimdOp::I8x16GtS: + case wasm::SimdOp::I8x16GeS: + case wasm::SimdOp::I8x16LtS: + case wasm::SimdOp::I8x16LeS: + case wasm::SimdOp::I8x16GtU: + case wasm::SimdOp::I8x16GeU: + case wasm::SimdOp::I8x16LtU: + case wasm::SimdOp::I8x16LeU: + case wasm::SimdOp::I16x8Eq: + case wasm::SimdOp::I16x8Ne: + case wasm::SimdOp::I16x8GtS: + case wasm::SimdOp::I16x8GeS: + case wasm::SimdOp::I16x8LtS: + case wasm::SimdOp::I16x8LeS: + case wasm::SimdOp::I16x8GtU: + case wasm::SimdOp::I16x8GeU: + case wasm::SimdOp::I16x8LtU: + case wasm::SimdOp::I16x8LeU: + case wasm::SimdOp::I32x4Eq: + case wasm::SimdOp::I32x4Ne: + case wasm::SimdOp::I32x4GtS: + case wasm::SimdOp::I32x4GeS: + case wasm::SimdOp::I32x4LtS: + case wasm::SimdOp::I32x4LeS: + case wasm::SimdOp::I32x4GtU: + case wasm::SimdOp::I32x4GeU: + case wasm::SimdOp::I32x4LtU: + case wasm::SimdOp::I32x4LeU: + case wasm::SimdOp::I64x2Eq: + case wasm::SimdOp::I64x2Ne: + case wasm::SimdOp::I64x2GtS: + case wasm::SimdOp::I64x2GeS: + case wasm::SimdOp::I64x2LtS: + case wasm::SimdOp::I64x2LeS: + case wasm::SimdOp::F32x4Eq: + case wasm::SimdOp::F32x4Ne: + case wasm::SimdOp::F32x4Gt: + case wasm::SimdOp::F32x4Ge: + case wasm::SimdOp::F32x4Lt: + case wasm::SimdOp::F32x4Le: + case wasm::SimdOp::F64x2Eq: + case wasm::SimdOp::F64x2Ne: + case wasm::SimdOp::F64x2Gt: + case wasm::SimdOp::F64x2Ge: + case wasm::SimdOp::F64x2Lt: + case wasm::SimdOp::F64x2Le: + return true; + default: + break; + } + return false; +} + +bool MWasmBinarySimd128::canPmaddubsw() { + MOZ_ASSERT(Assembler::HasSSE3()); + return true; +} +#endif + +bool MWasmBinarySimd128::specializeForConstantRhs() { + // The order follows MacroAssembler.h, generally + switch (simdOp()) { + // Operations implemented by a single native 
instruction where it is + // plausible that the rhs (after commutation if available) could be a + // constant. + // + // Swizzle is not here because it was handled earlier in the pipeline. + // + // Integer compares >= and < are not here because they are not supported in + // the hardware. + // + // Floating compares are not here because our patching machinery can't + // handle them yet. + // + // Floating-point min and max (including pmin and pmax) are not here because + // they are not straightforward to implement. + case wasm::SimdOp::I8x16Add: + case wasm::SimdOp::I16x8Add: + case wasm::SimdOp::I32x4Add: + case wasm::SimdOp::I64x2Add: + case wasm::SimdOp::I8x16Sub: + case wasm::SimdOp::I16x8Sub: + case wasm::SimdOp::I32x4Sub: + case wasm::SimdOp::I64x2Sub: + case wasm::SimdOp::I16x8Mul: + case wasm::SimdOp::I32x4Mul: + case wasm::SimdOp::I8x16AddSatS: + case wasm::SimdOp::I8x16AddSatU: + case wasm::SimdOp::I16x8AddSatS: + case wasm::SimdOp::I16x8AddSatU: + case wasm::SimdOp::I8x16SubSatS: + case wasm::SimdOp::I8x16SubSatU: + case wasm::SimdOp::I16x8SubSatS: + case wasm::SimdOp::I16x8SubSatU: + case wasm::SimdOp::I8x16MinS: + case wasm::SimdOp::I8x16MinU: + case wasm::SimdOp::I16x8MinS: + case wasm::SimdOp::I16x8MinU: + case wasm::SimdOp::I32x4MinS: + case wasm::SimdOp::I32x4MinU: + case wasm::SimdOp::I8x16MaxS: + case wasm::SimdOp::I8x16MaxU: + case wasm::SimdOp::I16x8MaxS: + case wasm::SimdOp::I16x8MaxU: + case wasm::SimdOp::I32x4MaxS: + case wasm::SimdOp::I32x4MaxU: + case wasm::SimdOp::V128And: + case wasm::SimdOp::V128Or: + case wasm::SimdOp::V128Xor: + case wasm::SimdOp::I8x16Eq: + case wasm::SimdOp::I8x16Ne: + case wasm::SimdOp::I8x16GtS: + case wasm::SimdOp::I8x16LeS: + case wasm::SimdOp::I16x8Eq: + case wasm::SimdOp::I16x8Ne: + case wasm::SimdOp::I16x8GtS: + case wasm::SimdOp::I16x8LeS: + case wasm::SimdOp::I32x4Eq: + case wasm::SimdOp::I32x4Ne: + case wasm::SimdOp::I32x4GtS: + case wasm::SimdOp::I32x4LeS: + case wasm::SimdOp::I64x2Mul: + case wasm::SimdOp::F32x4Eq: + case wasm::SimdOp::F32x4Ne: + case wasm::SimdOp::F32x4Lt: + case wasm::SimdOp::F32x4Le: + case wasm::SimdOp::F64x2Eq: + case wasm::SimdOp::F64x2Ne: + case wasm::SimdOp::F64x2Lt: + case wasm::SimdOp::F64x2Le: + case wasm::SimdOp::I32x4DotI16x8S: + case wasm::SimdOp::F32x4Add: + case wasm::SimdOp::F64x2Add: + case wasm::SimdOp::F32x4Sub: + case wasm::SimdOp::F64x2Sub: + case wasm::SimdOp::F32x4Div: + case wasm::SimdOp::F64x2Div: + case wasm::SimdOp::F32x4Mul: + case wasm::SimdOp::F64x2Mul: + case wasm::SimdOp::I8x16NarrowI16x8S: + case wasm::SimdOp::I8x16NarrowI16x8U: + case wasm::SimdOp::I16x8NarrowI32x4S: + case wasm::SimdOp::I16x8NarrowI32x4U: + return true; + default: + return false; + } +} + +void LIRGenerator::visitWasmBinarySimd128WithConstant( + MWasmBinarySimd128WithConstant* ins) { +#ifdef ENABLE_WASM_SIMD + MDefinition* lhs = ins->lhs(); + + MOZ_ASSERT(lhs->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + // Allocate temp registers + LDefinition tempReg = LDefinition::BogusTemp(); + switch (ins->simdOp()) { + case wasm::SimdOp::I64x2Mul: + tempReg = tempSimd128(); + break; + default: + break; + } + + if (isThreeOpAllowed()) { + // The non-destructive versions of instructions will be available + // when AVX is enabled. 
+ LAllocation lhsAlloc = useRegisterAtStart(lhs); + auto* lir = new (alloc()) + LWasmBinarySimd128WithConstant(lhsAlloc, ins->rhs(), tempReg); + define(lir, ins); + } else { + // Always beneficial to reuse the lhs register here, see discussion in + // visitWasmBinarySimd128() and also code in specializeForConstantRhs(). + LAllocation lhsDestAlloc = useRegisterAtStart(lhs); + auto* lir = new (alloc()) + LWasmBinarySimd128WithConstant(lhsDestAlloc, ins->rhs(), tempReg); + defineReuseInput(lir, ins, LWasmBinarySimd128WithConstant::LhsDest); + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MDefinition* lhs = ins->lhs(); + MDefinition* rhs = ins->rhs(); + + MOZ_ASSERT(lhs->type() == MIRType::Simd128); + MOZ_ASSERT(rhs->type() == MIRType::Int32); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + if (rhs->isConstant()) { + int32_t shiftCountMask; + switch (ins->simdOp()) { + case wasm::SimdOp::I8x16Shl: + case wasm::SimdOp::I8x16ShrU: + case wasm::SimdOp::I8x16ShrS: + shiftCountMask = 7; + break; + case wasm::SimdOp::I16x8Shl: + case wasm::SimdOp::I16x8ShrU: + case wasm::SimdOp::I16x8ShrS: + shiftCountMask = 15; + break; + case wasm::SimdOp::I32x4Shl: + case wasm::SimdOp::I32x4ShrU: + case wasm::SimdOp::I32x4ShrS: + shiftCountMask = 31; + break; + case wasm::SimdOp::I64x2Shl: + case wasm::SimdOp::I64x2ShrU: + case wasm::SimdOp::I64x2ShrS: + shiftCountMask = 63; + break; + default: + MOZ_CRASH("Unexpected shift operation"); + } + + int32_t shiftCount = rhs->toConstant()->toInt32() & shiftCountMask; + if (shiftCount == shiftCountMask) { + // Check if possible to apply sign replication optimization. + // For some ops the input shall be reused. + switch (ins->simdOp()) { + case wasm::SimdOp::I8x16ShrS: { + auto* lir = + new (alloc()) LWasmSignReplicationSimd128(useRegister(lhs)); + define(lir, ins); + return; + } + case wasm::SimdOp::I16x8ShrS: + case wasm::SimdOp::I32x4ShrS: + case wasm::SimdOp::I64x2ShrS: { + auto* lir = new (alloc()) + LWasmSignReplicationSimd128(useRegisterAtStart(lhs)); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. + defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } + return; + } + default: + break; + } + } + +# ifdef DEBUG + js::wasm::ReportSimdAnalysis("shift -> constant shift"); +# endif + auto* lir = new (alloc()) + LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. + defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } + return; + } + +# ifdef DEBUG + js::wasm::ReportSimdAnalysis("shift -> variable shift"); +# endif + + LDefinition tempReg = LDefinition::BogusTemp(); + switch (ins->simdOp()) { + case wasm::SimdOp::I8x16Shl: + case wasm::SimdOp::I8x16ShrS: + case wasm::SimdOp::I8x16ShrU: + case wasm::SimdOp::I64x2ShrS: + tempReg = tempSimd128(); + break; + default: + break; + } + + // Reusing the input if possible is never detrimental. 
+ LAllocation lhsDestAlloc = useRegisterAtStart(lhs); + LAllocation rhsAlloc = useRegisterAtStart(rhs); + auto* lir = + new (alloc()) LWasmVariableShiftSimd128(lhsDestAlloc, rhsAlloc, tempReg); + defineReuseInput(lir, ins, LWasmVariableShiftSimd128::LhsDest); +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->rhs()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + SimdShuffle s = ins->shuffle(); + switch (s.opd) { + case SimdShuffle::Operand::LEFT: + case SimdShuffle::Operand::RIGHT: { + LAllocation src; + bool reuse = false; + switch (*s.permuteOp) { + case SimdPermuteOp::MOVE: + reuse = true; + break; + case SimdPermuteOp::BROADCAST_8x16: + case SimdPermuteOp::BROADCAST_16x8: + case SimdPermuteOp::PERMUTE_8x16: + case SimdPermuteOp::PERMUTE_16x8: + case SimdPermuteOp::PERMUTE_32x4: + case SimdPermuteOp::ROTATE_RIGHT_8x16: + case SimdPermuteOp::SHIFT_LEFT_8x16: + case SimdPermuteOp::SHIFT_RIGHT_8x16: + case SimdPermuteOp::REVERSE_16x8: + case SimdPermuteOp::REVERSE_32x4: + case SimdPermuteOp::REVERSE_64x2: + // No need to reuse registers when VEX instructions are enabled. + reuse = !Assembler::HasAVX(); + break; + default: + MOZ_CRASH("Unexpected operator"); + } + if (s.opd == SimdShuffle::Operand::LEFT) { + src = useRegisterAtStart(ins->lhs()); + } else { + src = useRegisterAtStart(ins->rhs()); + } + auto* lir = + new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control); + if (reuse) { + defineReuseInput(lir, ins, LWasmPermuteSimd128::Src); + } else { + define(lir, ins); + } + break; + } + case SimdShuffle::Operand::BOTH: + case SimdShuffle::Operand::BOTH_SWAPPED: { + LDefinition temp = LDefinition::BogusTemp(); + switch (*s.shuffleOp) { + case SimdShuffleOp::BLEND_8x16: + temp = Assembler::HasAVX() ? tempSimd128() : tempFixed(xmm0); + break; + default: + break; + } + if (isThreeOpAllowed()) { + LAllocation lhs; + LAllocation rhs; + if (s.opd == SimdShuffle::Operand::BOTH) { + lhs = useRegisterAtStart(ins->lhs()); + rhs = useRegisterAtStart(ins->rhs()); + } else { + lhs = useRegisterAtStart(ins->rhs()); + rhs = useRegisterAtStart(ins->lhs()); + } + auto* lir = new (alloc()) + LWasmShuffleSimd128(lhs, rhs, temp, *s.shuffleOp, s.control); + define(lir, ins); + } else { + LAllocation lhs; + LAllocation rhs; + if (s.opd == SimdShuffle::Operand::BOTH) { + lhs = useRegisterAtStart(ins->lhs()); + rhs = useRegister(ins->rhs()); + } else { + lhs = useRegisterAtStart(ins->rhs()); + rhs = useRegister(ins->lhs()); + } + auto* lir = new (alloc()) + LWasmShuffleSimd128(lhs, rhs, temp, *s.shuffleOp, s.control); + defineReuseInput(lir, ins, LWasmShuffleSimd128::LhsDest); + } + break; + } + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + // If AVX support is disabled, the Masm API is (rhs, lhsDest) and requires + // AtStart+ReuseInput for the lhs. For type reasons, the rhs will never be + // the same as the lhs and is therefore a plain Use. + // + // If AVX support is enabled, useRegisterAtStart is preferred. 
+ + if (ins->rhs()->type() == MIRType::Int64) { + if (isThreeOpAllowed()) { + auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128( + useRegisterAtStart(ins->lhs()), useInt64RegisterAtStart(ins->rhs())); + define(lir, ins); + } else { + auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128( + useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs())); + defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsDest); + } + } else { + if (isThreeOpAllowed()) { + auto* lir = new (alloc()) LWasmReplaceLaneSimd128( + useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs())); + define(lir, ins); + } else { + auto* lir = new (alloc()) LWasmReplaceLaneSimd128( + useRegisterAtStart(ins->lhs()), useRegister(ins->rhs())); + defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsDest); + } + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + switch (ins->input()->type()) { + case MIRType::Int64: { + // 64-bit integer splats. + // Load-and-(sign|zero)extend. + auto* lir = new (alloc()) + LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input())); + define(lir, ins); + break; + } + case MIRType::Float32: + case MIRType::Double: { + // Floating-point splats. + // Ideally we save a move on SSE systems by reusing the input register, + // but since the input and output register types differ, we can't. + auto* lir = + new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input())); + define(lir, ins); + break; + } + default: { + // 32-bit integer splats. + auto* lir = + new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input())); + define(lir, ins); + break; + } + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { +#ifdef ENABLE_WASM_SIMD + MOZ_ASSERT(ins->input()->type() == MIRType::Simd128); + MOZ_ASSERT(ins->type() == MIRType::Simd128); + + bool useAtStart = false; + bool reuseInput = false; + LDefinition tempReg = LDefinition::BogusTemp(); + switch (ins->simdOp()) { + case wasm::SimdOp::I8x16Neg: + case wasm::SimdOp::I16x8Neg: + case wasm::SimdOp::I32x4Neg: + case wasm::SimdOp::I64x2Neg: + case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: + // Prefer src != dest to avoid an unconditional src->temp move. + MOZ_ASSERT(!reuseInput); + // If AVX is enabled, we prefer useRegisterAtStart. + useAtStart = isThreeOpAllowed(); + break; + case wasm::SimdOp::F32x4Neg: + case wasm::SimdOp::F64x2Neg: + case wasm::SimdOp::F32x4Abs: + case wasm::SimdOp::F64x2Abs: + case wasm::SimdOp::V128Not: + case wasm::SimdOp::F32x4Sqrt: + case wasm::SimdOp::F64x2Sqrt: + case wasm::SimdOp::I8x16Abs: + case wasm::SimdOp::I16x8Abs: + case wasm::SimdOp::I32x4Abs: + case wasm::SimdOp::I64x2Abs: + case wasm::SimdOp::I32x4TruncSatF32x4S: + case wasm::SimdOp::F32x4ConvertI32x4U: + case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U: + case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S: + case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U: + case wasm::SimdOp::I32x4RelaxedTruncF32x4S: + case wasm::SimdOp::I32x4RelaxedTruncF32x4U: + case wasm::SimdOp::I32x4RelaxedTruncF64x2SZero: + case wasm::SimdOp::I32x4RelaxedTruncF64x2UZero: + case wasm::SimdOp::I64x2ExtendHighI32x4S: + case wasm::SimdOp::I64x2ExtendHighI32x4U: + // Prefer src == dest to avoid an unconditional src->dest move + // for better performance in non-AVX mode (e.g. non-PSHUFD use). 
+ useAtStart = true; + reuseInput = !isThreeOpAllowed(); + break; + case wasm::SimdOp::I32x4TruncSatF32x4U: + case wasm::SimdOp::I32x4TruncSatF64x2SZero: + case wasm::SimdOp::I32x4TruncSatF64x2UZero: + case wasm::SimdOp::I8x16Popcnt: + tempReg = tempSimd128(); + // Prefer src == dest to avoid an unconditional src->dest move + // in non-AVX mode. + useAtStart = true; + reuseInput = !isThreeOpAllowed(); + break; + case wasm::SimdOp::I16x8ExtendLowI8x16S: + case wasm::SimdOp::I16x8ExtendHighI8x16S: + case wasm::SimdOp::I16x8ExtendLowI8x16U: + case wasm::SimdOp::I16x8ExtendHighI8x16U: + case wasm::SimdOp::I32x4ExtendLowI16x8S: + case wasm::SimdOp::I32x4ExtendHighI16x8S: + case wasm::SimdOp::I32x4ExtendLowI16x8U: + case wasm::SimdOp::I32x4ExtendHighI16x8U: + case wasm::SimdOp::I64x2ExtendLowI32x4S: + case wasm::SimdOp::I64x2ExtendLowI32x4U: + case wasm::SimdOp::F32x4ConvertI32x4S: + case wasm::SimdOp::F32x4Ceil: + case wasm::SimdOp::F32x4Floor: + case wasm::SimdOp::F32x4Trunc: + case wasm::SimdOp::F32x4Nearest: + case wasm::SimdOp::F64x2Ceil: + case wasm::SimdOp::F64x2Floor: + case wasm::SimdOp::F64x2Trunc: + case wasm::SimdOp::F64x2Nearest: + case wasm::SimdOp::F32x4DemoteF64x2Zero: + case wasm::SimdOp::F64x2PromoteLowF32x4: + case wasm::SimdOp::F64x2ConvertLowI32x4S: + case wasm::SimdOp::F64x2ConvertLowI32x4U: + // Prefer src == dest to exert the lowest register pressure on the + // surrounding code. + useAtStart = true; + MOZ_ASSERT(!reuseInput); + break; + default: + MOZ_CRASH("Unary SimdOp not implemented"); + } + + LUse inputUse = + useAtStart ? useRegisterAtStart(ins->input()) : useRegister(ins->input()); + LWasmUnarySimd128* lir = new (alloc()) LWasmUnarySimd128(inputUse, tempReg); + if (reuseInput) { + defineReuseInput(lir, ins, LWasmUnarySimd128::Src); + } else { + define(lir, ins); + } +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmLoadLaneSimd128(MWasmLoadLaneSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + // A trick: On 32-bit systems, the base pointer is 32 bits (it was bounds + // checked and then chopped). On 64-bit systems, it can be 32 bits or 64 + // bits. Either way, it fits in a GPR so we can ignore the + // Register/Register64 distinction here. +# ifndef JS_64BIT + MOZ_ASSERT(ins->base()->type() == MIRType::Int32); +# endif + LUse base = useRegisterAtStart(ins->base()); + LUse inputUse = useRegisterAtStart(ins->value()); + LAllocation memoryBase = ins->hasMemoryBase() + ? useRegisterAtStart(ins->memoryBase()) + : LAllocation(); + LWasmLoadLaneSimd128* lir = new (alloc()) LWasmLoadLaneSimd128( + base, inputUse, LDefinition::BogusTemp(), memoryBase); + defineReuseInput(lir, ins, LWasmLoadLaneSimd128::Src); +#else + MOZ_CRASH("No SIMD"); +#endif +} + +void LIRGenerator::visitWasmStoreLaneSimd128(MWasmStoreLaneSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + // See comment above. +# ifndef JS_64BIT + MOZ_ASSERT(ins->base()->type() == MIRType::Int32); +# endif + LUse base = useRegisterAtStart(ins->base()); + LUse input = useRegisterAtStart(ins->value()); + LAllocation memoryBase = ins->hasMemoryBase() + ? 
useRegisterAtStart(ins->memoryBase()) + : LAllocation(); + LWasmStoreLaneSimd128* lir = new (alloc()) + LWasmStoreLaneSimd128(base, input, LDefinition::BogusTemp(), memoryBase); + add(lir, ins); +#else + MOZ_CRASH("No SIMD"); +#endif +} + +#ifdef ENABLE_WASM_SIMD + +bool LIRGeneratorX86Shared::canFoldReduceSimd128AndBranch(wasm::SimdOp op) { + switch (op) { + case wasm::SimdOp::V128AnyTrue: + case wasm::SimdOp::I8x16AllTrue: + case wasm::SimdOp::I16x8AllTrue: + case wasm::SimdOp::I32x4AllTrue: + case wasm::SimdOp::I64x2AllTrue: + case wasm::SimdOp::I16x8Bitmask: + return true; + default: + return false; + } +} + +bool LIRGeneratorX86Shared::canEmitWasmReduceSimd128AtUses( + MWasmReduceSimd128* ins) { + if (!ins->canEmitAtUses()) { + return false; + } + // Only specific ops generating int32. + if (ins->type() != MIRType::Int32) { + return false; + } + if (!canFoldReduceSimd128AndBranch(ins->simdOp())) { + return false; + } + // If never used then defer (it will be removed). + MUseIterator iter(ins->usesBegin()); + if (iter == ins->usesEnd()) { + return true; + } + // We require an MTest consumer. + MNode* node = iter->consumer(); + if (!node->isDefinition() || !node->toDefinition()->isTest()) { + return false; + } + // Defer only if there's only one use. + iter++; + return iter == ins->usesEnd(); +} + +#endif // ENABLE_WASM_SIMD + +void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) { +#ifdef ENABLE_WASM_SIMD + if (canEmitWasmReduceSimd128AtUses(ins)) { + emitAtUses(ins); + return; + } + + // Reductions (any_true, all_true, bitmask, extract_lane) uniformly prefer + // useRegisterAtStart: + // + // - In most cases, the input type differs from the output type, so there's no + // conflict and it doesn't really matter. + // + // - For extract_lane(0) on F32x4 and F64x2, input == output results in zero + // code being generated. + // + // - For extract_lane(k > 0) on F32x4 and F64x2, allowing the input register + // to be targeted lowers register pressure if it's the last use of the + // input. + + if (ins->type() == MIRType::Int64) { + auto* lir = new (alloc()) + LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input())); + defineInt64(lir, ins); + } else { + // Ideally we would reuse the input register for floating extract_lane if + // the lane is zero, but constraints in the register allocator require the + // input and output register types to be the same. + auto* lir = new (alloc()) LWasmReduceSimd128( + useRegisterAtStart(ins->input()), LDefinition::BogusTemp()); + define(lir, ins); + } +#else + MOZ_CRASH("No SIMD"); +#endif +} |
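// [Editor's note, not part of the upstream diff] Two facts the reduction
// lowering above relies on, made explicit as a hedged sketch (requires
// <immintrin.h>; names are illustrative):
//
//   // extract_lane 0 of an F32x4/F64x2 value is just the low scalar of the
//   // xmm register, so when input and output share a register no code is
//   // emitted:
//   double extractLane0(__m128d v) { return _mm_cvtsd_f64(v); }
//
//   // any_true/all_true/bitmask whose only consumer is a test can set the
//   // condition codes for the branch directly (e.g. via PTEST/PMOVMSKB)
//   // instead of materializing a 0/1 value in an integer register.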