/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/x86-shared/Lowering-x86-shared.h"

#include "mozilla/MathAlgorithms.h"

#include "jit/Lowering.h"
#include "jit/MIR.h"

#include "jit/shared/Lowering-shared-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::Abs;
using mozilla::FloorLog2;
using mozilla::Maybe;
using mozilla::Nothing;
using mozilla::Some;

LTableSwitch* LIRGeneratorX86Shared::newLTableSwitch(
    const LAllocation& in, const LDefinition& inputCopy,
    MTableSwitch* tableswitch) {
  return new (alloc()) LTableSwitch(in, inputCopy, temp(), tableswitch);
}

LTableSwitchV* LIRGeneratorX86Shared::newLTableSwitchV(
    MTableSwitch* tableswitch) {
  return new (alloc()) LTableSwitchV(useBox(tableswitch->getOperand(0)), temp(),
                                     tempDouble(), temp(), tableswitch);
}

void LIRGenerator::visitPowHalf(MPowHalf* ins) {
  MDefinition* input = ins->input();
  MOZ_ASSERT(input->type() == MIRType::Double);
  LPowHalfD* lir = new (alloc()) LPowHalfD(useRegisterAtStart(input));
  define(lir, ins);
}

void LIRGeneratorX86Shared::lowerForShift(LInstructionHelper<1, 2, 0>* ins,
                                          MDefinition* mir, MDefinition* lhs,
                                          MDefinition* rhs) {
  ins->setOperand(0, useRegisterAtStart(lhs));

  // Shift operand should be constant or, unless BMI2 is available, in register
  // ecx. x86 can't shift a non-ecx register.
  if (rhs->isConstant()) {
    ins->setOperand(1, useOrConstantAtStart(rhs));
  } else if (Assembler::HasBMI2() && !mir->isRotate()) {
    ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs)
                           ? useRegister(rhs)
                           : useRegisterAtStart(rhs));
  } else {
    ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs)
                           ? useFixed(rhs, ecx)
                           : useFixedAtStart(rhs, ecx));
  }

  defineReuseInput(ins, mir, 0);
}

template <size_t Temps>
void LIRGeneratorX86Shared::lowerForShiftInt64(
    LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, Temps>* ins,
    MDefinition* mir, MDefinition* lhs, MDefinition* rhs) {
  ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
#if defined(JS_NUNBOX32)
  if (mir->isRotate()) {
    ins->setTemp(0, temp());
  }
#endif

  static_assert(LShiftI64::Rhs == INT64_PIECES,
                "Assume Rhs is located at INT64_PIECES.");
  static_assert(LRotateI64::Count == INT64_PIECES,
                "Assume Count is located at INT64_PIECES.");

  // Shift operand should be constant or, unless BMI2 is available, in register
  // ecx. x86 can't shift a non-ecx register.
  if (rhs->isConstant()) {
    ins->setOperand(INT64_PIECES, useOrConstantAtStart(rhs));
#ifdef JS_CODEGEN_X64
  } else if (Assembler::HasBMI2() && !mir->isRotate()) {
    ins->setOperand(INT64_PIECES, useRegister(rhs));
#endif
  } else {
    // The operands are int64, but we only care about the lower 32 bits of
    // the RHS. On 32-bit, the code below will load that part in ecx and
    // will discard the upper half.
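    // What follows is, roughly, a hand-rolled useFixed(rhs, ecx) restricted to
    // that low word: ensureDefined() guarantees rhs has a virtual register,
    // and the LUse built below pins that register to ecx.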
    ensureDefined(rhs);
    LUse use(ecx);
    use.setVirtualRegister(rhs->virtualRegister());
    ins->setOperand(INT64_PIECES, use);
  }

  defineInt64ReuseInput(ins, mir, 0);
}

template void LIRGeneratorX86Shared::lowerForShiftInt64(
    LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 0>* ins,
    MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
template void LIRGeneratorX86Shared::lowerForShiftInt64(
    LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 1>* ins,
    MDefinition* mir, MDefinition* lhs, MDefinition* rhs);

void LIRGeneratorX86Shared::lowerForCompareI64AndBranch(
    MTest* mir, MCompare* comp, JSOp op, MDefinition* left, MDefinition* right,
    MBasicBlock* ifTrue, MBasicBlock* ifFalse) {
  auto* lir = new (alloc())
      LCompareI64AndBranch(comp, op, useInt64Register(left),
                           useInt64OrConstant(right), ifTrue, ifFalse);
  add(lir, mir);
}

void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 1, 0>* ins,
                                        MDefinition* mir, MDefinition* input) {
  ins->setOperand(0, useRegisterAtStart(input));
  defineReuseInput(ins, mir, 0);
}

void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 2, 0>* ins,
                                        MDefinition* mir, MDefinition* lhs,
                                        MDefinition* rhs) {
  ins->setOperand(0, useRegisterAtStart(lhs));
  ins->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs)
                         ? useOrConstant(rhs)
                         : useOrConstantAtStart(rhs));
  defineReuseInput(ins, mir, 0);
}

template <size_t Temps>
void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, Temps>* ins,
                                        MDefinition* mir, MDefinition* lhs,
                                        MDefinition* rhs) {
  // Without AVX, we'll need to use the x86 encodings where one of the
  // inputs must be the same location as the output.
  if (!Assembler::HasAVX()) {
    ins->setOperand(0, useRegisterAtStart(lhs));
    ins->setOperand(
        1, willHaveDifferentLIRNodes(lhs, rhs) ? use(rhs) : useAtStart(rhs));
    defineReuseInput(ins, mir, 0);
  } else {
    ins->setOperand(0, useRegisterAtStart(lhs));
    ins->setOperand(1, useAtStart(rhs));
    define(ins, mir);
  }
}

template void LIRGeneratorX86Shared::lowerForFPU(
    LInstructionHelper<1, 2, 0>* ins, MDefinition* mir, MDefinition* lhs,
    MDefinition* rhs);
template void LIRGeneratorX86Shared::lowerForFPU(
    LInstructionHelper<1, 2, 1>* ins, MDefinition* mir, MDefinition* lhs,
    MDefinition* rhs);

void LIRGeneratorX86Shared::lowerForBitAndAndBranch(LBitAndAndBranch* baab,
                                                    MInstruction* mir,
                                                    MDefinition* lhs,
                                                    MDefinition* rhs) {
  baab->setOperand(0, useRegisterAtStart(lhs));
  baab->setOperand(1, useRegisterOrConstantAtStart(rhs));
  add(baab, mir);
}

void LIRGeneratorX86Shared::lowerNegI(MInstruction* ins, MDefinition* input) {
  defineReuseInput(new (alloc()) LNegI(useRegisterAtStart(input)), ins, 0);
}

void LIRGeneratorX86Shared::lowerNegI64(MInstruction* ins, MDefinition* input) {
  defineInt64ReuseInput(new (alloc()) LNegI64(useInt64RegisterAtStart(input)),
                        ins, 0);
}

void LIRGenerator::visitAbs(MAbs* ins) {
  defineReuseInput(allocateAbs(ins, useRegisterAtStart(ins->input())), ins, 0);
}

void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs,
                                      MDefinition* rhs) {
  // Note: If we need a negative zero check, lhs is used twice.
  LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation();
  LMulI* lir = new (alloc())
      LMulI(useRegisterAtStart(lhs),
            willHaveDifferentLIRNodes(lhs, rhs) ? useOrConstant(rhs)
                                                : useOrConstantAtStart(rhs),
            lhsCopy);
  if (mul->fallible()) {
    assignSnapshot(lir, mul->bailoutKind());
  }
  defineReuseInput(lir, mul, 0);
}

void LIRGeneratorX86Shared::lowerDivI(MDiv* div) {
  if (div->isUnsigned()) {
    lowerUDiv(div);
    return;
  }

  // Division instructions are slow. Division by constant denominators can be
  // rewritten to use other instructions.
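  // For example, x / 8 lowers to an arithmetic shift by FloorLog2(8) == 3
  // (LDivPowTwoI below, with a fixup to round negative dividends toward zero),
  // and x / 10 lowers to a multiply by a precomputed reciprocal
  // (LDivOrModConstantI); both avoid idiv.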
if (div->rhs()->isConstant()) { int32_t rhs = div->rhs()->toConstant()->toInt32(); // Division by powers of two can be done by shifting, and division by // other numbers can be done by a reciprocal multiplication technique. int32_t shift = FloorLog2(Abs(rhs)); if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) { LAllocation lhs = useRegisterAtStart(div->lhs()); LDivPowTwoI* lir; // When truncated with maybe a non-zero remainder, we have to round the // result toward 0. This requires an extra register to round up/down // whether the left-hand-side is signed. bool needRoundNeg = div->canBeNegativeDividend() && div->isTruncated(); if (!needRoundNeg) { // Numerator is unsigned, so does not need adjusting. lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, rhs < 0); } else { // Numerator might be signed, and needs adjusting, and an extra lhs copy // is needed to round the result of the integer division towards zero. lir = new (alloc()) LDivPowTwoI(lhs, useRegister(div->lhs()), shift, rhs < 0); } if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineReuseInput(lir, div, 0); return; } if (rhs != 0) { LDivOrModConstantI* lir; lir = new (alloc()) LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax)); if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineFixed(lir, div, LAllocation(AnyRegister(edx))); return; } } LDivI* lir = new (alloc()) LDivI(useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx)); if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineFixed(lir, div, LAllocation(AnyRegister(eax))); } void LIRGeneratorX86Shared::lowerModI(MMod* mod) { if (mod->isUnsigned()) { lowerUMod(mod); return; } if (mod->rhs()->isConstant()) { int32_t rhs = mod->rhs()->toConstant()->toInt32(); int32_t shift = FloorLog2(Abs(rhs)); if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) { LModPowTwoI* lir = new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineReuseInput(lir, mod, 0); return; } if (rhs != 0) { LDivOrModConstantI* lir; lir = new (alloc()) LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx)); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineFixed(lir, mod, LAllocation(AnyRegister(eax))); return; } } LModI* lir = new (alloc()) LModI(useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax)); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineFixed(lir, mod, LAllocation(AnyRegister(edx))); } void LIRGenerator::visitWasmNeg(MWasmNeg* ins) { switch (ins->type()) { case MIRType::Int32: defineReuseInput(new (alloc()) LNegI(useRegisterAtStart(ins->input())), ins, 0); break; case MIRType::Float32: defineReuseInput(new (alloc()) LNegF(useRegisterAtStart(ins->input())), ins, 0); break; case MIRType::Double: defineReuseInput(new (alloc()) LNegD(useRegisterAtStart(ins->input())), ins, 0); break; default: MOZ_CRASH(); } } void LIRGeneratorX86Shared::lowerWasmSelectI(MWasmSelect* select) { auto* lir = new (alloc()) LWasmSelect(useRegisterAtStart(select->trueExpr()), useAny(select->falseExpr()), useRegister(select->condExpr())); defineReuseInput(lir, select, LWasmSelect::TrueExprIndex); } void LIRGeneratorX86Shared::lowerWasmSelectI64(MWasmSelect* select) { auto* lir = new (alloc()) LWasmSelectI64( useInt64RegisterAtStart(select->trueExpr()), useInt64(select->falseExpr()), useRegister(select->condExpr())); defineInt64ReuseInput(lir, select, LWasmSelectI64::TrueExprIndex); } void 
LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) { MDefinition* base = ins->base(); MOZ_ASSERT(base->type() == MIRType::Int32); MDefinition* boundsCheckLimit = ins->boundsCheckLimit(); MOZ_ASSERT_IF(ins->needsBoundsCheck(), boundsCheckLimit->type() == MIRType::Int32); // For simplicity, require a register if we're going to emit a bounds-check // branch, so that we don't have special cases for constants. This should // only happen in rare constant-folding cases since asm.js sets the minimum // heap size based when accessed via constant. LAllocation baseAlloc = ins->needsBoundsCheck() ? useRegisterAtStart(base) : useRegisterOrZeroAtStart(base); LAllocation limitAlloc = ins->needsBoundsCheck() ? useRegisterAtStart(boundsCheckLimit) : LAllocation(); LAllocation memoryBaseAlloc = ins->hasMemoryBase() ? useRegisterAtStart(ins->memoryBase()) : LAllocation(); auto* lir = new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, memoryBaseAlloc); define(lir, ins); } void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) { MDefinition* base = ins->base(); MOZ_ASSERT(base->type() == MIRType::Int32); MDefinition* boundsCheckLimit = ins->boundsCheckLimit(); MOZ_ASSERT_IF(ins->needsBoundsCheck(), boundsCheckLimit->type() == MIRType::Int32); // For simplicity, require a register if we're going to emit a bounds-check // branch, so that we don't have special cases for constants. This should // only happen in rare constant-folding cases since asm.js sets the minimum // heap size based when accessed via constant. LAllocation baseAlloc = ins->needsBoundsCheck() ? useRegisterAtStart(base) : useRegisterOrZeroAtStart(base); LAllocation limitAlloc = ins->needsBoundsCheck() ? useRegisterAtStart(boundsCheckLimit) : LAllocation(); LAllocation memoryBaseAlloc = ins->hasMemoryBase() ? useRegisterAtStart(ins->memoryBase()) : LAllocation(); LAsmJSStoreHeap* lir = nullptr; switch (ins->access().type()) { case Scalar::Int8: case Scalar::Uint8: #ifdef JS_CODEGEN_X86 // See comment for LIRGeneratorX86::useByteOpRegister. lir = new (alloc()) LAsmJSStoreHeap( baseAlloc, useFixed(ins->value(), eax), limitAlloc, memoryBaseAlloc); break; #endif case Scalar::Int16: case Scalar::Uint16: case Scalar::Int32: case Scalar::Uint32: case Scalar::Float32: case Scalar::Float64: // For now, don't allow constant values. The immediate operand affects // instruction layout which affects patching. lir = new (alloc()) LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()), limitAlloc, memoryBaseAlloc); break; case Scalar::Int64: case Scalar::Simd128: MOZ_CRASH("NYI"); case Scalar::Uint8Clamped: case Scalar::BigInt64: case Scalar::BigUint64: case Scalar::MaxTypedArrayViewType: MOZ_CRASH("unexpected array type"); } add(lir, ins); } void LIRGeneratorX86Shared::lowerUDiv(MDiv* div) { if (div->rhs()->isConstant()) { // NOTE: the result of toInt32 is coerced to uint32_t. 
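    // (e.g. an MConstant holding INT32_MIN comes back as 0x80000000u here,
    // which is the value the unsigned division actually sees.)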
uint32_t rhs = div->rhs()->toConstant()->toInt32(); int32_t shift = FloorLog2(rhs); LAllocation lhs = useRegisterAtStart(div->lhs()); if (rhs != 0 && uint32_t(1) << shift == rhs) { LDivPowTwoI* lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, false); if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineReuseInput(lir, div, 0); } else { LUDivOrModConstant* lir = new (alloc()) LUDivOrModConstant(useRegister(div->lhs()), rhs, tempFixed(eax)); if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineFixed(lir, div, LAllocation(AnyRegister(edx))); } return; } LUDivOrMod* lir = new (alloc()) LUDivOrMod( useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx)); if (div->fallible()) { assignSnapshot(lir, div->bailoutKind()); } defineFixed(lir, div, LAllocation(AnyRegister(eax))); } void LIRGeneratorX86Shared::lowerUMod(MMod* mod) { if (mod->rhs()->isConstant()) { uint32_t rhs = mod->rhs()->toConstant()->toInt32(); int32_t shift = FloorLog2(rhs); if (rhs != 0 && uint32_t(1) << shift == rhs) { LModPowTwoI* lir = new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineReuseInput(lir, mod, 0); } else { LUDivOrModConstant* lir = new (alloc()) LUDivOrModConstant(useRegister(mod->lhs()), rhs, tempFixed(edx)); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineFixed(lir, mod, LAllocation(AnyRegister(eax))); } return; } LUDivOrMod* lir = new (alloc()) LUDivOrMod( useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax)); if (mod->fallible()) { assignSnapshot(lir, mod->bailoutKind()); } defineFixed(lir, mod, LAllocation(AnyRegister(edx))); } void LIRGeneratorX86Shared::lowerUrshD(MUrsh* mir) { MDefinition* lhs = mir->lhs(); MDefinition* rhs = mir->rhs(); MOZ_ASSERT(lhs->type() == MIRType::Int32); MOZ_ASSERT(rhs->type() == MIRType::Int32); MOZ_ASSERT(mir->type() == MIRType::Double); #ifdef JS_CODEGEN_X64 static_assert(ecx == rcx); #endif // Without BMI2, x86 can only shift by ecx. LUse lhsUse = useRegisterAtStart(lhs); LAllocation rhsAlloc; if (rhs->isConstant()) { rhsAlloc = useOrConstant(rhs); } else if (Assembler::HasBMI2()) { rhsAlloc = useRegister(rhs); } else { rhsAlloc = useFixed(rhs, ecx); } LUrshD* lir = new (alloc()) LUrshD(lhsUse, rhsAlloc, tempCopy(lhs, 0)); define(lir, mir); } void LIRGeneratorX86Shared::lowerPowOfTwoI(MPow* mir) { int32_t base = mir->input()->toConstant()->toInt32(); MDefinition* power = mir->power(); // Shift operand should be in register ecx, unless BMI2 is available. // x86 can't shift a non-ecx register. LAllocation powerAlloc = Assembler::HasBMI2() ? useRegister(power) : useFixed(power, ecx); auto* lir = new (alloc()) LPowOfTwoI(powerAlloc, base); assignSnapshot(lir, mir->bailoutKind()); define(lir, mir); } void LIRGeneratorX86Shared::lowerBigIntLsh(MBigIntLsh* ins) { // Shift operand should be in register ecx, unless BMI2 is available. // x86 can't shift a non-ecx register. LDefinition shiftAlloc = Assembler::HasBMI2() ? temp() : tempFixed(ecx); auto* lir = new (alloc()) LBigIntLsh(useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), shiftAlloc, temp()); define(lir, ins); assignSafepoint(lir, ins); } void LIRGeneratorX86Shared::lowerBigIntRsh(MBigIntRsh* ins) { // Shift operand should be in register ecx, unless BMI2 is available. // x86 can't shift a non-ecx register. LDefinition shiftAlloc = Assembler::HasBMI2() ? 
temp() : tempFixed(ecx); auto* lir = new (alloc()) LBigIntRsh(useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), shiftAlloc, temp()); define(lir, ins); assignSafepoint(lir, ins); } void LIRGeneratorX86Shared::lowerWasmBuiltinTruncateToInt32( MWasmBuiltinTruncateToInt32* ins) { MDefinition* opd = ins->input(); MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32); LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble(); if (opd->type() == MIRType::Double) { define(new (alloc()) LWasmBuiltinTruncateDToInt32( useRegister(opd), useFixed(ins->instance(), InstanceReg), maybeTemp), ins); return; } define( new (alloc()) LWasmBuiltinTruncateFToInt32( useRegister(opd), useFixed(ins->instance(), InstanceReg), maybeTemp), ins); } void LIRGeneratorX86Shared::lowerTruncateDToInt32(MTruncateToInt32* ins) { MDefinition* opd = ins->input(); MOZ_ASSERT(opd->type() == MIRType::Double); LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble(); define(new (alloc()) LTruncateDToInt32(useRegister(opd), maybeTemp), ins); } void LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32* ins) { MDefinition* opd = ins->input(); MOZ_ASSERT(opd->type() == MIRType::Float32); LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32(); define(new (alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins); } void LIRGeneratorX86Shared::lowerCompareExchangeTypedArrayElement( MCompareExchangeTypedArrayElement* ins, bool useI386ByteRegisters) { MOZ_ASSERT(ins->arrayType() != Scalar::Float32); MOZ_ASSERT(ins->arrayType() != Scalar::Float64); MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); const LUse elements = useRegister(ins->elements()); const LAllocation index = useRegisterOrIndexConstant(ins->index(), ins->arrayType()); // If the target is a floating register then we need a temp at the // lower level; that temp must be eax. // // Otherwise the target (if used) is an integer register, which // must be eax. If the target is not used the machine code will // still clobber eax, so just pretend it's used. // // oldval must be in a register. // // newval must be in a register. If the source is a byte array // then newval must be a register that has a byte size: on x86 // this must be ebx, ecx, or edx (eax is taken for the output). // // Bug #1077036 describes some further optimization opportunities. 
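  //
  // Roughly (not the exact masm output), the non-float path is:
  //
  //    movl oldval, eax
  //    lock cmpxchg newval, mem   ; compares eax with mem, and eax ends up
  //                               ; holding the old memory value either way
  //
  // which is why the output (or the temp, in the float case) must be eax.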
bool fixedOutput = false; LDefinition tempDef = LDefinition::BogusTemp(); LAllocation newval; if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) { tempDef = tempFixed(eax); newval = useRegister(ins->newval()); } else { fixedOutput = true; if (useI386ByteRegisters && ins->isByteArray()) { newval = useFixed(ins->newval(), ebx); } else { newval = useRegister(ins->newval()); } } const LAllocation oldval = useRegister(ins->oldval()); LCompareExchangeTypedArrayElement* lir = new (alloc()) LCompareExchangeTypedArrayElement(elements, index, oldval, newval, tempDef); if (fixedOutput) { defineFixed(lir, ins, LAllocation(AnyRegister(eax))); } else { define(lir, ins); } } void LIRGeneratorX86Shared::lowerAtomicExchangeTypedArrayElement( MAtomicExchangeTypedArrayElement* ins, bool useI386ByteRegisters) { MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32); MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); const LUse elements = useRegister(ins->elements()); const LAllocation index = useRegisterOrIndexConstant(ins->index(), ins->arrayType()); const LAllocation value = useRegister(ins->value()); // The underlying instruction is XCHG, which can operate on any // register. // // If the target is a floating register (for Uint32) then we need // a temp into which to exchange. // // If the source is a byte array then we need a register that has // a byte size; in this case -- on x86 only -- pin the output to // an appropriate register and use that as a temp in the back-end. LDefinition tempDef = LDefinition::BogusTemp(); if (ins->arrayType() == Scalar::Uint32) { MOZ_ASSERT(ins->type() == MIRType::Double); tempDef = temp(); } LAtomicExchangeTypedArrayElement* lir = new (alloc()) LAtomicExchangeTypedArrayElement(elements, index, value, tempDef); if (useI386ByteRegisters && ins->isByteArray()) { defineFixed(lir, ins, LAllocation(AnyRegister(eax))); } else { define(lir, ins); } } void LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop( MAtomicTypedArrayElementBinop* ins, bool useI386ByteRegisters) { MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped); MOZ_ASSERT(ins->arrayType() != Scalar::Float32); MOZ_ASSERT(ins->arrayType() != Scalar::Float64); MOZ_ASSERT(ins->elements()->type() == MIRType::Elements); MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr); const LUse elements = useRegister(ins->elements()); const LAllocation index = useRegisterOrIndexConstant(ins->index(), ins->arrayType()); // Case 1: the result of the operation is not used. // // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND, // LOCK OR, or LOCK XOR. We can do this even for the Uint32 case. if (ins->isForEffect()) { LAllocation value; if (useI386ByteRegisters && ins->isByteArray() && !ins->value()->isConstant()) { value = useFixed(ins->value(), ebx); } else { value = useRegisterOrConstant(ins->value()); } LAtomicTypedArrayElementBinopForEffect* lir = new (alloc()) LAtomicTypedArrayElementBinopForEffect(elements, index, value); add(lir, ins); return; } // Case 2: the result of the operation is used. // // For ADD and SUB we'll use XADD: // // movl src, output // lock xaddl output, mem // // For the 8-bit variants XADD needs a byte register for the output. 
// // For AND/OR/XOR we need to use a CMPXCHG loop: // // movl *mem, eax // L: mov eax, temp // andl src, temp // lock cmpxchg temp, mem ; reads eax also // jnz L // ; result in eax // // Note the placement of L, cmpxchg will update eax with *mem if // *mem does not have the expected value, so reloading it at the // top of the loop would be redundant. // // If the array is not a uint32 array then: // - eax should be the output (one result of the cmpxchg) // - there is a temp, which must have a byte register if // the array has 1-byte elements elements // // If the array is a uint32 array then: // - eax is the first temp // - we also need a second temp // // There are optimization opportunities: // - better register allocation in the x86 8-bit case, Bug #1077036. bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp); bool fixedOutput = true; bool reuseInput = false; LDefinition tempDef1 = LDefinition::BogusTemp(); LDefinition tempDef2 = LDefinition::BogusTemp(); LAllocation value; if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) { value = useRegisterOrConstant(ins->value()); fixedOutput = false; if (bitOp) { tempDef1 = tempFixed(eax); tempDef2 = temp(); } else { tempDef1 = temp(); } } else if (useI386ByteRegisters && ins->isByteArray()) { if (ins->value()->isConstant()) { value = useRegisterOrConstant(ins->value()); } else { value = useFixed(ins->value(), ebx); } if (bitOp) { tempDef1 = tempFixed(ecx); } } else if (bitOp) { value = useRegisterOrConstant(ins->value()); tempDef1 = temp(); } else if (ins->value()->isConstant()) { fixedOutput = false; value = useRegisterOrConstant(ins->value()); } else { fixedOutput = false; reuseInput = true; value = useRegisterAtStart(ins->value()); } LAtomicTypedArrayElementBinop* lir = new (alloc()) LAtomicTypedArrayElementBinop(elements, index, value, tempDef1, tempDef2); if (fixedOutput) { defineFixed(lir, ins, LAllocation(AnyRegister(eax))); } else if (reuseInput) { defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp); } else { define(lir, ins); } } void LIRGenerator::visitCopySign(MCopySign* ins) { MDefinition* lhs = ins->lhs(); MDefinition* rhs = ins->rhs(); MOZ_ASSERT(IsFloatingPointType(lhs->type())); MOZ_ASSERT(lhs->type() == rhs->type()); MOZ_ASSERT(lhs->type() == ins->type()); LInstructionHelper<1, 2, 2>* lir; if (lhs->type() == MIRType::Double) { lir = new (alloc()) LCopySignD(); } else { lir = new (alloc()) LCopySignF(); } // As lowerForFPU, but we want rhs to be in a FP register too. lir->setOperand(0, useRegisterAtStart(lhs)); if (!Assembler::HasAVX()) { lir->setOperand(1, willHaveDifferentLIRNodes(lhs, rhs) ? useRegister(rhs) : useRegisterAtStart(rhs)); defineReuseInput(lir, ins, 0); } else { lir->setOperand(1, useRegisterAtStart(rhs)); define(lir, ins); } } // These lowerings are really x86-shared but some Masm APIs are not yet // available on x86. // Ternary and binary operators require the dest register to be the same as // their first input register, leading to a pattern of useRegisterAtStart + // defineReuseInput. void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) { #ifdef ENABLE_WASM_SIMD MOZ_ASSERT(ins->v0()->type() == MIRType::Simd128); MOZ_ASSERT(ins->v1()->type() == MIRType::Simd128); MOZ_ASSERT(ins->v2()->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); switch (ins->simdOp()) { case wasm::SimdOp::V128Bitselect: { // Enforcing lhs == output avoids one setup move. 
We would like to also // enforce merging the control with the temp (with // usRegisterAtStart(control) and tempCopy()), but the register allocator // ignores those constraints at present. auto* lir = new (alloc()) LWasmTernarySimd128( ins->simdOp(), useRegisterAtStart(ins->v0()), useRegister(ins->v1()), useRegister(ins->v2()), tempSimd128()); defineReuseInput(lir, ins, LWasmTernarySimd128::V0); break; } case wasm::SimdOp::F32x4RelaxedFma: case wasm::SimdOp::F32x4RelaxedFnma: case wasm::SimdOp::F64x2RelaxedFma: case wasm::SimdOp::F64x2RelaxedFnma: { auto* lir = new (alloc()) LWasmTernarySimd128( ins->simdOp(), useRegister(ins->v0()), useRegister(ins->v1()), useRegisterAtStart(ins->v2())); defineReuseInput(lir, ins, LWasmTernarySimd128::V2); break; } case wasm::SimdOp::I32x4DotI8x16I7x16AddS: { auto* lir = new (alloc()) LWasmTernarySimd128( ins->simdOp(), useRegister(ins->v0()), useRegister(ins->v1()), useRegisterAtStart(ins->v2())); defineReuseInput(lir, ins, LWasmTernarySimd128::V2); break; } case wasm::SimdOp::I8x16RelaxedLaneSelect: case wasm::SimdOp::I16x8RelaxedLaneSelect: case wasm::SimdOp::I32x4RelaxedLaneSelect: case wasm::SimdOp::I64x2RelaxedLaneSelect: { if (Assembler::HasAVX()) { auto* lir = new (alloc()) LWasmTernarySimd128( ins->simdOp(), useRegisterAtStart(ins->v0()), useRegisterAtStart(ins->v1()), useRegisterAtStart(ins->v2())); define(lir, ins); } else { auto* lir = new (alloc()) LWasmTernarySimd128( ins->simdOp(), useRegister(ins->v0()), useRegisterAtStart(ins->v1()), useFixed(ins->v2(), vmm0)); defineReuseInput(lir, ins, LWasmTernarySimd128::V1); } break; } default: MOZ_CRASH("NYI"); } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) { #ifdef ENABLE_WASM_SIMD MDefinition* lhs = ins->lhs(); MDefinition* rhs = ins->rhs(); wasm::SimdOp op = ins->simdOp(); MOZ_ASSERT(lhs->type() == MIRType::Simd128); MOZ_ASSERT(rhs->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); // Note MWasmBinarySimd128::foldsTo has already specialized operations that // have a constant operand, so this takes care of more general cases of // reordering, see ReorderCommutative. if (ins->isCommutative()) { ReorderCommutative(&lhs, &rhs, ins); } // Swap operands and change operation if necessary, these are all x86/x64 // dependent transformations. Except where noted, this is about avoiding // unnecessary moves and fixups in the code generator macros. bool swap = false; switch (op) { case wasm::SimdOp::V128AndNot: { // Code generation requires the operands to be reversed. 
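      // (x86's PANDN computes ~dst & src, i.e. it complements its destination
      // operand, while wasm's v128.andnot is lhs & ~rhs, so the operands must
      // be swapped.)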
swap = true; break; } case wasm::SimdOp::I8x16LtS: { swap = true; op = wasm::SimdOp::I8x16GtS; break; } case wasm::SimdOp::I8x16GeS: { swap = true; op = wasm::SimdOp::I8x16LeS; break; } case wasm::SimdOp::I16x8LtS: { swap = true; op = wasm::SimdOp::I16x8GtS; break; } case wasm::SimdOp::I16x8GeS: { swap = true; op = wasm::SimdOp::I16x8LeS; break; } case wasm::SimdOp::I32x4LtS: { swap = true; op = wasm::SimdOp::I32x4GtS; break; } case wasm::SimdOp::I32x4GeS: { swap = true; op = wasm::SimdOp::I32x4LeS; break; } case wasm::SimdOp::F32x4Gt: { swap = true; op = wasm::SimdOp::F32x4Lt; break; } case wasm::SimdOp::F32x4Ge: { swap = true; op = wasm::SimdOp::F32x4Le; break; } case wasm::SimdOp::F64x2Gt: { swap = true; op = wasm::SimdOp::F64x2Lt; break; } case wasm::SimdOp::F64x2Ge: { swap = true; op = wasm::SimdOp::F64x2Le; break; } case wasm::SimdOp::F32x4PMin: case wasm::SimdOp::F32x4PMax: case wasm::SimdOp::F64x2PMin: case wasm::SimdOp::F64x2PMax: { // Code generation requires the operations to be reversed (the rhs is the // output register). swap = true; break; } default: break; } if (swap) { MDefinition* tmp = lhs; lhs = rhs; rhs = tmp; } // Allocate temp registers LDefinition tempReg0 = LDefinition::BogusTemp(); LDefinition tempReg1 = LDefinition::BogusTemp(); switch (op) { case wasm::SimdOp::I64x2Mul: tempReg0 = tempSimd128(); break; case wasm::SimdOp::F32x4Min: case wasm::SimdOp::F32x4Max: case wasm::SimdOp::F64x2Min: case wasm::SimdOp::F64x2Max: tempReg0 = tempSimd128(); tempReg1 = tempSimd128(); break; case wasm::SimdOp::I64x2LtS: case wasm::SimdOp::I64x2GtS: case wasm::SimdOp::I64x2LeS: case wasm::SimdOp::I64x2GeS: // The compareForOrderingInt64x2AVX implementation does not require // temps but needs SSE4.2 support. Checking if both AVX and SSE4.2 // are enabled. if (!(Assembler::HasAVX() && Assembler::HasSSE42())) { tempReg0 = tempSimd128(); tempReg1 = tempSimd128(); } break; default: break; } // For binary ops, without AVX support, the Masm API always is usually // (rhs, lhsDest) and requires AtStart+ReuseInput for the lhs. // // For a few ops, the API is actually (rhsDest, lhs) and the rules are the // same but the reversed. We swapped operands above; they will be swapped // again in the code generator to emit the right code. // // If AVX support is enabled, some binary ops can use output as destination, // useRegisterAtStart is applied for both operands and no need for ReuseInput. 
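  //
  // Illustrative (standard SSE vs. AVX forms), e.g. for i32x4.add:
  //
  //    paddd rhs, lhsDest        ; SSE: two operands, the lhs register is
  //                              ; clobbered, hence AtStart+ReuseInput
  //    vpaddd rhs, lhs, dest     ; AVX: three operands, output can be fresh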
switch (op) { case wasm::SimdOp::I8x16AvgrU: case wasm::SimdOp::I16x8AvgrU: case wasm::SimdOp::I8x16Add: case wasm::SimdOp::I8x16AddSatS: case wasm::SimdOp::I8x16AddSatU: case wasm::SimdOp::I8x16Sub: case wasm::SimdOp::I8x16SubSatS: case wasm::SimdOp::I8x16SubSatU: case wasm::SimdOp::I16x8Mul: case wasm::SimdOp::I16x8MinS: case wasm::SimdOp::I16x8MinU: case wasm::SimdOp::I16x8MaxS: case wasm::SimdOp::I16x8MaxU: case wasm::SimdOp::I32x4Add: case wasm::SimdOp::I32x4Sub: case wasm::SimdOp::I32x4Mul: case wasm::SimdOp::I32x4MinS: case wasm::SimdOp::I32x4MinU: case wasm::SimdOp::I32x4MaxS: case wasm::SimdOp::I32x4MaxU: case wasm::SimdOp::I64x2Add: case wasm::SimdOp::I64x2Sub: case wasm::SimdOp::I64x2Mul: case wasm::SimdOp::F32x4Add: case wasm::SimdOp::F32x4Sub: case wasm::SimdOp::F32x4Mul: case wasm::SimdOp::F32x4Div: case wasm::SimdOp::F64x2Add: case wasm::SimdOp::F64x2Sub: case wasm::SimdOp::F64x2Mul: case wasm::SimdOp::F64x2Div: case wasm::SimdOp::F32x4Eq: case wasm::SimdOp::F32x4Ne: case wasm::SimdOp::F32x4Lt: case wasm::SimdOp::F32x4Le: case wasm::SimdOp::F64x2Eq: case wasm::SimdOp::F64x2Ne: case wasm::SimdOp::F64x2Lt: case wasm::SimdOp::F64x2Le: case wasm::SimdOp::F32x4PMin: case wasm::SimdOp::F32x4PMax: case wasm::SimdOp::F64x2PMin: case wasm::SimdOp::F64x2PMax: case wasm::SimdOp::I8x16Swizzle: case wasm::SimdOp::I8x16RelaxedSwizzle: case wasm::SimdOp::I8x16Eq: case wasm::SimdOp::I8x16Ne: case wasm::SimdOp::I8x16GtS: case wasm::SimdOp::I8x16LeS: case wasm::SimdOp::I8x16LtU: case wasm::SimdOp::I8x16GtU: case wasm::SimdOp::I8x16LeU: case wasm::SimdOp::I8x16GeU: case wasm::SimdOp::I16x8Eq: case wasm::SimdOp::I16x8Ne: case wasm::SimdOp::I16x8GtS: case wasm::SimdOp::I16x8LeS: case wasm::SimdOp::I16x8LtU: case wasm::SimdOp::I16x8GtU: case wasm::SimdOp::I16x8LeU: case wasm::SimdOp::I16x8GeU: case wasm::SimdOp::I32x4Eq: case wasm::SimdOp::I32x4Ne: case wasm::SimdOp::I32x4GtS: case wasm::SimdOp::I32x4LeS: case wasm::SimdOp::I32x4LtU: case wasm::SimdOp::I32x4GtU: case wasm::SimdOp::I32x4LeU: case wasm::SimdOp::I32x4GeU: case wasm::SimdOp::I64x2Eq: case wasm::SimdOp::I64x2Ne: case wasm::SimdOp::I64x2LtS: case wasm::SimdOp::I64x2GtS: case wasm::SimdOp::I64x2LeS: case wasm::SimdOp::I64x2GeS: case wasm::SimdOp::V128And: case wasm::SimdOp::V128Or: case wasm::SimdOp::V128Xor: case wasm::SimdOp::V128AndNot: case wasm::SimdOp::F32x4Min: case wasm::SimdOp::F32x4Max: case wasm::SimdOp::F64x2Min: case wasm::SimdOp::F64x2Max: case wasm::SimdOp::I8x16NarrowI16x8S: case wasm::SimdOp::I8x16NarrowI16x8U: case wasm::SimdOp::I16x8NarrowI32x4S: case wasm::SimdOp::I16x8NarrowI32x4U: case wasm::SimdOp::I32x4DotI16x8S: case wasm::SimdOp::I16x8ExtmulLowI8x16S: case wasm::SimdOp::I16x8ExtmulHighI8x16S: case wasm::SimdOp::I16x8ExtmulLowI8x16U: case wasm::SimdOp::I16x8ExtmulHighI8x16U: case wasm::SimdOp::I32x4ExtmulLowI16x8S: case wasm::SimdOp::I32x4ExtmulHighI16x8S: case wasm::SimdOp::I32x4ExtmulLowI16x8U: case wasm::SimdOp::I32x4ExtmulHighI16x8U: case wasm::SimdOp::I64x2ExtmulLowI32x4S: case wasm::SimdOp::I64x2ExtmulHighI32x4S: case wasm::SimdOp::I64x2ExtmulLowI32x4U: case wasm::SimdOp::I64x2ExtmulHighI32x4U: case wasm::SimdOp::I16x8Q15MulrSatS: case wasm::SimdOp::F32x4RelaxedMin: case wasm::SimdOp::F32x4RelaxedMax: case wasm::SimdOp::F64x2RelaxedMin: case wasm::SimdOp::F64x2RelaxedMax: case wasm::SimdOp::I16x8RelaxedQ15MulrS: case wasm::SimdOp::I16x8DotI8x16I7x16S: case wasm::SimdOp::MozPMADDUBSW: if (isThreeOpAllowed()) { auto* lir = new (alloc()) LWasmBinarySimd128(op, useRegisterAtStart(lhs), useRegisterAtStart(rhs), 
            tempReg0, tempReg1);
        define(lir, ins);
        break;
      }
      [[fallthrough]];
    default: {
      LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
      LAllocation rhsAlloc = willHaveDifferentLIRNodes(lhs, rhs)
                                 ? useRegister(rhs)
                                 : useRegisterAtStart(rhs);
      auto* lir = new (alloc())
          LWasmBinarySimd128(op, lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
      defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
      break;
    }
  }
#else
  MOZ_CRASH("No SIMD");
#endif
}

#ifdef ENABLE_WASM_SIMD
bool MWasmTernarySimd128::specializeBitselectConstantMaskAsShuffle(
    int8_t shuffle[16]) {
  if (simdOp() != wasm::SimdOp::V128Bitselect) {
    return false;
  }

  // Optimization when the control vector is a mask with all 0 or all 1 per
  // lane. On x86 there is no bitselect; blend operations will be a win,
  // e.g. via PBLENDVB or PBLENDW.
  SimdConstant constant = static_cast<MConstant*>(v2())->toSimd128();
  const SimdConstant::I8x16& bytes = constant.asInt8x16();
  for (int8_t i = 0; i < 16; i++) {
    if (bytes[i] == -1) {
      shuffle[i] = i + 16;
    } else if (bytes[i] == 0) {
      shuffle[i] = i;
    } else {
      return false;
    }
  }
  return true;
}

bool MWasmTernarySimd128::canRelaxBitselect() {
  wasm::SimdOp simdOp;
  if (v2()->isWasmBinarySimd128()) {
    simdOp = v2()->toWasmBinarySimd128()->simdOp();
  } else if (v2()->isWasmBinarySimd128WithConstant()) {
    simdOp = v2()->toWasmBinarySimd128WithConstant()->simdOp();
  } else {
    return false;
  }
  switch (simdOp) {
    case wasm::SimdOp::I8x16Eq:
    case wasm::SimdOp::I8x16Ne:
    case wasm::SimdOp::I8x16GtS:
    case wasm::SimdOp::I8x16GeS:
    case wasm::SimdOp::I8x16LtS:
    case wasm::SimdOp::I8x16LeS:
    case wasm::SimdOp::I8x16GtU:
    case wasm::SimdOp::I8x16GeU:
    case wasm::SimdOp::I8x16LtU:
    case wasm::SimdOp::I8x16LeU:
    case wasm::SimdOp::I16x8Eq:
    case wasm::SimdOp::I16x8Ne:
    case wasm::SimdOp::I16x8GtS:
    case wasm::SimdOp::I16x8GeS:
    case wasm::SimdOp::I16x8LtS:
    case wasm::SimdOp::I16x8LeS:
    case wasm::SimdOp::I16x8GtU:
    case wasm::SimdOp::I16x8GeU:
    case wasm::SimdOp::I16x8LtU:
    case wasm::SimdOp::I16x8LeU:
    case wasm::SimdOp::I32x4Eq:
    case wasm::SimdOp::I32x4Ne:
    case wasm::SimdOp::I32x4GtS:
    case wasm::SimdOp::I32x4GeS:
    case wasm::SimdOp::I32x4LtS:
    case wasm::SimdOp::I32x4LeS:
    case wasm::SimdOp::I32x4GtU:
    case wasm::SimdOp::I32x4GeU:
    case wasm::SimdOp::I32x4LtU:
    case wasm::SimdOp::I32x4LeU:
    case wasm::SimdOp::I64x2Eq:
    case wasm::SimdOp::I64x2Ne:
    case wasm::SimdOp::I64x2GtS:
    case wasm::SimdOp::I64x2GeS:
    case wasm::SimdOp::I64x2LtS:
    case wasm::SimdOp::I64x2LeS:
    case wasm::SimdOp::F32x4Eq:
    case wasm::SimdOp::F32x4Ne:
    case wasm::SimdOp::F32x4Gt:
    case wasm::SimdOp::F32x4Ge:
    case wasm::SimdOp::F32x4Lt:
    case wasm::SimdOp::F32x4Le:
    case wasm::SimdOp::F64x2Eq:
    case wasm::SimdOp::F64x2Ne:
    case wasm::SimdOp::F64x2Gt:
    case wasm::SimdOp::F64x2Ge:
    case wasm::SimdOp::F64x2Lt:
    case wasm::SimdOp::F64x2Le:
      return true;
    default:
      break;
  }
  return false;
}

bool MWasmBinarySimd128::canPmaddubsw() {
  MOZ_ASSERT(Assembler::HasSSE3());
  return true;
}
#endif

bool MWasmBinarySimd128::specializeForConstantRhs() {
  // The order follows MacroAssembler.h, generally
  switch (simdOp()) {
    // Operations implemented by a single native instruction where it is
    // plausible that the rhs (after commutation if available) could be a
    // constant.
    //
    // Swizzle is not here because it was handled earlier in the pipeline.
    //
    // Integer compares >= and < are not here because they are not supported in
    // the hardware.
    //
    // Floating compares are not here because our patching machinery can't
    // handle them yet.
    //
    // Floating-point min and max (including pmin and pmax) are not here
    // because they are not straightforward to implement.
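    //
    // (Illustrative: with a constant rhs the code generator can typically fold
    // the vector constant into a memory operand, e.g. paddd [constant], dest,
    // rather than materializing it in a register first.)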
case wasm::SimdOp::I8x16Add: case wasm::SimdOp::I16x8Add: case wasm::SimdOp::I32x4Add: case wasm::SimdOp::I64x2Add: case wasm::SimdOp::I8x16Sub: case wasm::SimdOp::I16x8Sub: case wasm::SimdOp::I32x4Sub: case wasm::SimdOp::I64x2Sub: case wasm::SimdOp::I16x8Mul: case wasm::SimdOp::I32x4Mul: case wasm::SimdOp::I8x16AddSatS: case wasm::SimdOp::I8x16AddSatU: case wasm::SimdOp::I16x8AddSatS: case wasm::SimdOp::I16x8AddSatU: case wasm::SimdOp::I8x16SubSatS: case wasm::SimdOp::I8x16SubSatU: case wasm::SimdOp::I16x8SubSatS: case wasm::SimdOp::I16x8SubSatU: case wasm::SimdOp::I8x16MinS: case wasm::SimdOp::I8x16MinU: case wasm::SimdOp::I16x8MinS: case wasm::SimdOp::I16x8MinU: case wasm::SimdOp::I32x4MinS: case wasm::SimdOp::I32x4MinU: case wasm::SimdOp::I8x16MaxS: case wasm::SimdOp::I8x16MaxU: case wasm::SimdOp::I16x8MaxS: case wasm::SimdOp::I16x8MaxU: case wasm::SimdOp::I32x4MaxS: case wasm::SimdOp::I32x4MaxU: case wasm::SimdOp::V128And: case wasm::SimdOp::V128Or: case wasm::SimdOp::V128Xor: case wasm::SimdOp::I8x16Eq: case wasm::SimdOp::I8x16Ne: case wasm::SimdOp::I8x16GtS: case wasm::SimdOp::I8x16LeS: case wasm::SimdOp::I16x8Eq: case wasm::SimdOp::I16x8Ne: case wasm::SimdOp::I16x8GtS: case wasm::SimdOp::I16x8LeS: case wasm::SimdOp::I32x4Eq: case wasm::SimdOp::I32x4Ne: case wasm::SimdOp::I32x4GtS: case wasm::SimdOp::I32x4LeS: case wasm::SimdOp::I64x2Mul: case wasm::SimdOp::F32x4Eq: case wasm::SimdOp::F32x4Ne: case wasm::SimdOp::F32x4Lt: case wasm::SimdOp::F32x4Le: case wasm::SimdOp::F64x2Eq: case wasm::SimdOp::F64x2Ne: case wasm::SimdOp::F64x2Lt: case wasm::SimdOp::F64x2Le: case wasm::SimdOp::I32x4DotI16x8S: case wasm::SimdOp::F32x4Add: case wasm::SimdOp::F64x2Add: case wasm::SimdOp::F32x4Sub: case wasm::SimdOp::F64x2Sub: case wasm::SimdOp::F32x4Div: case wasm::SimdOp::F64x2Div: case wasm::SimdOp::F32x4Mul: case wasm::SimdOp::F64x2Mul: case wasm::SimdOp::I8x16NarrowI16x8S: case wasm::SimdOp::I8x16NarrowI16x8U: case wasm::SimdOp::I16x8NarrowI32x4S: case wasm::SimdOp::I16x8NarrowI32x4U: return true; default: return false; } } void LIRGenerator::visitWasmBinarySimd128WithConstant( MWasmBinarySimd128WithConstant* ins) { #ifdef ENABLE_WASM_SIMD MDefinition* lhs = ins->lhs(); MOZ_ASSERT(lhs->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); // Allocate temp registers LDefinition tempReg = LDefinition::BogusTemp(); switch (ins->simdOp()) { case wasm::SimdOp::I64x2Mul: tempReg = tempSimd128(); break; default: break; } if (isThreeOpAllowed()) { // The non-destructive versions of instructions will be available // when AVX is enabled. LAllocation lhsAlloc = useRegisterAtStart(lhs); auto* lir = new (alloc()) LWasmBinarySimd128WithConstant(lhsAlloc, ins->rhs(), tempReg); define(lir, ins); } else { // Always beneficial to reuse the lhs register here, see discussion in // visitWasmBinarySimd128() and also code in specializeForConstantRhs(). 
LAllocation lhsDestAlloc = useRegisterAtStart(lhs); auto* lir = new (alloc()) LWasmBinarySimd128WithConstant(lhsDestAlloc, ins->rhs(), tempReg); defineReuseInput(lir, ins, LWasmBinarySimd128WithConstant::LhsDest); } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { #ifdef ENABLE_WASM_SIMD MDefinition* lhs = ins->lhs(); MDefinition* rhs = ins->rhs(); MOZ_ASSERT(lhs->type() == MIRType::Simd128); MOZ_ASSERT(rhs->type() == MIRType::Int32); MOZ_ASSERT(ins->type() == MIRType::Simd128); if (rhs->isConstant()) { int32_t shiftCountMask; switch (ins->simdOp()) { case wasm::SimdOp::I8x16Shl: case wasm::SimdOp::I8x16ShrU: case wasm::SimdOp::I8x16ShrS: shiftCountMask = 7; break; case wasm::SimdOp::I16x8Shl: case wasm::SimdOp::I16x8ShrU: case wasm::SimdOp::I16x8ShrS: shiftCountMask = 15; break; case wasm::SimdOp::I32x4Shl: case wasm::SimdOp::I32x4ShrU: case wasm::SimdOp::I32x4ShrS: shiftCountMask = 31; break; case wasm::SimdOp::I64x2Shl: case wasm::SimdOp::I64x2ShrU: case wasm::SimdOp::I64x2ShrS: shiftCountMask = 63; break; default: MOZ_CRASH("Unexpected shift operation"); } int32_t shiftCount = rhs->toConstant()->toInt32() & shiftCountMask; if (shiftCount == shiftCountMask) { // Check if possible to apply sign replication optimization. // For some ops the input shall be reused. switch (ins->simdOp()) { case wasm::SimdOp::I8x16ShrS: { auto* lir = new (alloc()) LWasmSignReplicationSimd128(useRegister(lhs)); define(lir, ins); return; } case wasm::SimdOp::I16x8ShrS: case wasm::SimdOp::I32x4ShrS: case wasm::SimdOp::I64x2ShrS: { auto* lir = new (alloc()) LWasmSignReplicationSimd128(useRegisterAtStart(lhs)); if (isThreeOpAllowed()) { define(lir, ins); } else { // For non-AVX, it is always beneficial to reuse the input. defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); } return; } default: break; } } # ifdef DEBUG js::wasm::ReportSimdAnalysis("shift -> constant shift"); # endif auto* lir = new (alloc()) LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount); if (isThreeOpAllowed()) { define(lir, ins); } else { // For non-AVX, it is always beneficial to reuse the input. defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); } return; } # ifdef DEBUG js::wasm::ReportSimdAnalysis("shift -> variable shift"); # endif LDefinition tempReg = LDefinition::BogusTemp(); switch (ins->simdOp()) { case wasm::SimdOp::I8x16Shl: case wasm::SimdOp::I8x16ShrS: case wasm::SimdOp::I8x16ShrU: case wasm::SimdOp::I64x2ShrS: tempReg = tempSimd128(); break; default: break; } // Reusing the input if possible is never detrimental. 
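  //
  // (The temp above covers the cases SSE/SSE2 cannot express directly: there
  // is no byte-lane shift (no PSLLB/PSRLB) and no 64-bit arithmetic right
  // shift (no PSRAQ), so those are synthesized in the code generator.)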
LAllocation lhsDestAlloc = useRegisterAtStart(lhs); LAllocation rhsAlloc = useRegisterAtStart(rhs); auto* lir = new (alloc()) LWasmVariableShiftSimd128(lhsDestAlloc, rhsAlloc, tempReg); defineReuseInput(lir, ins, LWasmVariableShiftSimd128::LhsDest); #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) { #ifdef ENABLE_WASM_SIMD MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128); MOZ_ASSERT(ins->rhs()->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); SimdShuffle s = ins->shuffle(); switch (s.opd) { case SimdShuffle::Operand::LEFT: case SimdShuffle::Operand::RIGHT: { LAllocation src; bool reuse = false; switch (*s.permuteOp) { case SimdPermuteOp::MOVE: reuse = true; break; case SimdPermuteOp::BROADCAST_8x16: case SimdPermuteOp::BROADCAST_16x8: case SimdPermuteOp::PERMUTE_8x16: case SimdPermuteOp::PERMUTE_16x8: case SimdPermuteOp::PERMUTE_32x4: case SimdPermuteOp::ROTATE_RIGHT_8x16: case SimdPermuteOp::SHIFT_LEFT_8x16: case SimdPermuteOp::SHIFT_RIGHT_8x16: case SimdPermuteOp::REVERSE_16x8: case SimdPermuteOp::REVERSE_32x4: case SimdPermuteOp::REVERSE_64x2: // No need to reuse registers when VEX instructions are enabled. reuse = !Assembler::HasAVX(); break; default: MOZ_CRASH("Unexpected operator"); } if (s.opd == SimdShuffle::Operand::LEFT) { src = useRegisterAtStart(ins->lhs()); } else { src = useRegisterAtStart(ins->rhs()); } auto* lir = new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control); if (reuse) { defineReuseInput(lir, ins, LWasmPermuteSimd128::Src); } else { define(lir, ins); } break; } case SimdShuffle::Operand::BOTH: case SimdShuffle::Operand::BOTH_SWAPPED: { LDefinition temp = LDefinition::BogusTemp(); switch (*s.shuffleOp) { case SimdShuffleOp::BLEND_8x16: temp = Assembler::HasAVX() ? tempSimd128() : tempFixed(xmm0); break; default: break; } if (isThreeOpAllowed()) { LAllocation lhs; LAllocation rhs; if (s.opd == SimdShuffle::Operand::BOTH) { lhs = useRegisterAtStart(ins->lhs()); rhs = useRegisterAtStart(ins->rhs()); } else { lhs = useRegisterAtStart(ins->rhs()); rhs = useRegisterAtStart(ins->lhs()); } auto* lir = new (alloc()) LWasmShuffleSimd128(lhs, rhs, temp, *s.shuffleOp, s.control); define(lir, ins); } else { LAllocation lhs; LAllocation rhs; if (s.opd == SimdShuffle::Operand::BOTH) { lhs = useRegisterAtStart(ins->lhs()); rhs = useRegister(ins->rhs()); } else { lhs = useRegisterAtStart(ins->rhs()); rhs = useRegister(ins->lhs()); } auto* lir = new (alloc()) LWasmShuffleSimd128(lhs, rhs, temp, *s.shuffleOp, s.control); defineReuseInput(lir, ins, LWasmShuffleSimd128::LhsDest); } break; } } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) { #ifdef ENABLE_WASM_SIMD MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); // If AVX support is disabled, the Masm API is (rhs, lhsDest) and requires // AtStart+ReuseInput for the lhs. For type reasons, the rhs will never be // the same as the lhs and is therefore a plain Use. // // If AVX support is enabled, useRegisterAtStart is preferred. 
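  //
  // The Int64 case is handled separately below because on 32-bit targets an
  // int64 value occupies a register pair and needs the useInt64Register*
  // allocation helpers.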
if (ins->rhs()->type() == MIRType::Int64) { if (isThreeOpAllowed()) { auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128( useRegisterAtStart(ins->lhs()), useInt64RegisterAtStart(ins->rhs())); define(lir, ins); } else { auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128( useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs())); defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsDest); } } else { if (isThreeOpAllowed()) { auto* lir = new (alloc()) LWasmReplaceLaneSimd128( useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs())); define(lir, ins); } else { auto* lir = new (alloc()) LWasmReplaceLaneSimd128( useRegisterAtStart(ins->lhs()), useRegister(ins->rhs())); defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsDest); } } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) { #ifdef ENABLE_WASM_SIMD MOZ_ASSERT(ins->type() == MIRType::Simd128); switch (ins->input()->type()) { case MIRType::Int64: { // 64-bit integer splats. // Load-and-(sign|zero)extend. auto* lir = new (alloc()) LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input())); define(lir, ins); break; } case MIRType::Float32: case MIRType::Double: { // Floating-point splats. // Ideally we save a move on SSE systems by reusing the input register, // but since the input and output register types differ, we can't. auto* lir = new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input())); define(lir, ins); break; } default: { // 32-bit integer splats. auto* lir = new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input())); define(lir, ins); break; } } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { #ifdef ENABLE_WASM_SIMD MOZ_ASSERT(ins->input()->type() == MIRType::Simd128); MOZ_ASSERT(ins->type() == MIRType::Simd128); bool useAtStart = false; bool reuseInput = false; LDefinition tempReg = LDefinition::BogusTemp(); switch (ins->simdOp()) { case wasm::SimdOp::I8x16Neg: case wasm::SimdOp::I16x8Neg: case wasm::SimdOp::I32x4Neg: case wasm::SimdOp::I64x2Neg: case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: // Prefer src != dest to avoid an unconditional src->temp move. MOZ_ASSERT(!reuseInput); // If AVX is enabled, we prefer useRegisterAtStart. useAtStart = isThreeOpAllowed(); break; case wasm::SimdOp::F32x4Neg: case wasm::SimdOp::F64x2Neg: case wasm::SimdOp::F32x4Abs: case wasm::SimdOp::F64x2Abs: case wasm::SimdOp::V128Not: case wasm::SimdOp::F32x4Sqrt: case wasm::SimdOp::F64x2Sqrt: case wasm::SimdOp::I8x16Abs: case wasm::SimdOp::I16x8Abs: case wasm::SimdOp::I32x4Abs: case wasm::SimdOp::I64x2Abs: case wasm::SimdOp::I32x4TruncSatF32x4S: case wasm::SimdOp::F32x4ConvertI32x4U: case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U: case wasm::SimdOp::I32x4RelaxedTruncF32x4S: case wasm::SimdOp::I32x4RelaxedTruncF32x4U: case wasm::SimdOp::I32x4RelaxedTruncF64x2SZero: case wasm::SimdOp::I32x4RelaxedTruncF64x2UZero: case wasm::SimdOp::I64x2ExtendHighI32x4S: case wasm::SimdOp::I64x2ExtendHighI32x4U: // Prefer src == dest to avoid an unconditional src->dest move // for better performance in non-AVX mode (e.g. non-PSHUFD use). 
useAtStart = true; reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I32x4TruncSatF32x4U: case wasm::SimdOp::I32x4TruncSatF64x2SZero: case wasm::SimdOp::I32x4TruncSatF64x2UZero: case wasm::SimdOp::I8x16Popcnt: tempReg = tempSimd128(); // Prefer src == dest to avoid an unconditional src->dest move // in non-AVX mode. useAtStart = true; reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I16x8ExtendLowI8x16S: case wasm::SimdOp::I16x8ExtendHighI8x16S: case wasm::SimdOp::I16x8ExtendLowI8x16U: case wasm::SimdOp::I16x8ExtendHighI8x16U: case wasm::SimdOp::I32x4ExtendLowI16x8S: case wasm::SimdOp::I32x4ExtendHighI16x8S: case wasm::SimdOp::I32x4ExtendLowI16x8U: case wasm::SimdOp::I32x4ExtendHighI16x8U: case wasm::SimdOp::I64x2ExtendLowI32x4S: case wasm::SimdOp::I64x2ExtendLowI32x4U: case wasm::SimdOp::F32x4ConvertI32x4S: case wasm::SimdOp::F32x4Ceil: case wasm::SimdOp::F32x4Floor: case wasm::SimdOp::F32x4Trunc: case wasm::SimdOp::F32x4Nearest: case wasm::SimdOp::F64x2Ceil: case wasm::SimdOp::F64x2Floor: case wasm::SimdOp::F64x2Trunc: case wasm::SimdOp::F64x2Nearest: case wasm::SimdOp::F32x4DemoteF64x2Zero: case wasm::SimdOp::F64x2PromoteLowF32x4: case wasm::SimdOp::F64x2ConvertLowI32x4S: case wasm::SimdOp::F64x2ConvertLowI32x4U: // Prefer src == dest to exert the lowest register pressure on the // surrounding code. useAtStart = true; MOZ_ASSERT(!reuseInput); break; default: MOZ_CRASH("Unary SimdOp not implemented"); } LUse inputUse = useAtStart ? useRegisterAtStart(ins->input()) : useRegister(ins->input()); LWasmUnarySimd128* lir = new (alloc()) LWasmUnarySimd128(inputUse, tempReg); if (reuseInput) { defineReuseInput(lir, ins, LWasmUnarySimd128::Src); } else { define(lir, ins); } #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmLoadLaneSimd128(MWasmLoadLaneSimd128* ins) { #ifdef ENABLE_WASM_SIMD // A trick: On 32-bit systems, the base pointer is 32 bits (it was bounds // checked and then chopped). On 64-bit systems, it can be 32 bits or 64 // bits. Either way, it fits in a GPR so we can ignore the // Register/Register64 distinction here. # ifndef JS_64BIT MOZ_ASSERT(ins->base()->type() == MIRType::Int32); # endif LUse base = useRegisterAtStart(ins->base()); LUse inputUse = useRegisterAtStart(ins->value()); LAllocation memoryBase = ins->hasMemoryBase() ? useRegisterAtStart(ins->memoryBase()) : LAllocation(); LWasmLoadLaneSimd128* lir = new (alloc()) LWasmLoadLaneSimd128( base, inputUse, LDefinition::BogusTemp(), memoryBase); defineReuseInput(lir, ins, LWasmLoadLaneSimd128::Src); #else MOZ_CRASH("No SIMD"); #endif } void LIRGenerator::visitWasmStoreLaneSimd128(MWasmStoreLaneSimd128* ins) { #ifdef ENABLE_WASM_SIMD // See comment above. # ifndef JS_64BIT MOZ_ASSERT(ins->base()->type() == MIRType::Int32); # endif LUse base = useRegisterAtStart(ins->base()); LUse input = useRegisterAtStart(ins->value()); LAllocation memoryBase = ins->hasMemoryBase() ? 
useRegisterAtStart(ins->memoryBase()) : LAllocation(); LWasmStoreLaneSimd128* lir = new (alloc()) LWasmStoreLaneSimd128(base, input, LDefinition::BogusTemp(), memoryBase); add(lir, ins); #else MOZ_CRASH("No SIMD"); #endif } #ifdef ENABLE_WASM_SIMD bool LIRGeneratorX86Shared::canFoldReduceSimd128AndBranch(wasm::SimdOp op) { switch (op) { case wasm::SimdOp::V128AnyTrue: case wasm::SimdOp::I8x16AllTrue: case wasm::SimdOp::I16x8AllTrue: case wasm::SimdOp::I32x4AllTrue: case wasm::SimdOp::I64x2AllTrue: case wasm::SimdOp::I16x8Bitmask: return true; default: return false; } } bool LIRGeneratorX86Shared::canEmitWasmReduceSimd128AtUses( MWasmReduceSimd128* ins) { if (!ins->canEmitAtUses()) { return false; } // Only specific ops generating int32. if (ins->type() != MIRType::Int32) { return false; } if (!canFoldReduceSimd128AndBranch(ins->simdOp())) { return false; } // If never used then defer (it will be removed). MUseIterator iter(ins->usesBegin()); if (iter == ins->usesEnd()) { return true; } // We require an MTest consumer. MNode* node = iter->consumer(); if (!node->isDefinition() || !node->toDefinition()->isTest()) { return false; } // Defer only if there's only one use. iter++; return iter == ins->usesEnd(); } #endif // ENABLE_WASM_SIMD void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) { #ifdef ENABLE_WASM_SIMD if (canEmitWasmReduceSimd128AtUses(ins)) { emitAtUses(ins); return; } // Reductions (any_true, all_true, bitmask, extract_lane) uniformly prefer // useRegisterAtStart: // // - In most cases, the input type differs from the output type, so there's no // conflict and it doesn't really matter. // // - For extract_lane(0) on F32x4 and F64x2, input == output results in zero // code being generated. // // - For extract_lane(k > 0) on F32x4 and F64x2, allowing the input register // to be targeted lowers register pressure if it's the last use of the // input. if (ins->type() == MIRType::Int64) { auto* lir = new (alloc()) LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input())); defineInt64(lir, ins); } else { // Ideally we would reuse the input register for floating extract_lane if // the lane is zero, but constraints in the register allocator require the // input and output register types to be the same. auto* lir = new (alloc()) LWasmReduceSimd128( useRegisterAtStart(ins->input()), LDefinition::BogusTemp()); define(lir, ins); } #else MOZ_CRASH("No SIMD"); #endif }