Diffstat:
 js/src/jit/x86-shared/Lowering-x86-shared.cpp | 1393 +++++++++++++++++++++++++
 1 file changed, 1393 insertions(+), 0 deletions(-)
diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
new file mode 100644
index 0000000000..d0ce6b1496
--- /dev/null
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -0,0 +1,1393 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/x86-shared/Lowering-x86-shared.h"
+
+#include "mozilla/MathAlgorithms.h"
+
+#include "jit/Lowering.h"
+#include "jit/MIR.h"
+
+#include "jit/shared/Lowering-shared-inl.h"
+
+using namespace js;
+using namespace js::jit;
+
+using mozilla::Abs;
+using mozilla::FloorLog2;
+using mozilla::Maybe;
+using mozilla::Nothing;
+using mozilla::Some;
+
+LTableSwitch* LIRGeneratorX86Shared::newLTableSwitch(
+ const LAllocation& in, const LDefinition& inputCopy,
+ MTableSwitch* tableswitch) {
+ return new (alloc()) LTableSwitch(in, inputCopy, temp(), tableswitch);
+}
+
+LTableSwitchV* LIRGeneratorX86Shared::newLTableSwitchV(
+ MTableSwitch* tableswitch) {
+ return new (alloc()) LTableSwitchV(useBox(tableswitch->getOperand(0)), temp(),
+ tempDouble(), temp(), tableswitch);
+}
+
+void LIRGenerator::visitPowHalf(MPowHalf* ins) {
+ MDefinition* input = ins->input();
+ MOZ_ASSERT(input->type() == MIRType::Double);
+ LPowHalfD* lir = new (alloc()) LPowHalfD(useRegisterAtStart(input));
+ define(lir, ins);
+}
+
+void LIRGeneratorX86Shared::lowerForShift(LInstructionHelper<1, 2, 0>* ins,
+ MDefinition* mir, MDefinition* lhs,
+ MDefinition* rhs) {
+ ins->setOperand(0, useRegisterAtStart(lhs));
+
+  // The shift operand should be a constant or, unless BMI2 is available, in
+  // register ecx: x86 cannot take a variable shift count from any other
+  // register.
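+  // For illustration only (not the exact code emitted here): the legacy
+  // encoding takes its count in CL, as in
+  //   shl %cl, %eax
+  // whereas BMI2 SHLX can take the count from any register, e.g.
+  //   shlx %ebx, %esi, %eax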
+ if (rhs->isConstant()) {
+ ins->setOperand(1, useOrConstantAtStart(rhs));
+ } else if (Assembler::HasBMI2() && !mir->isRotate()) {
+ ins->setOperand(1, lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs));
+ } else {
+ ins->setOperand(
+ 1, lhs != rhs ? useFixed(rhs, ecx) : useFixedAtStart(rhs, ecx));
+ }
+
+ defineReuseInput(ins, mir, 0);
+}
+
+template <size_t Temps>
+void LIRGeneratorX86Shared::lowerForShiftInt64(
+ LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, Temps>* ins,
+ MDefinition* mir, MDefinition* lhs, MDefinition* rhs) {
+ ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
+#if defined(JS_NUNBOX32)
+ if (mir->isRotate()) {
+ ins->setTemp(0, temp());
+ }
+#endif
+
+ static_assert(LShiftI64::Rhs == INT64_PIECES,
+ "Assume Rhs is located at INT64_PIECES.");
+ static_assert(LRotateI64::Count == INT64_PIECES,
+ "Assume Count is located at INT64_PIECES.");
+
+  // The shift operand should be a constant or, unless BMI2 is available, in
+  // register ecx: x86 cannot take a variable shift count from any other
+  // register.
+ if (rhs->isConstant()) {
+ ins->setOperand(INT64_PIECES, useOrConstantAtStart(rhs));
+#ifdef JS_CODEGEN_X64
+ } else if (Assembler::HasBMI2() && !mir->isRotate()) {
+ ins->setOperand(INT64_PIECES, useRegister(rhs));
+#endif
+ } else {
+ // The operands are int64, but we only care about the lower 32 bits of
+ // the RHS. On 32-bit, the code below will load that part in ecx and
+ // will discard the upper half.
+ ensureDefined(rhs);
+ LUse use(ecx);
+ use.setVirtualRegister(rhs->virtualRegister());
+ ins->setOperand(INT64_PIECES, use);
+ }
+
+ defineInt64ReuseInput(ins, mir, 0);
+}
+
+template void LIRGeneratorX86Shared::lowerForShiftInt64(
+ LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 0>* ins,
+ MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
+template void LIRGeneratorX86Shared::lowerForShiftInt64(
+ LInstructionHelper<INT64_PIECES, INT64_PIECES + 1, 1>* ins,
+ MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
+
+void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 1, 0>* ins,
+ MDefinition* mir, MDefinition* input) {
+ ins->setOperand(0, useRegisterAtStart(input));
+ defineReuseInput(ins, mir, 0);
+}
+
+void LIRGeneratorX86Shared::lowerForALU(LInstructionHelper<1, 2, 0>* ins,
+ MDefinition* mir, MDefinition* lhs,
+ MDefinition* rhs) {
+ ins->setOperand(0, useRegisterAtStart(lhs));
+ ins->setOperand(1,
+ lhs != rhs ? useOrConstant(rhs) : useOrConstantAtStart(rhs));
+ defineReuseInput(ins, mir, 0);
+}
+
+template <size_t Temps>
+void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, Temps>* ins,
+ MDefinition* mir, MDefinition* lhs,
+ MDefinition* rhs) {
+ // Without AVX, we'll need to use the x86 encodings where one of the
+ // inputs must be the same location as the output.
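+  // As an illustrative sketch (not necessarily the exact instructions chosen
+  // by the code generator): SSE `addsd %xmm1, %xmm0` overwrites its left
+  // input, while AVX `vaddsd %xmm2, %xmm1, %xmm0` writes to a separate
+  // destination register.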
+ if (!Assembler::HasAVX()) {
+ ins->setOperand(0, useRegisterAtStart(lhs));
+ ins->setOperand(1, lhs != rhs ? use(rhs) : useAtStart(rhs));
+ defineReuseInput(ins, mir, 0);
+ } else {
+ ins->setOperand(0, useRegisterAtStart(lhs));
+ ins->setOperand(1, useAtStart(rhs));
+ define(ins, mir);
+ }
+}
+
+template void LIRGeneratorX86Shared::lowerForFPU(
+ LInstructionHelper<1, 2, 0>* ins, MDefinition* mir, MDefinition* lhs,
+ MDefinition* rhs);
+template void LIRGeneratorX86Shared::lowerForFPU(
+ LInstructionHelper<1, 2, 1>* ins, MDefinition* mir, MDefinition* lhs,
+ MDefinition* rhs);
+
+void LIRGeneratorX86Shared::lowerForBitAndAndBranch(LBitAndAndBranch* baab,
+ MInstruction* mir,
+ MDefinition* lhs,
+ MDefinition* rhs) {
+ baab->setOperand(0, useRegisterAtStart(lhs));
+ baab->setOperand(1, useRegisterOrConstantAtStart(rhs));
+ add(baab, mir);
+}
+
+void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs,
+ MDefinition* rhs) {
+ // Note: If we need a negative zero check, lhs is used twice.
+ LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation();
+ LMulI* lir = new (alloc()) LMulI(
+ useRegisterAtStart(lhs),
+ lhs != rhs ? useOrConstant(rhs) : useOrConstantAtStart(rhs), lhsCopy);
+ if (mul->fallible()) {
+ assignSnapshot(lir, mul->bailoutKind());
+ }
+ defineReuseInput(lir, mul, 0);
+}
+
+void LIRGeneratorX86Shared::lowerDivI(MDiv* div) {
+ if (div->isUnsigned()) {
+ lowerUDiv(div);
+ return;
+ }
+
+ // Division instructions are slow. Division by constant denominators can be
+ // rewritten to use other instructions.
+ if (div->rhs()->isConstant()) {
+ int32_t rhs = div->rhs()->toConstant()->toInt32();
+
+ // Division by powers of two can be done by shifting, and division by
+ // other numbers can be done by a reciprocal multiplication technique.
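+    // For example (unsigned case), x / 3 equals
+    // uint32_t((uint64_t(x) * 0xAAAAAAAB) >> 33) for every uint32_t x; the
+    // constant paths below rely on identities of this kind.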
+ int32_t shift = FloorLog2(Abs(rhs));
+ if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
+ LAllocation lhs = useRegisterAtStart(div->lhs());
+ LDivPowTwoI* lir;
+      // When the result is truncated and the remainder may be non-zero, we
+      // have to round the result toward 0. This requires an extra register
+      // to adjust the dividend depending on whether it is negative.
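+      // A common shape for round-toward-zero division by 2^k, shown only as
+      // a sketch (the actual sequence is chosen by the code generator):
+      //   mov   lhs, tmp
+      //   sar   $31, tmp         ; tmp = lhs < 0 ? -1 : 0
+      //   shr   $(32-k), tmp     ; tmp = lhs < 0 ? 2^k-1 : 0
+      //   add   lhs, tmp
+      //   sar   $k, tmp          ; truncated quotient (negate if rhs < 0)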
+ bool needRoundNeg = div->canBeNegativeDividend() && div->isTruncated();
+ if (!needRoundNeg) {
+ // Numerator is unsigned, so does not need adjusting.
+ lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, rhs < 0);
+ } else {
+ // Numerator might be signed, and needs adjusting, and an extra lhs copy
+ // is needed to round the result of the integer division towards zero.
+ lir = new (alloc())
+ LDivPowTwoI(lhs, useRegister(div->lhs()), shift, rhs < 0);
+ }
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineReuseInput(lir, div, 0);
+ return;
+ }
+ if (rhs != 0) {
+ LDivOrModConstantI* lir;
+ lir = new (alloc())
+ LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax));
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineFixed(lir, div, LAllocation(AnyRegister(edx)));
+ return;
+ }
+ }
+
+ LDivI* lir = new (alloc())
+ LDivI(useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx));
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineFixed(lir, div, LAllocation(AnyRegister(eax)));
+}
+
+void LIRGeneratorX86Shared::lowerModI(MMod* mod) {
+ if (mod->isUnsigned()) {
+ lowerUMod(mod);
+ return;
+ }
+
+ if (mod->rhs()->isConstant()) {
+ int32_t rhs = mod->rhs()->toConstant()->toInt32();
+ int32_t shift = FloorLog2(Abs(rhs));
+ if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
+ LModPowTwoI* lir =
+ new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift);
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineReuseInput(lir, mod, 0);
+ return;
+ }
+ if (rhs != 0) {
+ LDivOrModConstantI* lir;
+ lir = new (alloc())
+ LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx));
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
+ return;
+ }
+ }
+
+ LModI* lir = new (alloc())
+ LModI(useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax));
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineFixed(lir, mod, LAllocation(AnyRegister(edx)));
+}
+
+void LIRGenerator::visitWasmNeg(MWasmNeg* ins) {
+ switch (ins->type()) {
+ case MIRType::Int32:
+ defineReuseInput(new (alloc()) LNegI(useRegisterAtStart(ins->input())),
+ ins, 0);
+ break;
+ case MIRType::Float32:
+ defineReuseInput(new (alloc()) LNegF(useRegisterAtStart(ins->input())),
+ ins, 0);
+ break;
+ case MIRType::Double:
+ defineReuseInput(new (alloc()) LNegD(useRegisterAtStart(ins->input())),
+ ins, 0);
+ break;
+ default:
+ MOZ_CRASH();
+ }
+}
+
+void LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) {
+ MDefinition* base = ins->base();
+ MOZ_ASSERT(base->type() == MIRType::Int32);
+
+ MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
+ MOZ_ASSERT_IF(ins->needsBoundsCheck(),
+ boundsCheckLimit->type() == MIRType::Int32);
+
+  // For simplicity, require a register if we're going to emit a bounds-check
+  // branch, so that we don't have special cases for constants. This should
+  // only happen in rare constant-folding cases, since asm.js sets the minimum
+  // heap size based on the constant addresses it accesses.
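+  // Conceptually the bounds check compares the index against the limit and
+  // branches on unsigned >=, roughly:
+  //   cmp boundsCheckLimit, base
+  //   jae ->out-of-bounds path
+  // (sketch only; the actual branch is emitted by the code generator).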
+ LAllocation baseAlloc = ins->needsBoundsCheck()
+ ? useRegisterAtStart(base)
+ : useRegisterOrZeroAtStart(base);
+
+ LAllocation limitAlloc = ins->needsBoundsCheck()
+ ? useRegisterAtStart(boundsCheckLimit)
+ : LAllocation();
+ LAllocation memoryBaseAlloc = ins->hasMemoryBase()
+ ? useRegisterAtStart(ins->memoryBase())
+ : LAllocation();
+
+ auto* lir =
+ new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, memoryBaseAlloc);
+ define(lir, ins);
+}
+
+void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) {
+ MDefinition* base = ins->base();
+ MOZ_ASSERT(base->type() == MIRType::Int32);
+
+ MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
+ MOZ_ASSERT_IF(ins->needsBoundsCheck(),
+ boundsCheckLimit->type() == MIRType::Int32);
+
+  // For simplicity, require a register if we're going to emit a bounds-check
+  // branch, so that we don't have special cases for constants. This should
+  // only happen in rare constant-folding cases, since asm.js sets the minimum
+  // heap size based on the constant addresses it accesses.
+ LAllocation baseAlloc = ins->needsBoundsCheck()
+ ? useRegisterAtStart(base)
+ : useRegisterOrZeroAtStart(base);
+
+ LAllocation limitAlloc = ins->needsBoundsCheck()
+ ? useRegisterAtStart(boundsCheckLimit)
+ : LAllocation();
+ LAllocation memoryBaseAlloc = ins->hasMemoryBase()
+ ? useRegisterAtStart(ins->memoryBase())
+ : LAllocation();
+
+ LAsmJSStoreHeap* lir = nullptr;
+ switch (ins->access().type()) {
+ case Scalar::Int8:
+ case Scalar::Uint8:
+#ifdef JS_CODEGEN_X86
+ // See comment for LIRGeneratorX86::useByteOpRegister.
+ lir = new (alloc()) LAsmJSStoreHeap(
+ baseAlloc, useFixed(ins->value(), eax), limitAlloc, memoryBaseAlloc);
+ break;
+#endif
+ case Scalar::Int16:
+ case Scalar::Uint16:
+ case Scalar::Int32:
+ case Scalar::Uint32:
+ case Scalar::Float32:
+ case Scalar::Float64:
+ // For now, don't allow constant values. The immediate operand affects
+ // instruction layout which affects patching.
+ lir = new (alloc())
+ LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()),
+ limitAlloc, memoryBaseAlloc);
+ break;
+ case Scalar::Int64:
+ case Scalar::Simd128:
+ MOZ_CRASH("NYI");
+ case Scalar::Uint8Clamped:
+ case Scalar::BigInt64:
+ case Scalar::BigUint64:
+ case Scalar::MaxTypedArrayViewType:
+ MOZ_CRASH("unexpected array type");
+ }
+ add(lir, ins);
+}
+
+void LIRGeneratorX86Shared::lowerUDiv(MDiv* div) {
+ if (div->rhs()->isConstant()) {
+ uint32_t rhs = div->rhs()->toConstant()->toInt32();
+ int32_t shift = FloorLog2(rhs);
+
+ LAllocation lhs = useRegisterAtStart(div->lhs());
+ if (rhs != 0 && uint32_t(1) << shift == rhs) {
+ LDivPowTwoI* lir = new (alloc()) LDivPowTwoI(lhs, lhs, shift, false);
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineReuseInput(lir, div, 0);
+ } else {
+ LUDivOrModConstant* lir = new (alloc())
+ LUDivOrModConstant(useRegister(div->lhs()), rhs, tempFixed(eax));
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineFixed(lir, div, LAllocation(AnyRegister(edx)));
+ }
+ return;
+ }
+
+ LUDivOrMod* lir = new (alloc()) LUDivOrMod(
+ useRegister(div->lhs()), useRegister(div->rhs()), tempFixed(edx));
+ if (div->fallible()) {
+ assignSnapshot(lir, div->bailoutKind());
+ }
+ defineFixed(lir, div, LAllocation(AnyRegister(eax)));
+}
+
+void LIRGeneratorX86Shared::lowerUMod(MMod* mod) {
+ if (mod->rhs()->isConstant()) {
+ uint32_t rhs = mod->rhs()->toConstant()->toInt32();
+ int32_t shift = FloorLog2(rhs);
+
+ if (rhs != 0 && uint32_t(1) << shift == rhs) {
+ LModPowTwoI* lir =
+ new (alloc()) LModPowTwoI(useRegisterAtStart(mod->lhs()), shift);
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineReuseInput(lir, mod, 0);
+ } else {
+ LUDivOrModConstant* lir = new (alloc())
+ LUDivOrModConstant(useRegister(mod->lhs()), rhs, tempFixed(edx));
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
+ }
+ return;
+ }
+
+ LUDivOrMod* lir = new (alloc()) LUDivOrMod(
+ useRegister(mod->lhs()), useRegister(mod->rhs()), tempFixed(eax));
+ if (mod->fallible()) {
+ assignSnapshot(lir, mod->bailoutKind());
+ }
+ defineFixed(lir, mod, LAllocation(AnyRegister(edx)));
+}
+
+void LIRGeneratorX86Shared::lowerUrshD(MUrsh* mir) {
+ MDefinition* lhs = mir->lhs();
+ MDefinition* rhs = mir->rhs();
+
+ MOZ_ASSERT(lhs->type() == MIRType::Int32);
+ MOZ_ASSERT(rhs->type() == MIRType::Int32);
+ MOZ_ASSERT(mir->type() == MIRType::Double);
+
+#ifdef JS_CODEGEN_X64
+ static_assert(ecx == rcx);
+#endif
+
+ // Without BMI2, x86 can only shift by ecx.
+ LUse lhsUse = useRegisterAtStart(lhs);
+ LAllocation rhsAlloc;
+ if (rhs->isConstant()) {
+ rhsAlloc = useOrConstant(rhs);
+ } else if (Assembler::HasBMI2()) {
+ rhsAlloc = useRegister(rhs);
+ } else {
+ rhsAlloc = useFixed(rhs, ecx);
+ }
+
+ LUrshD* lir = new (alloc()) LUrshD(lhsUse, rhsAlloc, tempCopy(lhs, 0));
+ define(lir, mir);
+}
+
+void LIRGeneratorX86Shared::lowerPowOfTwoI(MPow* mir) {
+ int32_t base = mir->input()->toConstant()->toInt32();
+ MDefinition* power = mir->power();
+
+  // The shift operand should be in register ecx unless BMI2 is available:
+  // x86 cannot take a variable shift count from any other register.
+ LAllocation powerAlloc =
+ Assembler::HasBMI2() ? useRegister(power) : useFixed(power, ecx);
+ auto* lir = new (alloc()) LPowOfTwoI(base, powerAlloc);
+ assignSnapshot(lir, mir->bailoutKind());
+ define(lir, mir);
+}
+
+void LIRGeneratorX86Shared::lowerBigIntLsh(MBigIntLsh* ins) {
+  // The shift operand should be in register ecx unless BMI2 is available:
+  // x86 cannot take a variable shift count from any other register.
+ LDefinition shiftAlloc = Assembler::HasBMI2() ? temp() : tempFixed(ecx);
+ auto* lir =
+ new (alloc()) LBigIntLsh(useRegister(ins->lhs()), useRegister(ins->rhs()),
+ temp(), shiftAlloc, temp());
+ define(lir, ins);
+ assignSafepoint(lir, ins);
+}
+
+void LIRGeneratorX86Shared::lowerBigIntRsh(MBigIntRsh* ins) {
+  // The shift operand should be in register ecx unless BMI2 is available:
+  // x86 cannot take a variable shift count from any other register.
+ LDefinition shiftAlloc = Assembler::HasBMI2() ? temp() : tempFixed(ecx);
+ auto* lir =
+ new (alloc()) LBigIntRsh(useRegister(ins->lhs()), useRegister(ins->rhs()),
+ temp(), shiftAlloc, temp());
+ define(lir, ins);
+ assignSafepoint(lir, ins);
+}
+
+void LIRGeneratorX86Shared::lowerWasmBuiltinTruncateToInt32(
+ MWasmBuiltinTruncateToInt32* ins) {
+ MDefinition* opd = ins->input();
+ MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
+
+ LDefinition maybeTemp =
+ Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble();
+ if (opd->type() == MIRType::Double) {
+ define(new (alloc()) LWasmBuiltinTruncateDToInt32(
+ useRegister(opd), useFixed(ins->tls(), WasmTlsReg), maybeTemp),
+ ins);
+ return;
+ }
+
+ define(new (alloc()) LWasmBuiltinTruncateFToInt32(
+ useRegister(opd), useFixed(ins->tls(), WasmTlsReg), maybeTemp),
+ ins);
+}
+
+void LIRGeneratorX86Shared::lowerTruncateDToInt32(MTruncateToInt32* ins) {
+ MDefinition* opd = ins->input();
+ MOZ_ASSERT(opd->type() == MIRType::Double);
+
+ LDefinition maybeTemp =
+ Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempDouble();
+ define(new (alloc()) LTruncateDToInt32(useRegister(opd), maybeTemp), ins);
+}
+
+void LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32* ins) {
+ MDefinition* opd = ins->input();
+ MOZ_ASSERT(opd->type() == MIRType::Float32);
+
+ LDefinition maybeTemp =
+ Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32();
+ define(new (alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins);
+}
+
+void LIRGeneratorX86Shared::lowerCompareExchangeTypedArrayElement(
+ MCompareExchangeTypedArrayElement* ins, bool useI386ByteRegisters) {
+ MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
+ MOZ_ASSERT(ins->arrayType() != Scalar::Float64);
+
+ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
+ MOZ_ASSERT(ins->index()->type() == MIRType::Int32);
+
+ const LUse elements = useRegister(ins->elements());
+ const LAllocation index = useRegisterOrConstant(ins->index());
+
+ // If the target is a floating register then we need a temp at the
+ // lower level; that temp must be eax.
+ //
+ // Otherwise the target (if used) is an integer register, which
+ // must be eax. If the target is not used the machine code will
+ // still clobber eax, so just pretend it's used.
+ //
+ // oldval must be in a register.
+ //
+ // newval must be in a register. If the source is a byte array
+ // then newval must be a register that has a byte size: on x86
+ // this must be ebx, ecx, or edx (eax is taken for the output).
+ //
+ // Bug #1077036 describes some further optimization opportunities.
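+  //
+  // Sketch of the primitive the lowering ultimately targets (illustrative,
+  // not the verbatim output):
+  //   mov  oldval, %eax
+  //   lock cmpxchg newval, mem   ; if %eax == mem then mem <- newval
+  //   ; %eax ends up holding the value that was observed in memory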
+
+ bool fixedOutput = false;
+ LDefinition tempDef = LDefinition::BogusTemp();
+ LAllocation newval;
+ if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
+ tempDef = tempFixed(eax);
+ newval = useRegister(ins->newval());
+ } else {
+ fixedOutput = true;
+ if (useI386ByteRegisters && ins->isByteArray()) {
+ newval = useFixed(ins->newval(), ebx);
+ } else {
+ newval = useRegister(ins->newval());
+ }
+ }
+
+ const LAllocation oldval = useRegister(ins->oldval());
+
+ LCompareExchangeTypedArrayElement* lir =
+ new (alloc()) LCompareExchangeTypedArrayElement(elements, index, oldval,
+ newval, tempDef);
+
+ if (fixedOutput) {
+ defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+ } else {
+ define(lir, ins);
+ }
+}
+
+void LIRGeneratorX86Shared::lowerAtomicExchangeTypedArrayElement(
+ MAtomicExchangeTypedArrayElement* ins, bool useI386ByteRegisters) {
+ MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32);
+
+ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
+ MOZ_ASSERT(ins->index()->type() == MIRType::Int32);
+
+ const LUse elements = useRegister(ins->elements());
+ const LAllocation index = useRegisterOrConstant(ins->index());
+ const LAllocation value = useRegister(ins->value());
+
+ // The underlying instruction is XCHG, which can operate on any
+ // register.
+ //
+ // If the target is a floating register (for Uint32) then we need
+ // a temp into which to exchange.
+ //
+ // If the source is a byte array then we need a register that has
+ // a byte size; in this case -- on x86 only -- pin the output to
+ // an appropriate register and use that as a temp in the back-end.
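+  //
+  // Roughly (illustrative only):
+  //   xchg value, mem   ; xchg with a memory operand is implicitly locked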
+
+ LDefinition tempDef = LDefinition::BogusTemp();
+ if (ins->arrayType() == Scalar::Uint32) {
+ MOZ_ASSERT(ins->type() == MIRType::Double);
+ tempDef = temp();
+ }
+
+ LAtomicExchangeTypedArrayElement* lir = new (alloc())
+ LAtomicExchangeTypedArrayElement(elements, index, value, tempDef);
+
+ if (useI386ByteRegisters && ins->isByteArray()) {
+ defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+ } else {
+ define(lir, ins);
+ }
+}
+
+void LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(
+ MAtomicTypedArrayElementBinop* ins, bool useI386ByteRegisters) {
+ MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
+ MOZ_ASSERT(ins->arrayType() != Scalar::Float32);
+ MOZ_ASSERT(ins->arrayType() != Scalar::Float64);
+
+ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
+ MOZ_ASSERT(ins->index()->type() == MIRType::Int32);
+
+ const LUse elements = useRegister(ins->elements());
+ const LAllocation index = useRegisterOrConstant(ins->index());
+
+ // Case 1: the result of the operation is not used.
+ //
+ // We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
+ // LOCK OR, or LOCK XOR. We can do this even for the Uint32 case.
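+  //
+  // For example, a fetch-less add lowers to something like
+  //   lock addl value, mem
+  // (a sketch; the exact form depends on the element size).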
+
+ if (!ins->hasUses()) {
+ LAllocation value;
+ if (useI386ByteRegisters && ins->isByteArray() &&
+ !ins->value()->isConstant()) {
+ value = useFixed(ins->value(), ebx);
+ } else {
+ value = useRegisterOrConstant(ins->value());
+ }
+
+ LAtomicTypedArrayElementBinopForEffect* lir = new (alloc())
+ LAtomicTypedArrayElementBinopForEffect(elements, index, value);
+
+ add(lir, ins);
+ return;
+ }
+
+ // Case 2: the result of the operation is used.
+ //
+ // For ADD and SUB we'll use XADD:
+ //
+ // movl src, output
+ // lock xaddl output, mem
+ //
+ // For the 8-bit variants XADD needs a byte register for the output.
+ //
+ // For AND/OR/XOR we need to use a CMPXCHG loop:
+ //
+ // movl *mem, eax
+ // L: mov eax, temp
+ // andl src, temp
+ // lock cmpxchg temp, mem ; reads eax also
+ // jnz L
+ // ; result in eax
+ //
+  //    Note the placement of L: cmpxchg will update eax with *mem if
+  //    *mem does not have the expected value, so reloading it at the
+  //    top of the loop would be redundant.
+ //
+ // If the array is not a uint32 array then:
+ // - eax should be the output (one result of the cmpxchg)
+ // - there is a temp, which must have a byte register if
+  //      the array has 1-byte elements
+ //
+ // If the array is a uint32 array then:
+ // - eax is the first temp
+ // - we also need a second temp
+ //
+ // There are optimization opportunities:
+ // - better register allocation in the x86 8-bit case, Bug #1077036.
+
+ bool bitOp = !(ins->operation() == AtomicFetchAddOp ||
+ ins->operation() == AtomicFetchSubOp);
+ bool fixedOutput = true;
+ bool reuseInput = false;
+ LDefinition tempDef1 = LDefinition::BogusTemp();
+ LDefinition tempDef2 = LDefinition::BogusTemp();
+ LAllocation value;
+
+ if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
+ value = useRegisterOrConstant(ins->value());
+ fixedOutput = false;
+ if (bitOp) {
+ tempDef1 = tempFixed(eax);
+ tempDef2 = temp();
+ } else {
+ tempDef1 = temp();
+ }
+ } else if (useI386ByteRegisters && ins->isByteArray()) {
+ if (ins->value()->isConstant()) {
+ value = useRegisterOrConstant(ins->value());
+ } else {
+ value = useFixed(ins->value(), ebx);
+ }
+ if (bitOp) {
+ tempDef1 = tempFixed(ecx);
+ }
+ } else if (bitOp) {
+ value = useRegisterOrConstant(ins->value());
+ tempDef1 = temp();
+ } else if (ins->value()->isConstant()) {
+ fixedOutput = false;
+ value = useRegisterOrConstant(ins->value());
+ } else {
+ fixedOutput = false;
+ reuseInput = true;
+ value = useRegisterAtStart(ins->value());
+ }
+
+ LAtomicTypedArrayElementBinop* lir = new (alloc())
+ LAtomicTypedArrayElementBinop(elements, index, value, tempDef1, tempDef2);
+
+ if (fixedOutput) {
+ defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
+ } else if (reuseInput) {
+ defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp);
+ } else {
+ define(lir, ins);
+ }
+}
+
+void LIRGenerator::visitCopySign(MCopySign* ins) {
+ MDefinition* lhs = ins->lhs();
+ MDefinition* rhs = ins->rhs();
+
+ MOZ_ASSERT(IsFloatingPointType(lhs->type()));
+ MOZ_ASSERT(lhs->type() == rhs->type());
+ MOZ_ASSERT(lhs->type() == ins->type());
+
+ LInstructionHelper<1, 2, 2>* lir;
+ if (lhs->type() == MIRType::Double) {
+ lir = new (alloc()) LCopySignD();
+ } else {
+ lir = new (alloc()) LCopySignF();
+ }
+
+ // As lowerForFPU, but we want rhs to be in a FP register too.
+ lir->setOperand(0, useRegisterAtStart(lhs));
+ if (!Assembler::HasAVX()) {
+ lir->setOperand(1, lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs));
+ defineReuseInput(lir, ins, 0);
+ } else {
+ lir->setOperand(1, useRegisterAtStart(rhs));
+ define(lir, ins);
+ }
+}
+
+#ifdef ENABLE_WASM_SIMD
+
+// These lowerings are really x86-shared but some Masm APIs are not yet
+// available on x86.
+
+// Ternary and binary operators require the dest register to be the same as
+// their first input register, leading to a pattern of useRegisterAtStart +
+// defineReuseInput.
+
+void LIRGenerator::visitWasmBitselectSimd128(MWasmBitselectSimd128* ins) {
+ MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->rhs()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->control()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+  // Enforcing lhs == output avoids one setup move. We would also like to
+  // enforce merging the control with the temp (with
+  // useRegisterAtStart(control) and tempCopy()), but the register allocator
+  // currently ignores those constraints.
+
+ auto* lir = new (alloc()) LWasmBitselectSimd128(
+ useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()),
+ useRegister(ins->control()), tempSimd128());
+ defineReuseInput(lir, ins, LWasmBitselectSimd128::LhsDest);
+}
+
+void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
+ MDefinition* lhs = ins->lhs();
+ MDefinition* rhs = ins->rhs();
+ wasm::SimdOp op = ins->simdOp();
+
+ MOZ_ASSERT(lhs->type() == MIRType::Simd128);
+ MOZ_ASSERT(rhs->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ if (ins->isCommutative()) {
+ ReorderCommutative(&lhs, &rhs, ins);
+ }
+
+  // Swap operands and change the operation if necessary; these are all
+  // x86/x64-dependent transformations. Except where noted, this is about
+  // avoiding unnecessary moves and fixups in the code generator macros.
+ bool swap = false;
+ switch (op) {
+ case wasm::SimdOp::V128AndNot: {
+ // Code generation requires the operands to be reversed.
+ swap = true;
+ break;
+ }
+ case wasm::SimdOp::I8x16LtS: {
+ swap = true;
+ op = wasm::SimdOp::I8x16GtS;
+ break;
+ }
+ case wasm::SimdOp::I8x16GeS: {
+ swap = true;
+ op = wasm::SimdOp::I8x16LeS;
+ break;
+ }
+ case wasm::SimdOp::I16x8LtS: {
+ swap = true;
+ op = wasm::SimdOp::I16x8GtS;
+ break;
+ }
+ case wasm::SimdOp::I16x8GeS: {
+ swap = true;
+ op = wasm::SimdOp::I16x8LeS;
+ break;
+ }
+ case wasm::SimdOp::I32x4LtS: {
+ swap = true;
+ op = wasm::SimdOp::I32x4GtS;
+ break;
+ }
+ case wasm::SimdOp::I32x4GeS: {
+ swap = true;
+ op = wasm::SimdOp::I32x4LeS;
+ break;
+ }
+ case wasm::SimdOp::F32x4Gt: {
+ swap = true;
+ op = wasm::SimdOp::F32x4Lt;
+ break;
+ }
+ case wasm::SimdOp::F32x4Ge: {
+ swap = true;
+ op = wasm::SimdOp::F32x4Le;
+ break;
+ }
+ case wasm::SimdOp::F64x2Gt: {
+ swap = true;
+ op = wasm::SimdOp::F64x2Lt;
+ break;
+ }
+ case wasm::SimdOp::F64x2Ge: {
+ swap = true;
+ op = wasm::SimdOp::F64x2Le;
+ break;
+ }
+ case wasm::SimdOp::F32x4PMin:
+ case wasm::SimdOp::F32x4PMax:
+ case wasm::SimdOp::F64x2PMin:
+ case wasm::SimdOp::F64x2PMax: {
+      // Code generation requires the operands to be reversed (the rhs is the
+      // output register).
+ swap = true;
+ break;
+ }
+ default:
+ break;
+ }
+ if (swap) {
+ MDefinition* tmp = lhs;
+ lhs = rhs;
+ rhs = tmp;
+ }
+
+ // Allocate temp registers
+ LDefinition tempReg0 = LDefinition::BogusTemp();
+ LDefinition tempReg1 = LDefinition::BogusTemp();
+ switch (op) {
+ case wasm::SimdOp::I64x2Mul:
+ case wasm::SimdOp::V8x16Swizzle:
+ tempReg0 = tempSimd128();
+ break;
+ case wasm::SimdOp::F32x4Min:
+ case wasm::SimdOp::F32x4Max:
+ case wasm::SimdOp::F64x2Min:
+ case wasm::SimdOp::F64x2Max:
+ case wasm::SimdOp::I8x16LtU:
+ case wasm::SimdOp::I8x16GtU:
+ case wasm::SimdOp::I8x16LeU:
+ case wasm::SimdOp::I8x16GeU:
+ case wasm::SimdOp::I16x8LtU:
+ case wasm::SimdOp::I16x8GtU:
+ case wasm::SimdOp::I16x8LeU:
+ case wasm::SimdOp::I16x8GeU:
+ case wasm::SimdOp::I32x4LtU:
+ case wasm::SimdOp::I32x4GtU:
+ case wasm::SimdOp::I32x4LeU:
+ case wasm::SimdOp::I32x4GeU:
+ tempReg0 = tempSimd128();
+ tempReg1 = tempSimd128();
+ break;
+ default:
+ break;
+ }
+
+  // For binary ops, the Masm API is usually (rhs, lhsDest) and requires
+  // AtStart+ReuseInput for the lhs.
+  //
+  // The rhs is tricky due to register allocator restrictions:
+  // - if lhs == rhs and lhs is AtStart then rhs must be AtStart too
+  // - if lhs != rhs and lhs is AtStart then rhs must not be AtStart;
+  //   this appears to have something to do with the risk of the rhs
+  //   being clobbered. Anyway it doesn't matter much, since the
+  //   liveness of rhs will not prevent the lhs register from being
+  //   reused for the output.
+  //
+  // For a few ops, the API is actually (rhsDest, lhs) and the rules are the
+  // same but reversed. We swapped operands above; they will be swapped
+  // again in the code generator to emit the right code.
+
+ LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
+ LAllocation rhsAlloc =
+ lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
+ auto* lir = new (alloc())
+ LWasmBinarySimd128(op, lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
+ defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
+}
+
+bool MWasmBinarySimd128::specializeForConstantRhs() {
+  // The order generally follows MacroAssembler.h.
+ switch (simdOp()) {
+ // Operations implemented by a single native instruction where it is
+ // plausible that the rhs (after commutation if available) could be a
+ // constant.
+ //
+ // Swizzle is not here because it was handled earlier in the pipeline.
+ //
+ // Integer compares >= and < are not here because they are not supported in
+ // the hardware.
+ //
+ // Floating compares are not here because our patching machinery can't
+ // handle them yet.
+ //
+ // Floating-point min and max (including pmin and pmax) are not here because
+ // they are not straightforward to implement.
+ case wasm::SimdOp::I8x16Add:
+ case wasm::SimdOp::I16x8Add:
+ case wasm::SimdOp::I32x4Add:
+ case wasm::SimdOp::I64x2Add:
+ case wasm::SimdOp::I8x16Sub:
+ case wasm::SimdOp::I16x8Sub:
+ case wasm::SimdOp::I32x4Sub:
+ case wasm::SimdOp::I64x2Sub:
+ case wasm::SimdOp::I16x8Mul:
+ case wasm::SimdOp::I32x4Mul:
+ case wasm::SimdOp::I8x16AddSaturateS:
+ case wasm::SimdOp::I8x16AddSaturateU:
+ case wasm::SimdOp::I16x8AddSaturateS:
+ case wasm::SimdOp::I16x8AddSaturateU:
+ case wasm::SimdOp::I8x16SubSaturateS:
+ case wasm::SimdOp::I8x16SubSaturateU:
+ case wasm::SimdOp::I16x8SubSaturateS:
+ case wasm::SimdOp::I16x8SubSaturateU:
+ case wasm::SimdOp::I8x16MinS:
+ case wasm::SimdOp::I8x16MinU:
+ case wasm::SimdOp::I16x8MinS:
+ case wasm::SimdOp::I16x8MinU:
+ case wasm::SimdOp::I32x4MinS:
+ case wasm::SimdOp::I32x4MinU:
+ case wasm::SimdOp::I8x16MaxS:
+ case wasm::SimdOp::I8x16MaxU:
+ case wasm::SimdOp::I16x8MaxS:
+ case wasm::SimdOp::I16x8MaxU:
+ case wasm::SimdOp::I32x4MaxS:
+ case wasm::SimdOp::I32x4MaxU:
+ case wasm::SimdOp::V128And:
+ case wasm::SimdOp::V128Or:
+ case wasm::SimdOp::V128Xor:
+ case wasm::SimdOp::I8x16Eq:
+ case wasm::SimdOp::I8x16Ne:
+ case wasm::SimdOp::I8x16GtS:
+ case wasm::SimdOp::I8x16LeS:
+ case wasm::SimdOp::I16x8Eq:
+ case wasm::SimdOp::I16x8Ne:
+ case wasm::SimdOp::I16x8GtS:
+ case wasm::SimdOp::I16x8LeS:
+ case wasm::SimdOp::I32x4Eq:
+ case wasm::SimdOp::I32x4Ne:
+ case wasm::SimdOp::I32x4GtS:
+ case wasm::SimdOp::I32x4LeS:
+ case wasm::SimdOp::F32x4Eq:
+ case wasm::SimdOp::F32x4Ne:
+ case wasm::SimdOp::F32x4Lt:
+ case wasm::SimdOp::F32x4Le:
+ case wasm::SimdOp::F64x2Eq:
+ case wasm::SimdOp::F64x2Ne:
+ case wasm::SimdOp::F64x2Lt:
+ case wasm::SimdOp::F64x2Le:
+ case wasm::SimdOp::I32x4DotSI16x8:
+ case wasm::SimdOp::F32x4Add:
+ case wasm::SimdOp::F64x2Add:
+ case wasm::SimdOp::F32x4Sub:
+ case wasm::SimdOp::F64x2Sub:
+ case wasm::SimdOp::F32x4Div:
+ case wasm::SimdOp::F64x2Div:
+ case wasm::SimdOp::F32x4Mul:
+ case wasm::SimdOp::F64x2Mul:
+ case wasm::SimdOp::I8x16NarrowSI16x8:
+ case wasm::SimdOp::I8x16NarrowUI16x8:
+ case wasm::SimdOp::I16x8NarrowSI32x4:
+ case wasm::SimdOp::I16x8NarrowUI32x4:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void LIRGenerator::visitWasmBinarySimd128WithConstant(
+ MWasmBinarySimd128WithConstant* ins) {
+ MDefinition* lhs = ins->lhs();
+
+ MOZ_ASSERT(lhs->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+  // It is always beneficial to reuse the lhs register here; see the
+  // discussion in visitWasmBinarySimd128() and the code in
+  // specializeForConstantRhs().
+
+ LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
+ auto* lir =
+ new (alloc()) LWasmBinarySimd128WithConstant(lhsDestAlloc, ins->rhs());
+ defineReuseInput(lir, ins, LWasmBinarySimd128WithConstant::LhsDest);
+}
+
+void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
+ MDefinition* lhs = ins->lhs();
+ MDefinition* rhs = ins->rhs();
+
+ MOZ_ASSERT(lhs->type() == MIRType::Simd128);
+ MOZ_ASSERT(rhs->type() == MIRType::Int32);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ if (rhs->isConstant()) {
+ LDefinition temp = LDefinition::BogusTemp();
+ int32_t shiftCount = rhs->toConstant()->toInt32();
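+    // Wasm defines shifts to use the count modulo the lane width, hence the
+    // masking below.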
+ switch (ins->simdOp()) {
+ case wasm::SimdOp::I8x16Shl:
+ case wasm::SimdOp::I8x16ShrU:
+ shiftCount &= 7;
+ break;
+ case wasm::SimdOp::I8x16ShrS:
+ shiftCount &= 7;
+ temp = tempSimd128();
+ break;
+ case wasm::SimdOp::I16x8Shl:
+ case wasm::SimdOp::I16x8ShrU:
+ case wasm::SimdOp::I16x8ShrS:
+ shiftCount &= 15;
+ break;
+ case wasm::SimdOp::I32x4Shl:
+ case wasm::SimdOp::I32x4ShrU:
+ case wasm::SimdOp::I32x4ShrS:
+ shiftCount &= 31;
+ break;
+ case wasm::SimdOp::I64x2Shl:
+ case wasm::SimdOp::I64x2ShrU:
+ case wasm::SimdOp::I64x2ShrS:
+ shiftCount &= 63;
+ break;
+ default:
+ MOZ_CRASH("Unexpected shift operation");
+ }
+# ifdef DEBUG
+ js::wasm::ReportSimdAnalysis("shift -> constant shift");
+# endif
+ // Almost always beneficial, and never detrimental, to reuse the input if
+ // possible.
+ auto* lir = new (alloc())
+ LWasmConstantShiftSimd128(useRegisterAtStart(lhs), temp, shiftCount);
+ defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
+ return;
+ }
+
+# ifdef DEBUG
+ js::wasm::ReportSimdAnalysis("shift -> variable shift");
+# endif
+
+ LDefinition tempReg0 = LDefinition::BogusTemp();
+ LDefinition tempReg1 = LDefinition::BogusTemp();
+ switch (ins->simdOp()) {
+ case wasm::SimdOp::I8x16Shl:
+ case wasm::SimdOp::I8x16ShrS:
+ case wasm::SimdOp::I8x16ShrU:
+ tempReg0 = temp();
+ tempReg1 = tempSimd128();
+ break;
+ default:
+ tempReg0 = temp();
+ break;
+ }
+
+ // Reusing the input if possible is never detrimental.
+ LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
+ LAllocation rhsAlloc = useRegisterAtStart(rhs);
+ auto* lir = new (alloc())
+ LWasmVariableShiftSimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
+ defineReuseInput(lir, ins, LWasmVariableShiftSimd128::LhsDest);
+}
+
+void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
+ MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->rhs()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ Shuffle s = AnalyzeShuffle(ins);
+# ifdef DEBUG
+ ReportShuffleSpecialization(s);
+# endif
+ switch (s.opd) {
+ case Shuffle::Operand::LEFT:
+ case Shuffle::Operand::RIGHT: {
+ LAllocation src;
+      // All permute operators currently favor reusing the input register, so
+      // we're not currently exercising the code paths below that do not
+      // reuse. Those paths have been exercised in the past, however, and are
+      // believed to be correct.
+ bool useAtStartAndReuse = false;
+ switch (*s.permuteOp) {
+ case LWasmPermuteSimd128::MOVE:
+ case LWasmPermuteSimd128::BROADCAST_8x16:
+ case LWasmPermuteSimd128::BROADCAST_16x8:
+ case LWasmPermuteSimd128::PERMUTE_8x16:
+ case LWasmPermuteSimd128::PERMUTE_16x8:
+ case LWasmPermuteSimd128::PERMUTE_32x4:
+ case LWasmPermuteSimd128::ROTATE_RIGHT_8x16:
+ case LWasmPermuteSimd128::SHIFT_LEFT_8x16:
+ case LWasmPermuteSimd128::SHIFT_RIGHT_8x16:
+ useAtStartAndReuse = true;
+ break;
+ default:
+ MOZ_CRASH("Unexpected operator");
+ }
+ if (s.opd == Shuffle::Operand::LEFT) {
+ if (useAtStartAndReuse) {
+ src = useRegisterAtStart(ins->lhs());
+ } else {
+ src = useRegister(ins->lhs());
+ }
+ } else {
+ if (useAtStartAndReuse) {
+ src = useRegisterAtStart(ins->rhs());
+ } else {
+ src = useRegister(ins->rhs());
+ }
+ }
+ auto* lir =
+ new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control);
+ if (useAtStartAndReuse) {
+ defineReuseInput(lir, ins, LWasmPermuteSimd128::Src);
+ } else {
+ define(lir, ins);
+ }
+ break;
+ }
+ case Shuffle::Operand::BOTH:
+ case Shuffle::Operand::BOTH_SWAPPED: {
+ LDefinition temp = LDefinition::BogusTemp();
+ switch (*s.shuffleOp) {
+ case LWasmShuffleSimd128::BLEND_8x16:
+ temp = tempSimd128();
+ break;
+ default:
+ break;
+ }
+ LAllocation lhs;
+ LAllocation rhs;
+ if (s.opd == Shuffle::Operand::BOTH) {
+ lhs = useRegisterAtStart(ins->lhs());
+ rhs = useRegister(ins->rhs());
+ } else {
+ lhs = useRegisterAtStart(ins->rhs());
+ rhs = useRegister(ins->lhs());
+ }
+ auto* lir = new (alloc())
+ LWasmShuffleSimd128(lhs, rhs, temp, *s.shuffleOp, s.control);
+ defineReuseInput(lir, ins, LWasmShuffleSimd128::LhsDest);
+ break;
+ }
+ }
+}
+
+void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
+ MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ // The Masm API is (rhs, lhsDest) and requires AtStart+ReuseInput for the lhs.
+ // For type reasons, the rhs will never be the same as the lhs and is
+ // therefore a plain Use.
+
+ if (ins->rhs()->type() == MIRType::Int64) {
+ auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128(
+ useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs()));
+ defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsDest);
+ } else {
+ auto* lir = new (alloc()) LWasmReplaceLaneSimd128(
+ useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()));
+ defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsDest);
+ }
+}
+
+void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) {
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ switch (ins->input()->type()) {
+ case MIRType::Int64: {
+ // 64-bit integer splats.
+ // Load-and-(sign|zero)extend.
+ auto* lir = new (alloc())
+ LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input()));
+ define(lir, ins);
+ break;
+ }
+ case MIRType::Float32:
+ case MIRType::Double: {
+ // Floating-point splats.
+ // Ideally we save a move on SSE systems by reusing the input register,
+ // but since the input and output register types differ, we can't.
+ auto* lir =
+ new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
+ define(lir, ins);
+ break;
+ }
+ default: {
+ // 32-bit integer splats.
+ auto* lir =
+ new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
+ define(lir, ins);
+ break;
+ }
+ }
+}
+
+void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
+ MOZ_ASSERT(ins->input()->type() == MIRType::Simd128);
+ MOZ_ASSERT(ins->type() == MIRType::Simd128);
+
+ bool useAtStart = false;
+ bool reuseInput = false;
+ LDefinition tempReg = LDefinition::BogusTemp();
+ switch (ins->simdOp()) {
+ case wasm::SimdOp::I8x16Neg:
+ case wasm::SimdOp::I16x8Neg:
+ case wasm::SimdOp::I32x4Neg:
+ case wasm::SimdOp::I64x2Neg:
+ // Prefer src != dest to avoid an unconditional src->temp move.
+ MOZ_ASSERT(!useAtStart && !reuseInput);
+ break;
+ case wasm::SimdOp::F32x4Neg:
+ case wasm::SimdOp::F64x2Neg:
+ case wasm::SimdOp::F32x4Abs:
+ case wasm::SimdOp::F64x2Abs:
+ case wasm::SimdOp::V128Not:
+ case wasm::SimdOp::F32x4Sqrt:
+ case wasm::SimdOp::F64x2Sqrt:
+ case wasm::SimdOp::I8x16Abs:
+ case wasm::SimdOp::I16x8Abs:
+ case wasm::SimdOp::I32x4Abs:
+ case wasm::SimdOp::I32x4TruncSSatF32x4:
+ case wasm::SimdOp::F32x4ConvertUI32x4:
+ // Prefer src == dest to avoid an unconditional src->dest move.
+ useAtStart = true;
+ reuseInput = true;
+ break;
+ case wasm::SimdOp::I32x4TruncUSatF32x4:
+ tempReg = tempSimd128();
+ // Prefer src == dest to avoid an unconditional src->dest move.
+ useAtStart = true;
+ reuseInput = true;
+ break;
+ case wasm::SimdOp::I16x8WidenLowSI8x16:
+ case wasm::SimdOp::I16x8WidenHighSI8x16:
+ case wasm::SimdOp::I16x8WidenLowUI8x16:
+ case wasm::SimdOp::I16x8WidenHighUI8x16:
+ case wasm::SimdOp::I32x4WidenLowSI16x8:
+ case wasm::SimdOp::I32x4WidenHighSI16x8:
+ case wasm::SimdOp::I32x4WidenLowUI16x8:
+ case wasm::SimdOp::I32x4WidenHighUI16x8:
+ case wasm::SimdOp::F32x4ConvertSI32x4:
+ case wasm::SimdOp::F32x4Ceil:
+ case wasm::SimdOp::F32x4Floor:
+ case wasm::SimdOp::F32x4Trunc:
+ case wasm::SimdOp::F32x4Nearest:
+ case wasm::SimdOp::F64x2Ceil:
+ case wasm::SimdOp::F64x2Floor:
+ case wasm::SimdOp::F64x2Trunc:
+ case wasm::SimdOp::F64x2Nearest:
+ // Prefer src == dest to exert the lowest register pressure on the
+ // surrounding code.
+ useAtStart = true;
+ MOZ_ASSERT(!reuseInput);
+ break;
+ default:
+ MOZ_CRASH("Unary SimdOp not implemented");
+ }
+
+ LUse inputUse =
+ useAtStart ? useRegisterAtStart(ins->input()) : useRegister(ins->input());
+ LWasmUnarySimd128* lir = new (alloc()) LWasmUnarySimd128(inputUse, tempReg);
+ if (reuseInput) {
+ defineReuseInput(lir, ins, LWasmUnarySimd128::Src);
+ } else {
+ define(lir, ins);
+ }
+}
+
+bool LIRGeneratorX86Shared::canFoldReduceSimd128AndBranch(wasm::SimdOp op) {
+ switch (op) {
+ case wasm::SimdOp::I8x16AnyTrue:
+ case wasm::SimdOp::I16x8AnyTrue:
+ case wasm::SimdOp::I32x4AnyTrue:
+ case wasm::SimdOp::I8x16AllTrue:
+ case wasm::SimdOp::I16x8AllTrue:
+ case wasm::SimdOp::I32x4AllTrue:
+ case wasm::SimdOp::I16x8Bitmask:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool LIRGeneratorX86Shared::canEmitWasmReduceSimd128AtUses(
+ MWasmReduceSimd128* ins) {
+ if (!ins->canEmitAtUses()) {
+ return false;
+ }
+ // Only specific ops generating int32.
+ if (ins->type() != MIRType::Int32) {
+ return false;
+ }
+ if (!canFoldReduceSimd128AndBranch(ins->simdOp())) {
+ return false;
+ }
+ // If never used then defer (it will be removed).
+ MUseIterator iter(ins->usesBegin());
+ if (iter == ins->usesEnd()) {
+ return true;
+ }
+ // We require an MTest consumer.
+ MNode* node = iter->consumer();
+ if (!node->isDefinition() || !node->toDefinition()->isTest()) {
+ return false;
+ }
+ // Defer only if there's only one use.
+ iter++;
+ return iter == ins->usesEnd();
+}
+
+void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
+ if (canEmitWasmReduceSimd128AtUses(ins)) {
+ emitAtUses(ins);
+ return;
+ }
+
+ // Reductions (any_true, all_true, bitmask, extract_lane) uniformly prefer
+ // useRegisterAtStart:
+ //
+ // - In most cases, the input type differs from the output type, so there's no
+ // conflict and it doesn't really matter.
+ //
+ // - For extract_lane(0) on F32x4 and F64x2, input == output results in zero
+ // code being generated.
+ //
+ // - For extract_lane(k > 0) on F32x4 and F64x2, allowing the input register
+ // to be targeted lowers register pressure if it's the last use of the
+ // input.
+
+ if (ins->type() == MIRType::Int64) {
+ auto* lir = new (alloc())
+ LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input()));
+ defineInt64(lir, ins);
+ } else {
+ // Ideally we would reuse the input register for floating extract_lane if
+ // the lane is zero, but constraints in the register allocator require the
+ // input and output register types to be the same.
+ auto* lir =
+ new (alloc()) LWasmReduceSimd128(useRegisterAtStart(ins->input()));
+ define(lir, ins);
+ }
+}
+
+#endif // ENABLE_WASM_SIMD