Diffstat (limited to 'js/src/jit/arm64/MacroAssembler-arm64.cpp')
 js/src/jit/arm64/MacroAssembler-arm64.cpp | 3416 ++++++++++++++++++++++++++++
 1 file changed, 3416 insertions(+), 0 deletions(-)
diff --git a/js/src/jit/arm64/MacroAssembler-arm64.cpp b/js/src/jit/arm64/MacroAssembler-arm64.cpp
new file mode 100644
index 0000000000..a4aff730e6
--- /dev/null
+++ b/js/src/jit/arm64/MacroAssembler-arm64.cpp
@@ -0,0 +1,3416 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * vim: set ts=8 sts=2 et sw=2 tw=80:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "jit/arm64/MacroAssembler-arm64.h"
+
+#include "mozilla/MathAlgorithms.h"
+#include "mozilla/Maybe.h"
+
+#include "jsmath.h"
+
+#include "jit/arm64/MoveEmitter-arm64.h"
+#include "jit/arm64/SharedICRegisters-arm64.h"
+#include "jit/Bailouts.h"
+#include "jit/BaselineFrame.h"
+#include "jit/JitRuntime.h"
+#include "jit/MacroAssembler.h"
+#include "util/Memory.h"
+#include "vm/BigIntType.h"
+#include "vm/JitActivation.h" // js::jit::JitActivation
+#include "vm/JSContext.h"
+#include "vm/StringType.h"
+
+#include "jit/MacroAssembler-inl.h"
+
+namespace js {
+namespace jit {
+
+enum class Width { _32 = 32, _64 = 64 };
+
+static inline ARMRegister X(Register r) { return ARMRegister(r, 64); }
+
+static inline ARMRegister X(MacroAssembler& masm, RegisterOrSP r) {
+ return masm.toARMRegister(r, 64);
+}
+
+static inline ARMRegister W(Register r) { return ARMRegister(r, 32); }
+
+static inline ARMRegister R(Register r, Width w) {
+ return ARMRegister(r, unsigned(w));
+}
+
+void MacroAssemblerCompat::boxValue(JSValueType type, Register src,
+ Register dest) {
+#ifdef DEBUG
+ if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
+ Label upper32BitsZeroed;
+ movePtr(ImmWord(UINT32_MAX), dest);
+ asMasm().branchPtr(Assembler::BelowOrEqual, src, dest, &upper32BitsZeroed);
+ breakpoint();
+ bind(&upper32BitsZeroed);
+ }
+#endif
+ Orr(ARMRegister(dest, 64), ARMRegister(src, 64),
+ Operand(ImmShiftedTag(type).value));
+}
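+
+// For example, using only the names above: boxValue(JSVAL_TYPE_INT32, src,
+// dest) computes dest = src | ImmShiftedTag(JSVAL_TYPE_INT32).value, i.e.
+// the 32-bit payload is OR'd in below the shifted type tag. That is why the
+// DEBUG check above insists that the upper 32 bits of src are already zero:
+// any stray high bits would corrupt the tag.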
+
+#ifdef ENABLE_WASM_SIMD
+bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
+ switch (op) {
+ case wasm::SimdOp::I8x16Shl:
+ case wasm::SimdOp::I8x16ShrU:
+ case wasm::SimdOp::I8x16ShrS:
+ *mask = 7;
+ break;
+ case wasm::SimdOp::I16x8Shl:
+ case wasm::SimdOp::I16x8ShrU:
+ case wasm::SimdOp::I16x8ShrS:
+ *mask = 15;
+ break;
+ case wasm::SimdOp::I32x4Shl:
+ case wasm::SimdOp::I32x4ShrU:
+ case wasm::SimdOp::I32x4ShrS:
+ *mask = 31;
+ break;
+ case wasm::SimdOp::I64x2Shl:
+ case wasm::SimdOp::I64x2ShrU:
+ case wasm::SimdOp::I64x2ShrS:
+ *mask = 63;
+ break;
+ default:
+ MOZ_CRASH("Unexpected shift operation");
+ }
+ return true;
+}
+#endif
+
+void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
+ ARMRegister dest(output, 32);
+ Fcvtns(dest, ARMFPRegister(input, 64));
+
+ {
+ vixl::UseScratchRegisterScope temps(this);
+ const ARMRegister scratch32 = temps.AcquireW();
+
+ Mov(scratch32, Operand(0xff));
+ Cmp(dest, scratch32);
+ Csel(dest, dest, scratch32, LessThan);
+ }
+
+ Cmp(dest, Operand(0));
+ Csel(dest, dest, wzr, GreaterThan);
+}
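+
+// A worked example of the clamp above (values illustrative): Fcvtns rounds
+// to nearest, ties to even, so 254.5 -> 254 and 255.5 -> 256, the latter
+// then clamped to 255 by the first Csel; -3.2 -> -3, clamped to 0 by the
+// second; NaN converts to 0 and stays 0.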
+
+js::jit::MacroAssembler& MacroAssemblerCompat::asMasm() {
+ return *static_cast<js::jit::MacroAssembler*>(this);
+}
+
+const js::jit::MacroAssembler& MacroAssemblerCompat::asMasm() const {
+ return *static_cast<const js::jit::MacroAssembler*>(this);
+}
+
+vixl::MacroAssembler& MacroAssemblerCompat::asVIXL() {
+ return *static_cast<vixl::MacroAssembler*>(this);
+}
+
+const vixl::MacroAssembler& MacroAssemblerCompat::asVIXL() const {
+ return *static_cast<const vixl::MacroAssembler*>(this);
+}
+
+void MacroAssemblerCompat::mov(CodeLabel* label, Register dest) {
+ BufferOffset bo = movePatchablePtr(ImmWord(/* placeholder */ 0), dest);
+ label->patchAt()->bind(bo.getOffset());
+ label->setLinkMode(CodeLabel::MoveImmediate);
+}
+
+BufferOffset MacroAssemblerCompat::movePatchablePtr(ImmPtr ptr, Register dest) {
+ const size_t numInst = 1; // Inserting one load instruction.
+  // Every pool entry is 4 bytes; a 64-bit pointer occupies two entries.
+  const unsigned numPoolEntries = 2;
+ uint8_t* literalAddr = (uint8_t*)(&ptr.value); // TODO: Should be const.
+
+ // Scratch space for generating the load instruction.
+ //
+ // allocLiteralLoadEntry() will use InsertIndexIntoTag() to store a temporary
+ // index to the corresponding PoolEntry in the instruction itself.
+ //
+ // That index will be fixed up later when finishPool()
+ // walks over all marked loads and calls PatchConstantPoolLoad().
+ uint32_t instructionScratch = 0;
+
+ // Emit the instruction mask in the scratch space.
+ // The offset doesn't matter: it will be fixed up later.
+ vixl::Assembler::ldr((Instruction*)&instructionScratch, ARMRegister(dest, 64),
+ 0);
+
+ // Add the entry to the pool, fix up the LDR imm19 offset,
+ // and add the completed instruction to the buffer.
+ return allocLiteralLoadEntry(numInst, numPoolEntries,
+ (uint8_t*)&instructionScratch, literalAddr);
+}
+
+BufferOffset MacroAssemblerCompat::movePatchablePtr(ImmWord ptr,
+ Register dest) {
+ const size_t numInst = 1; // Inserting one load instruction.
+  // Every pool entry is 4 bytes; a 64-bit pointer occupies two entries.
+  const unsigned numPoolEntries = 2;
+ uint8_t* literalAddr = (uint8_t*)(&ptr.value);
+
+ // Scratch space for generating the load instruction.
+ //
+ // allocLiteralLoadEntry() will use InsertIndexIntoTag() to store a temporary
+ // index to the corresponding PoolEntry in the instruction itself.
+ //
+ // That index will be fixed up later when finishPool()
+ // walks over all marked loads and calls PatchConstantPoolLoad().
+ uint32_t instructionScratch = 0;
+
+ // Emit the instruction mask in the scratch space.
+ // The offset doesn't matter: it will be fixed up later.
+ vixl::Assembler::ldr((Instruction*)&instructionScratch, ARMRegister(dest, 64),
+ 0);
+
+ // Add the entry to the pool, fix up the LDR imm19 offset,
+ // and add the completed instruction to the buffer.
+ return allocLiteralLoadEntry(numInst, numPoolEntries,
+ (uint8_t*)&instructionScratch, literalAddr);
+}
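+
+// A sketch of what both movePatchablePtr overloads emit (pool placement is
+// decided later, when the assembler buffer flushes its constant pool):
+//
+//   ldr xD, pc+<imm19>   ; single pc-relative literal load; imm19 is fixed
+//                        ; up to reach the two 4-byte pool entries that
+//                        ; together hold the 64-bit pointer value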
+
+void MacroAssemblerCompat::loadPrivate(const Address& src, Register dest) {
+ loadPtr(src, dest);
+}
+
+void MacroAssemblerCompat::handleFailureWithHandlerTail(Label* profilerExitTail,
+ Label* bailoutTail) {
+ // Fail rather than silently create wrong code.
+ MOZ_RELEASE_ASSERT(GetStackPointer64().Is(PseudoStackPointer64));
+
+ // Reserve space for exception information.
+ int64_t size = (sizeof(ResumeFromException) + 7) & ~7;
+ Sub(PseudoStackPointer64, PseudoStackPointer64, Operand(size));
+ syncStackPtr();
+
+ MOZ_ASSERT(!x0.Is(PseudoStackPointer64));
+ Mov(x0, PseudoStackPointer64);
+
+ // Call the handler.
+ using Fn = void (*)(ResumeFromException* rfe);
+ asMasm().setupUnalignedABICall(r1);
+ asMasm().passABIArg(r0);
+ asMasm().callWithABI<Fn, HandleException>(
+ MoveOp::GENERAL, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
+
+ Label entryFrame;
+ Label catch_;
+ Label finally;
+ Label returnBaseline;
+ Label returnIon;
+ Label bailout;
+ Label wasm;
+ Label wasmCatch;
+
+ // Check the `asMasm` calls above didn't mess with the StackPointer identity.
+ MOZ_ASSERT(GetStackPointer64().Is(PseudoStackPointer64));
+
+ loadPtr(Address(PseudoStackPointer, ResumeFromException::offsetOfKind()), r0);
+ asMasm().branch32(Assembler::Equal, r0,
+ Imm32(ExceptionResumeKind::EntryFrame), &entryFrame);
+ asMasm().branch32(Assembler::Equal, r0, Imm32(ExceptionResumeKind::Catch),
+ &catch_);
+ asMasm().branch32(Assembler::Equal, r0, Imm32(ExceptionResumeKind::Finally),
+ &finally);
+ asMasm().branch32(Assembler::Equal, r0,
+ Imm32(ExceptionResumeKind::ForcedReturnBaseline),
+ &returnBaseline);
+ asMasm().branch32(Assembler::Equal, r0,
+ Imm32(ExceptionResumeKind::ForcedReturnIon), &returnIon);
+ asMasm().branch32(Assembler::Equal, r0, Imm32(ExceptionResumeKind::Bailout),
+ &bailout);
+ asMasm().branch32(Assembler::Equal, r0, Imm32(ExceptionResumeKind::Wasm),
+ &wasm);
+ asMasm().branch32(Assembler::Equal, r0, Imm32(ExceptionResumeKind::WasmCatch),
+ &wasmCatch);
+
+ breakpoint(); // Invalid kind.
+
+ // No exception handler. Load the error value, restore state and return from
+ // the entry frame.
+ bind(&entryFrame);
+ moveValue(MagicValue(JS_ION_ERROR), JSReturnOperand);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfFramePointer()),
+ FramePointer);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfStackPointer()),
+ PseudoStackPointer);
+
+ // `retn` does indeed sync the stack pointer, but before doing that it reads
+  // from the stack. Consequently, if we remove this call to syncStackPtr()
+ // then we take on the requirement to prove that the immediately preceding
+ // loadPtr produces a value for PSP which maintains the SP <= PSP invariant.
+ // That's a proof burden we don't want to take on. In general it would be
+ // good to move (at some time in the future, not now) to a world where
+ // *every* assignment to PSP or SP is followed immediately by a copy into
+ // the other register. That would make all required correctness proofs
+ // trivial in the sense that it requires only local inspection of code
+ // immediately following (dominated by) any such assignment.
+ syncStackPtr();
+ retn(Imm32(1 * sizeof(void*))); // Pop from stack and return.
+
+ // If we found a catch handler, this must be a baseline frame. Restore state
+ // and jump to the catch block.
+ bind(&catch_);
+ loadPtr(Address(PseudoStackPointer, ResumeFromException::offsetOfTarget()),
+ r0);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfFramePointer()),
+ FramePointer);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfStackPointer()),
+ PseudoStackPointer);
+ syncStackPtr();
+ Br(x0);
+
+ // If we found a finally block, this must be a baseline frame. Push two
+ // values expected by the finally block: the exception and BooleanValue(true).
+ bind(&finally);
+ ARMRegister exception = x1;
+ Ldr(exception, MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfException()));
+ Ldr(x0,
+ MemOperand(PseudoStackPointer64, ResumeFromException::offsetOfTarget()));
+ Ldr(ARMRegister(FramePointer, 64),
+ MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfFramePointer()));
+ Ldr(PseudoStackPointer64,
+ MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfStackPointer()));
+ syncStackPtr();
+ push(exception);
+ pushValue(BooleanValue(true));
+ Br(x0);
+
+ // Return BaselineFrame->returnValue() to the caller.
+ // Used in debug mode and for GeneratorReturn.
+ Label profilingInstrumentation;
+ bind(&returnBaseline);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfFramePointer()),
+ FramePointer);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfStackPointer()),
+ PseudoStackPointer);
+ // See comment further up beginning "`retn` does indeed sync the stack
+ // pointer". That comment applies here too.
+ syncStackPtr();
+ loadValue(Address(FramePointer, BaselineFrame::reverseOffsetOfReturnValue()),
+ JSReturnOperand);
+ jump(&profilingInstrumentation);
+
+ // Return the given value to the caller.
+ bind(&returnIon);
+ loadValue(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfException()),
+ JSReturnOperand);
+ loadPtr(
+ Address(PseudoStackPointer, offsetof(ResumeFromException, framePointer)),
+ FramePointer);
+ loadPtr(
+ Address(PseudoStackPointer, offsetof(ResumeFromException, stackPointer)),
+ PseudoStackPointer);
+ syncStackPtr();
+
+ // If profiling is enabled, then update the lastProfilingFrame to refer to
+ // caller frame before returning. This code is shared by ForcedReturnIon
+ // and ForcedReturnBaseline.
+ bind(&profilingInstrumentation);
+ {
+ Label skipProfilingInstrumentation;
+ AbsoluteAddress addressOfEnabled(
+ asMasm().runtime()->geckoProfiler().addressOfEnabled());
+ asMasm().branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
+ &skipProfilingInstrumentation);
+ jump(profilerExitTail);
+ bind(&skipProfilingInstrumentation);
+ }
+
+ movePtr(FramePointer, PseudoStackPointer);
+ syncStackPtr();
+ vixl::MacroAssembler::Pop(ARMRegister(FramePointer, 64));
+
+ vixl::MacroAssembler::Pop(vixl::lr);
+ syncStackPtr();
+ vixl::MacroAssembler::Ret(vixl::lr);
+
+ // If we are bailing out to baseline to handle an exception, jump to the
+ // bailout tail stub. Load 1 (true) in x0 (ReturnReg) to indicate success.
+ bind(&bailout);
+ Ldr(x2, MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfBailoutInfo()));
+ Ldr(PseudoStackPointer64,
+ MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfStackPointer()));
+ syncStackPtr();
+ Mov(x0, 1);
+ jump(bailoutTail);
+
+ // If we are throwing and the innermost frame was a wasm frame, reset SP and
+ // FP; SP is pointing to the unwound return address to the wasm entry, so
+ // we can just ret().
+ bind(&wasm);
+ Ldr(x29, MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfFramePointer()));
+ Ldr(PseudoStackPointer64,
+ MemOperand(PseudoStackPointer64,
+ ResumeFromException::offsetOfStackPointer()));
+ syncStackPtr();
+ Mov(x23, int64_t(wasm::FailInstanceReg));
+ ret();
+
+ // Found a wasm catch handler, restore state and jump to it.
+ bind(&wasmCatch);
+ loadPtr(Address(PseudoStackPointer, ResumeFromException::offsetOfTarget()),
+ r0);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfFramePointer()),
+ r29);
+ loadPtr(
+ Address(PseudoStackPointer, ResumeFromException::offsetOfStackPointer()),
+ PseudoStackPointer);
+ syncStackPtr();
+ Br(x0);
+
+ MOZ_ASSERT(GetStackPointer64().Is(PseudoStackPointer64));
+}
+
+void MacroAssemblerCompat::profilerEnterFrame(Register framePtr,
+ Register scratch) {
+ asMasm().loadJSContext(scratch);
+ loadPtr(Address(scratch, offsetof(JSContext, profilingActivation_)), scratch);
+ storePtr(framePtr,
+ Address(scratch, JitActivation::offsetOfLastProfilingFrame()));
+ storePtr(ImmPtr(nullptr),
+ Address(scratch, JitActivation::offsetOfLastProfilingCallSite()));
+}
+
+void MacroAssemblerCompat::profilerExitFrame() {
+ jump(asMasm().runtime()->jitRuntime()->getProfilerExitFrameTail());
+}
+
+Assembler::Condition MacroAssemblerCompat::testStringTruthy(
+ bool truthy, const ValueOperand& value) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+ const ARMRegister scratch32(scratch, 32);
+ const ARMRegister scratch64(scratch, 64);
+
+ MOZ_ASSERT(value.valueReg() != scratch);
+
+ unboxString(value, scratch);
+ Ldr(scratch32, MemOperand(scratch64, JSString::offsetOfLength()));
+ Cmp(scratch32, Operand(0));
+ return truthy ? Condition::NonZero : Condition::Zero;
+}
+
+Assembler::Condition MacroAssemblerCompat::testBigIntTruthy(
+ bool truthy, const ValueOperand& value) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+
+ MOZ_ASSERT(value.valueReg() != scratch);
+
+ unboxBigInt(value, scratch);
+ load32(Address(scratch, BigInt::offsetOfDigitLength()), scratch);
+ cmp32(scratch, Imm32(0));
+ return truthy ? Condition::NonZero : Condition::Zero;
+}
+
+void MacroAssemblerCompat::breakpoint() {
+ // Note, other payloads are possible, but GDB is known to misinterpret them
+ // sometimes and iloop on the breakpoint instead of stopping properly.
+ Brk(0);
+}
+
+// Either `any` is valid or `sixtyfour` is valid. Return a 32-bit ARMRegister
+// in the former case and an ARMRegister of the requested size in the latter.
+
+static inline ARMRegister SelectGPReg(AnyRegister any, Register64 sixtyfour,
+ unsigned size = 64) {
+ MOZ_ASSERT(any.isValid() != (sixtyfour != Register64::Invalid()));
+
+ if (sixtyfour == Register64::Invalid()) {
+ return ARMRegister(any.gpr(), 32);
+ }
+
+ return ARMRegister(sixtyfour.reg, size);
+}
+
+// Assert that `sixtyfour` is invalid and then return an FP register from `any`
+// of the desired size.
+
+static inline ARMFPRegister SelectFPReg(AnyRegister any, Register64 sixtyfour,
+ unsigned size) {
+ MOZ_ASSERT(sixtyfour == Register64::Invalid());
+ return ARMFPRegister(any.fpu(), size);
+}
+
+void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
+ Register memoryBase_, Register ptr_,
+ AnyRegister outany, Register64 out64) {
+ uint32_t offset = access.offset();
+ MOZ_ASSERT(offset < asMasm().wasmMaxOffsetGuardLimit());
+
+ ARMRegister memoryBase(memoryBase_, 64);
+ ARMRegister ptr(ptr_, 64);
+ if (offset) {
+ vixl::UseScratchRegisterScope temps(this);
+ ARMRegister scratch = temps.AcquireX();
+ Add(scratch, ptr, Operand(offset));
+ MemOperand srcAddr(memoryBase, scratch);
+ wasmLoadImpl(access, srcAddr, outany, out64);
+ } else {
+ MemOperand srcAddr(memoryBase, ptr);
+ wasmLoadImpl(access, srcAddr, outany, out64);
+ }
+}
+
+void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
+ MemOperand srcAddr, AnyRegister outany,
+ Register64 out64) {
+ // Reg+Reg and Reg+SmallImm addressing is directly encodable in one Load
+ // instruction, hence we expect exactly one instruction to be emitted in the
+ // window.
+ int32_t instructionsExpected = 1;
+
+ // Splat and widen however require an additional instruction to be emitted
+ // after the load, so allow one more instruction in the window.
+ if (access.isSplatSimd128Load() || access.isWidenSimd128Load()) {
+ MOZ_ASSERT(access.type() == Scalar::Float64);
+ instructionsExpected++;
+ }
+
+ // NOTE: the generated code must match the assembly code in gen_load in
+ // GenerateAtomicOperations.py
+ asMasm().memoryBarrierBefore(access.sync());
+
+ {
+ // The AutoForbidPoolsAndNops asserts if we emit more than the expected
+ // number of instructions and thus ensures that the access metadata is
+ // emitted at the address of the Load.
+ AutoForbidPoolsAndNops afp(this, instructionsExpected);
+
+ append(access, asMasm().currentOffset());
+ switch (access.type()) {
+ case Scalar::Int8:
+ Ldrsb(SelectGPReg(outany, out64), srcAddr);
+ break;
+ case Scalar::Uint8:
+ Ldrb(SelectGPReg(outany, out64), srcAddr);
+ break;
+ case Scalar::Int16:
+ Ldrsh(SelectGPReg(outany, out64), srcAddr);
+ break;
+ case Scalar::Uint16:
+ Ldrh(SelectGPReg(outany, out64), srcAddr);
+ break;
+ case Scalar::Int32:
+ if (out64 != Register64::Invalid()) {
+ Ldrsw(SelectGPReg(outany, out64), srcAddr);
+ } else {
+ Ldr(SelectGPReg(outany, out64, 32), srcAddr);
+ }
+ break;
+ case Scalar::Uint32:
+ Ldr(SelectGPReg(outany, out64, 32), srcAddr);
+ break;
+ case Scalar::Int64:
+ Ldr(SelectGPReg(outany, out64), srcAddr);
+ break;
+ case Scalar::Float32:
+        // LDR also does the right thing for access.isZeroExtendSimd128Load().
+ Ldr(SelectFPReg(outany, out64, 32), srcAddr);
+ break;
+ case Scalar::Float64:
+ if (access.isSplatSimd128Load() || access.isWidenSimd128Load()) {
+ ScratchSimd128Scope scratch_(asMasm());
+ ARMFPRegister scratch = Simd1D(scratch_);
+ Ldr(scratch, srcAddr);
+ if (access.isSplatSimd128Load()) {
+ Dup(SelectFPReg(outany, out64, 128).V2D(), scratch, 0);
+ } else {
+ MOZ_ASSERT(access.isWidenSimd128Load());
+ switch (access.widenSimdOp()) {
+ case wasm::SimdOp::V128Load8x8S:
+ Sshll(SelectFPReg(outany, out64, 128).V8H(), scratch.V8B(), 0);
+ break;
+ case wasm::SimdOp::V128Load8x8U:
+ Ushll(SelectFPReg(outany, out64, 128).V8H(), scratch.V8B(), 0);
+ break;
+ case wasm::SimdOp::V128Load16x4S:
+ Sshll(SelectFPReg(outany, out64, 128).V4S(), scratch.V4H(), 0);
+ break;
+ case wasm::SimdOp::V128Load16x4U:
+ Ushll(SelectFPReg(outany, out64, 128).V4S(), scratch.V4H(), 0);
+ break;
+ case wasm::SimdOp::V128Load32x2S:
+ Sshll(SelectFPReg(outany, out64, 128).V2D(), scratch.V2S(), 0);
+ break;
+ case wasm::SimdOp::V128Load32x2U:
+ Ushll(SelectFPReg(outany, out64, 128).V2D(), scratch.V2S(), 0);
+ break;
+ default:
+ MOZ_CRASH("Unexpected widening op for wasmLoad");
+ }
+ }
+ } else {
+          // LDR also does the right thing for access.isZeroExtendSimd128Load().
+ Ldr(SelectFPReg(outany, out64, 64), srcAddr);
+ }
+ break;
+ case Scalar::Simd128:
+ Ldr(SelectFPReg(outany, out64, 128), srcAddr);
+ break;
+ case Scalar::Uint8Clamped:
+ case Scalar::BigInt64:
+ case Scalar::BigUint64:
+ case Scalar::MaxTypedArrayViewType:
+ MOZ_CRASH("unexpected array type");
+ }
+ }
+
+ asMasm().memoryBarrierAfter(access.sync());
+}
+
+// Return true if `address` can be represented as an immediate (possibly scaled
+// by the access size) in an LDR/STR type instruction.
+//
+// For more about the logic here, see vixl::MacroAssembler::LoadStoreMacro().
+static bool IsLSImmediateOffset(uint64_t address, size_t accessByteSize) {
+ // The predicates below operate on signed values only.
+ if (address > INT64_MAX) {
+ return false;
+ }
+
+ // The access size is always a power of 2, so computing the log amounts to
+ // counting trailing zeroes.
+ unsigned logAccessSize = mozilla::CountTrailingZeroes32(accessByteSize);
+ return (MacroAssemblerCompat::IsImmLSUnscaled(int64_t(address)) ||
+ MacroAssemblerCompat::IsImmLSScaled(int64_t(address), logAccessSize));
+}
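+
+// For example, assuming an 8-byte access (logAccessSize == 3): any multiple
+// of 8 in [0, 32760] encodes as a scaled unsigned imm12, and any offset in
+// [-256, 255] encodes unscaled, so 32760 is representable while 32768 is
+// not.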
+
+void MacroAssemblerCompat::wasmLoadAbsolute(
+ const wasm::MemoryAccessDesc& access, Register memoryBase, uint64_t address,
+ AnyRegister output, Register64 out64) {
+ if (!IsLSImmediateOffset(address, access.byteSize())) {
+ // The access will require the constant to be loaded into a temp register.
+ // Do so here, to keep the logic in wasmLoadImpl() tractable wrt emitting
+ // trap information.
+ //
+ // Almost all constant addresses will in practice be handled by a single MOV
+ // so do not worry about additional optimizations here.
+ vixl::UseScratchRegisterScope temps(this);
+ ARMRegister scratch = temps.AcquireX();
+ Mov(scratch, address);
+ MemOperand srcAddr(X(memoryBase), scratch);
+ wasmLoadImpl(access, srcAddr, output, out64);
+ } else {
+ MemOperand srcAddr(X(memoryBase), address);
+ wasmLoadImpl(access, srcAddr, output, out64);
+ }
+}
+
+void MacroAssemblerCompat::wasmStoreImpl(const wasm::MemoryAccessDesc& access,
+ AnyRegister valany, Register64 val64,
+ Register memoryBase_, Register ptr_) {
+ uint32_t offset = access.offset();
+ MOZ_ASSERT(offset < asMasm().wasmMaxOffsetGuardLimit());
+
+ ARMRegister memoryBase(memoryBase_, 64);
+ ARMRegister ptr(ptr_, 64);
+ if (offset) {
+ vixl::UseScratchRegisterScope temps(this);
+ ARMRegister scratch = temps.AcquireX();
+ Add(scratch, ptr, Operand(offset));
+ MemOperand destAddr(memoryBase, scratch);
+ wasmStoreImpl(access, destAddr, valany, val64);
+ } else {
+ MemOperand destAddr(memoryBase, ptr);
+ wasmStoreImpl(access, destAddr, valany, val64);
+ }
+}
+
+void MacroAssemblerCompat::wasmStoreImpl(const wasm::MemoryAccessDesc& access,
+ MemOperand dstAddr, AnyRegister valany,
+ Register64 val64) {
+ // NOTE: the generated code must match the assembly code in gen_store in
+ // GenerateAtomicOperations.py
+ asMasm().memoryBarrierBefore(access.sync());
+
+ {
+ // Reg+Reg addressing is directly encodable in one Store instruction, hence
+ // the AutoForbidPoolsAndNops will ensure that the access metadata is
+ // emitted at the address of the Store. The AutoForbidPoolsAndNops will
+ // assert if we emit more than one instruction.
+
+ AutoForbidPoolsAndNops afp(this,
+ /* max number of instructions in scope = */ 1);
+
+ append(access, asMasm().currentOffset());
+ switch (access.type()) {
+ case Scalar::Int8:
+ case Scalar::Uint8:
+ Strb(SelectGPReg(valany, val64), dstAddr);
+ break;
+ case Scalar::Int16:
+ case Scalar::Uint16:
+ Strh(SelectGPReg(valany, val64), dstAddr);
+ break;
+ case Scalar::Int32:
+ case Scalar::Uint32:
+ Str(SelectGPReg(valany, val64), dstAddr);
+ break;
+ case Scalar::Int64:
+ Str(SelectGPReg(valany, val64), dstAddr);
+ break;
+ case Scalar::Float32:
+ Str(SelectFPReg(valany, val64, 32), dstAddr);
+ break;
+ case Scalar::Float64:
+ Str(SelectFPReg(valany, val64, 64), dstAddr);
+ break;
+ case Scalar::Simd128:
+ Str(SelectFPReg(valany, val64, 128), dstAddr);
+ break;
+ case Scalar::Uint8Clamped:
+ case Scalar::BigInt64:
+ case Scalar::BigUint64:
+ case Scalar::MaxTypedArrayViewType:
+ MOZ_CRASH("unexpected array type");
+ }
+ }
+
+ asMasm().memoryBarrierAfter(access.sync());
+}
+
+void MacroAssemblerCompat::wasmStoreAbsolute(
+ const wasm::MemoryAccessDesc& access, AnyRegister value, Register64 value64,
+ Register memoryBase, uint64_t address) {
+ // See comments in wasmLoadAbsolute.
+ unsigned logAccessSize = mozilla::CountTrailingZeroes32(access.byteSize());
+ if (address > INT64_MAX || !(IsImmLSScaled(int64_t(address), logAccessSize) ||
+ IsImmLSUnscaled(int64_t(address)))) {
+ vixl::UseScratchRegisterScope temps(this);
+ ARMRegister scratch = temps.AcquireX();
+ Mov(scratch, address);
+ MemOperand destAddr(X(memoryBase), scratch);
+ wasmStoreImpl(access, destAddr, value, value64);
+ } else {
+ MemOperand destAddr(X(memoryBase), address);
+ wasmStoreImpl(access, destAddr, value, value64);
+ }
+}
+
+void MacroAssemblerCompat::compareSimd128Int(Assembler::Condition cond,
+ ARMFPRegister dest,
+ ARMFPRegister lhs,
+ ARMFPRegister rhs) {
+ switch (cond) {
+ case Assembler::Equal:
+ Cmeq(dest, lhs, rhs);
+ break;
+ case Assembler::NotEqual:
+ Cmeq(dest, lhs, rhs);
+ Mvn(dest, dest);
+ break;
+ case Assembler::GreaterThan:
+ Cmgt(dest, lhs, rhs);
+ break;
+ case Assembler::GreaterThanOrEqual:
+ Cmge(dest, lhs, rhs);
+ break;
+ case Assembler::LessThan:
+ Cmgt(dest, rhs, lhs);
+ break;
+ case Assembler::LessThanOrEqual:
+ Cmge(dest, rhs, lhs);
+ break;
+ case Assembler::Above:
+ Cmhi(dest, lhs, rhs);
+ break;
+ case Assembler::AboveOrEqual:
+ Cmhs(dest, lhs, rhs);
+ break;
+ case Assembler::Below:
+ Cmhi(dest, rhs, lhs);
+ break;
+ case Assembler::BelowOrEqual:
+ Cmhs(dest, rhs, lhs);
+ break;
+ default:
+ MOZ_CRASH("Unexpected SIMD integer condition");
+ }
+}
+
+void MacroAssemblerCompat::compareSimd128Float(Assembler::Condition cond,
+ ARMFPRegister dest,
+ ARMFPRegister lhs,
+ ARMFPRegister rhs) {
+ switch (cond) {
+ case Assembler::Equal:
+ Fcmeq(dest, lhs, rhs);
+ break;
+ case Assembler::NotEqual:
+ Fcmeq(dest, lhs, rhs);
+ Mvn(dest, dest);
+ break;
+ case Assembler::GreaterThan:
+ Fcmgt(dest, lhs, rhs);
+ break;
+ case Assembler::GreaterThanOrEqual:
+ Fcmge(dest, lhs, rhs);
+ break;
+ case Assembler::LessThan:
+ Fcmgt(dest, rhs, lhs);
+ break;
+ case Assembler::LessThanOrEqual:
+ Fcmge(dest, rhs, lhs);
+ break;
+ default:
+ MOZ_CRASH("Unexpected SIMD integer condition");
+ }
+}
+
+void MacroAssemblerCompat::rightShiftInt8x16(FloatRegister lhs, Register rhs,
+ FloatRegister dest,
+ bool isUnsigned) {
+ ScratchSimd128Scope scratch_(asMasm());
+ ARMFPRegister shift = Simd16B(scratch_);
+
+ Dup(shift, ARMRegister(rhs, 32));
+ Neg(shift, shift);
+
+ if (isUnsigned) {
+ Ushl(Simd16B(dest), Simd16B(lhs), shift);
+ } else {
+ Sshl(Simd16B(dest), Simd16B(lhs), shift);
+ }
+}
+
+void MacroAssemblerCompat::rightShiftInt16x8(FloatRegister lhs, Register rhs,
+ FloatRegister dest,
+ bool isUnsigned) {
+ ScratchSimd128Scope scratch_(asMasm());
+ ARMFPRegister shift = Simd8H(scratch_);
+
+ Dup(shift, ARMRegister(rhs, 32));
+ Neg(shift, shift);
+
+ if (isUnsigned) {
+ Ushl(Simd8H(dest), Simd8H(lhs), shift);
+ } else {
+ Sshl(Simd8H(dest), Simd8H(lhs), shift);
+ }
+}
+
+void MacroAssemblerCompat::rightShiftInt32x4(FloatRegister lhs, Register rhs,
+ FloatRegister dest,
+ bool isUnsigned) {
+ ScratchSimd128Scope scratch_(asMasm());
+ ARMFPRegister shift = Simd4S(scratch_);
+
+ Dup(shift, ARMRegister(rhs, 32));
+ Neg(shift, shift);
+
+ if (isUnsigned) {
+ Ushl(Simd4S(dest), Simd4S(lhs), shift);
+ } else {
+ Sshl(Simd4S(dest), Simd4S(lhs), shift);
+ }
+}
+
+void MacroAssemblerCompat::rightShiftInt64x2(FloatRegister lhs, Register rhs,
+ FloatRegister dest,
+ bool isUnsigned) {
+ ScratchSimd128Scope scratch_(asMasm());
+ ARMFPRegister shift = Simd2D(scratch_);
+
+ Dup(shift, ARMRegister(rhs, 64));
+ Neg(shift, shift);
+
+ if (isUnsigned) {
+ Ushl(Simd2D(dest), Simd2D(lhs), shift);
+ } else {
+ Sshl(Simd2D(dest), Simd2D(lhs), shift);
+ }
+}
+
+void MacroAssembler::reserveStack(uint32_t amount) {
+ // TODO: This bumps |sp| every time we reserve using a second register.
+ // It would save some instructions if we had a fixed frame size.
+ vixl::MacroAssembler::Claim(Operand(amount));
+ adjustFrame(amount);
+}
+
+void MacroAssembler::Push(RegisterOrSP reg) {
+ if (IsHiddenSP(reg)) {
+ push(sp);
+ } else {
+ push(AsRegister(reg));
+ }
+ adjustFrame(sizeof(intptr_t));
+}
+
+//{{{ check_macroassembler_style
+// ===============================================================
+// MacroAssembler high-level usage.
+
+void MacroAssembler::flush() { Assembler::flush(); }
+
+// ===============================================================
+// Stack manipulation functions.
+
+// Routines for saving/restoring registers on the stack. The format is:
+//
+// (highest address)
+//
+// integer (X) regs in any order size: 8 * # int regs
+//
+// if # int regs is odd,
+// then an 8 byte alignment hole size: 0 or 8
+//
+// double (D) regs in any order size: 8 * # double regs
+//
+// if # double regs is odd,
+// then an 8 byte alignment hole size: 0 or 8
+//
+// vector (Q) regs in any order size: 16 * # vector regs
+//
+// (lowest address)
+//
+// Hence the size of the save area is 0 mod 16. And, provided that the base
+// (highest) address is 16-aligned, then the vector reg save/restore accesses
+// will also be 16-aligned, as will pairwise operations for the double regs.
+//
+// Implied by this is that the format of the double and vector dump area
+// corresponds with what FloatRegister::GetPushSizeInBytes computes.
+// See block comment in MacroAssembler.h for more details.
+
+size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
+ size_t numIntRegs = set.gprs().size();
+ return ((numIntRegs + 1) & ~1) * sizeof(intptr_t) +
+ FloatRegister::GetPushSizeInBytes(set.fpus());
+}
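+
+// A worked example (register choice illustrative): for a set holding three
+// GPRs, one double and one V128 register, the integer area is
+// ((3 + 1) & ~1) * 8 == 32 bytes (one 8-byte alignment hole), and
+// GetPushSizeInBytes yields 16 + 16 == 32 bytes (the odd double is padded
+// to 16), for a 16-aligned total of 64 bytes.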
+
+// Generate code to dump the values in `set`, either on the stack if `dest` is
+// `Nothing` or working backwards from the address denoted by `dest` if it is
+// `Some`. These two cases are combined so as to minimise the chance of
+// mistakenly generating different formats for the same `set`, given that the
+// `Some` `dest` case is used extremely rarely.
+static void PushOrStoreRegsInMask(MacroAssembler* masm, LiveRegisterSet set,
+ mozilla::Maybe<Address> dest) {
+ static_assert(sizeof(FloatRegisters::RegisterContent) == 16);
+
+ // If we're saving to arbitrary memory, check the destination is big enough.
+ if (dest) {
+ mozilla::DebugOnly<size_t> bytesRequired =
+ masm->PushRegsInMaskSizeInBytes(set);
+ MOZ_ASSERT(dest->offset >= 0);
+ MOZ_ASSERT(((size_t)dest->offset) >= bytesRequired);
+ }
+
+ // Note the high limit point; we'll check it again later.
+ mozilla::DebugOnly<size_t> maxExtentInitial =
+ dest ? dest->offset : masm->framePushed();
+
+ // Gather up the integer registers in groups of four, and either push each
+ // group as a single transfer so as to minimise the number of stack pointer
+ // changes, or write them individually to memory. Take care to ensure the
+ // space used remains 16-aligned.
+ for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more();) {
+ vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg, vixl::NoCPUReg,
+ vixl::NoCPUReg};
+ size_t i;
+ for (i = 0; i < 4 && iter.more(); i++) {
+ src[i] = ARMRegister(*iter, 64);
+ ++iter;
+ }
+ MOZ_ASSERT(i > 0);
+
+ if (i == 1 || i == 3) {
+ // Ensure the stack remains 16-aligned
+ MOZ_ASSERT(!iter.more());
+ src[i] = vixl::xzr;
+ i++;
+ }
+ MOZ_ASSERT(i == 2 || i == 4);
+
+ if (dest) {
+ for (size_t j = 0; j < i; j++) {
+ Register ireg = Register::FromCode(src[j].IsZero() ? Registers::xzr
+ : src[j].code());
+ dest->offset -= sizeof(intptr_t);
+ masm->storePtr(ireg, *dest);
+ }
+ } else {
+ masm->adjustFrame(i * 8);
+ masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
+ }
+ }
+
+ // Now the same for the FP double registers. Note that because of how
+ // ReduceSetForPush works, an underlying AArch64 SIMD/FP register can either
+ // be present as a double register, or as a V128 register, but not both.
+ // Firstly, round up the registers to be pushed.
+
+ FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
+ vixl::CPURegister allSrcs[FloatRegisters::TotalPhys];
+ size_t numAllSrcs = 0;
+
+ for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
+ FloatRegister reg = *iter;
+ if (reg.isDouble()) {
+ MOZ_RELEASE_ASSERT(numAllSrcs < FloatRegisters::TotalPhys);
+ allSrcs[numAllSrcs] = ARMFPRegister(reg, 64);
+ numAllSrcs++;
+ } else {
+ MOZ_ASSERT(reg.isSimd128());
+ }
+ }
+ MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
+
+ if ((numAllSrcs & 1) == 1) {
+ // We've got an odd number of doubles. In order to maintain 16-alignment,
+ // push the last register twice. We'll skip over the duplicate in
+ // PopRegsInMaskIgnore.
+ allSrcs[numAllSrcs] = allSrcs[numAllSrcs - 1];
+ numAllSrcs++;
+ }
+ MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
+ MOZ_RELEASE_ASSERT((numAllSrcs & 1) == 0);
+
+ // And now generate the transfers.
+ size_t i;
+ if (dest) {
+ for (i = 0; i < numAllSrcs; i++) {
+ FloatRegister freg =
+ FloatRegister(FloatRegisters::FPRegisterID(allSrcs[i].code()),
+ FloatRegisters::Kind::Double);
+ dest->offset -= sizeof(double);
+ masm->storeDouble(freg, *dest);
+ }
+ } else {
+ i = 0;
+ while (i < numAllSrcs) {
+ vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg,
+ vixl::NoCPUReg, vixl::NoCPUReg};
+ size_t j;
+ for (j = 0; j < 4 && j + i < numAllSrcs; j++) {
+ src[j] = allSrcs[j + i];
+ }
+ masm->adjustFrame(8 * j);
+ masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
+ i += j;
+ }
+ }
+ MOZ_ASSERT(i == numAllSrcs);
+
+ // Finally, deal with the SIMD (V128) registers. This is a bit simpler
+ // as there's no need for special-casing to maintain 16-alignment.
+
+ numAllSrcs = 0;
+ for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
+ FloatRegister reg = *iter;
+ if (reg.isSimd128()) {
+ MOZ_RELEASE_ASSERT(numAllSrcs < FloatRegisters::TotalPhys);
+ allSrcs[numAllSrcs] = ARMFPRegister(reg, 128);
+ numAllSrcs++;
+ }
+ }
+ MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
+
+ // Generate the transfers.
+ if (dest) {
+ for (i = 0; i < numAllSrcs; i++) {
+ FloatRegister freg =
+ FloatRegister(FloatRegisters::FPRegisterID(allSrcs[i].code()),
+ FloatRegisters::Kind::Simd128);
+ dest->offset -= FloatRegister::SizeOfSimd128;
+ masm->storeUnalignedSimd128(freg, *dest);
+ }
+ } else {
+ i = 0;
+ while (i < numAllSrcs) {
+ vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg,
+ vixl::NoCPUReg, vixl::NoCPUReg};
+ size_t j;
+ for (j = 0; j < 4 && j + i < numAllSrcs; j++) {
+ src[j] = allSrcs[j + i];
+ }
+ masm->adjustFrame(16 * j);
+ masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
+ i += j;
+ }
+ }
+ MOZ_ASSERT(i == numAllSrcs);
+
+ // Final overrun check.
+ if (dest) {
+ MOZ_ASSERT(maxExtentInitial - dest->offset ==
+ masm->PushRegsInMaskSizeInBytes(set));
+ } else {
+ MOZ_ASSERT(masm->framePushed() - maxExtentInitial ==
+ masm->PushRegsInMaskSizeInBytes(set));
+ }
+}
+
+void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
+ PushOrStoreRegsInMask(this, set, mozilla::Nothing());
+}
+
+void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
+ Register scratch) {
+ PushOrStoreRegsInMask(this, set, mozilla::Some(dest));
+}
+
+// This is a helper function for PopRegsInMaskIgnore below. It emits the
+// loads described by dests[0] and [1] and offsets[0] and [1], generating a
+// load-pair if it can.
+static void GeneratePendingLoadsThenFlush(MacroAssembler* masm,
+ vixl::CPURegister* dests,
+ uint32_t* offsets,
+ uint32_t transactionSize) {
+ // Generate the loads ..
+ if (!dests[0].IsNone()) {
+ if (!dests[1].IsNone()) {
+ // [0] and [1] both present.
+ if (offsets[0] + transactionSize == offsets[1]) {
+ masm->Ldp(dests[0], dests[1],
+ MemOperand(masm->GetStackPointer64(), offsets[0]));
+ } else {
+ // Theoretically we could check for a load-pair with the destinations
+ // switched, but our callers will never generate that. Hence there's
+ // no loss in giving up at this point and generating two loads.
+ masm->Ldr(dests[0], MemOperand(masm->GetStackPointer64(), offsets[0]));
+ masm->Ldr(dests[1], MemOperand(masm->GetStackPointer64(), offsets[1]));
+ }
+ } else {
+ // [0] only.
+ masm->Ldr(dests[0], MemOperand(masm->GetStackPointer64(), offsets[0]));
+ }
+ } else {
+ if (!dests[1].IsNone()) {
+ // [1] only. Can't happen because callers always fill [0] before [1].
+ MOZ_CRASH("GenerateLoadsThenFlush");
+ } else {
+ // Neither entry valid. This can happen.
+ }
+ }
+
+ // .. and flush.
+ dests[0] = dests[1] = vixl::NoCPUReg;
+ offsets[0] = offsets[1] = 0;
+}
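+
+// For instance (registers hypothetical), with transactionSize == 8 and
+// pending offsets {0, 8} the helper above emits a single
+// "Ldp d0, d1, [psp, #0]"-style transfer, whereas offsets {0, 16} fall back
+// to two separate Ldr instructions.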
+
+void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
+ LiveRegisterSet ignore) {
+ mozilla::DebugOnly<size_t> framePushedInitial = framePushed();
+
+ // The offset of the data from the stack pointer.
+ uint32_t offset = 0;
+
+ // The set of FP/SIMD registers we need to restore.
+ FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
+
+ // The set of registers to ignore. BroadcastToAllSizes() is used to avoid
+ // any ambiguities arising from (eg) `fpuSet` containing q17 but `ignore`
+ // containing d17.
+ FloatRegisterSet ignoreFpusBroadcasted(
+ FloatRegister::BroadcastToAllSizes(ignore.fpus()));
+
+ // First recover the SIMD (V128) registers. This is straightforward in that
+ // we don't need to think about alignment holes.
+
+ // These three form a two-entry queue that holds loads that we know we
+ // need, but which we haven't yet emitted.
+ vixl::CPURegister pendingDests[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
+ uint32_t pendingOffsets[2] = {0, 0};
+ size_t nPending = 0;
+
+ for (FloatRegisterIterator iter(fpuSet); iter.more(); ++iter) {
+ FloatRegister reg = *iter;
+ if (reg.isDouble()) {
+ continue;
+ }
+ MOZ_RELEASE_ASSERT(reg.isSimd128());
+
+ uint32_t offsetForReg = offset;
+ offset += FloatRegister::SizeOfSimd128;
+
+ if (ignoreFpusBroadcasted.hasRegisterIndex(reg)) {
+ continue;
+ }
+
+ MOZ_ASSERT(nPending <= 2);
+ if (nPending == 2) {
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 16);
+ nPending = 0;
+ }
+ pendingDests[nPending] = ARMFPRegister(reg, 128);
+ pendingOffsets[nPending] = offsetForReg;
+ nPending++;
+ }
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 16);
+ nPending = 0;
+
+ MOZ_ASSERT((offset % 16) == 0);
+
+ // Now recover the FP double registers. This is more tricky in that we need
+ // to skip over the lowest-addressed of them if the number of them was odd.
+
+ if ((((fpuSet.bits() & FloatRegisters::AllDoubleMask).size()) & 1) == 1) {
+ offset += sizeof(double);
+ }
+
+ for (FloatRegisterIterator iter(fpuSet); iter.more(); ++iter) {
+ FloatRegister reg = *iter;
+ if (reg.isSimd128()) {
+ continue;
+ }
+ /* true but redundant, per loop above: MOZ_RELEASE_ASSERT(reg.isDouble()) */
+
+ uint32_t offsetForReg = offset;
+ offset += sizeof(double);
+
+ if (ignoreFpusBroadcasted.hasRegisterIndex(reg)) {
+ continue;
+ }
+
+ MOZ_ASSERT(nPending <= 2);
+ if (nPending == 2) {
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
+ nPending = 0;
+ }
+ pendingDests[nPending] = ARMFPRegister(reg, 64);
+ pendingOffsets[nPending] = offsetForReg;
+ nPending++;
+ }
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
+ nPending = 0;
+
+ MOZ_ASSERT((offset % 16) == 0);
+ MOZ_ASSERT(offset == set.fpus().getPushSizeInBytes());
+
+ // And finally recover the integer registers, again skipping an alignment
+ // hole if it exists.
+
+ if ((set.gprs().size() & 1) == 1) {
+ offset += sizeof(uint64_t);
+ }
+
+ for (GeneralRegisterIterator iter(set.gprs()); iter.more(); ++iter) {
+ Register reg = *iter;
+
+ uint32_t offsetForReg = offset;
+ offset += sizeof(uint64_t);
+
+ if (ignore.has(reg)) {
+ continue;
+ }
+
+ MOZ_ASSERT(nPending <= 2);
+ if (nPending == 2) {
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
+ nPending = 0;
+ }
+ pendingDests[nPending] = ARMRegister(reg, 64);
+ pendingOffsets[nPending] = offsetForReg;
+ nPending++;
+ }
+ GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
+
+ MOZ_ASSERT((offset % 16) == 0);
+
+ size_t bytesPushed = PushRegsInMaskSizeInBytes(set);
+ MOZ_ASSERT(offset == bytesPushed);
+ freeStack(bytesPushed);
+}
+
+void MacroAssembler::Push(Register reg) {
+ push(reg);
+ adjustFrame(sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(Register reg1, Register reg2, Register reg3,
+ Register reg4) {
+ push(reg1, reg2, reg3, reg4);
+ adjustFrame(4 * sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(const Imm32 imm) {
+ push(imm);
+ adjustFrame(sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(const ImmWord imm) {
+ push(imm);
+ adjustFrame(sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(const ImmPtr imm) {
+ push(imm);
+ adjustFrame(sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(const ImmGCPtr ptr) {
+ push(ptr);
+ adjustFrame(sizeof(intptr_t));
+}
+
+void MacroAssembler::Push(FloatRegister f) {
+ push(f);
+ adjustFrame(sizeof(double));
+}
+
+void MacroAssembler::PushBoxed(FloatRegister reg) {
+ subFromStackPtr(Imm32(sizeof(double)));
+ boxDouble(reg, Address(getStackPointer(), 0));
+ adjustFrame(sizeof(double));
+}
+
+void MacroAssembler::Pop(Register reg) {
+ pop(reg);
+ adjustFrame(-1 * int64_t(sizeof(int64_t)));
+}
+
+void MacroAssembler::Pop(FloatRegister f) {
+ loadDouble(Address(getStackPointer(), 0), f);
+ freeStack(sizeof(double));
+}
+
+void MacroAssembler::Pop(const ValueOperand& val) {
+ pop(val);
+ adjustFrame(-1 * int64_t(sizeof(int64_t)));
+}
+
+// ===============================================================
+// Simple call functions.
+
+CodeOffset MacroAssembler::call(Register reg) {
+ // This sync has been observed (and is expected) to be necessary.
+ // eg testcase: tests/debug/bug1107525.js
+ syncStackPtr();
+ Blr(ARMRegister(reg, 64));
+ return CodeOffset(currentOffset());
+}
+
+CodeOffset MacroAssembler::call(Label* label) {
+ // This sync has been observed (and is expected) to be necessary.
+ // eg testcase: tests/basic/testBug504520Harder.js
+ syncStackPtr();
+ Bl(label);
+ return CodeOffset(currentOffset());
+}
+
+void MacroAssembler::call(ImmPtr imm) {
+ // This sync has been observed (and is expected) to be necessary.
+ // eg testcase: asm.js/testTimeout5.js
+ syncStackPtr();
+ vixl::UseScratchRegisterScope temps(this);
+ MOZ_ASSERT(temps.IsAvailable(ScratchReg64)); // ip0
+ temps.Exclude(ScratchReg64);
+ movePtr(imm, ScratchReg64.asUnsized());
+ Blr(ScratchReg64);
+}
+
+void MacroAssembler::call(ImmWord imm) { call(ImmPtr((void*)imm.value)); }
+
+CodeOffset MacroAssembler::call(wasm::SymbolicAddress imm) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+ // This sync is believed to be necessary, although no case in jit-test/tests
+ // has been observed to cause SP != PSP here.
+ syncStackPtr();
+ movePtr(imm, scratch);
+ Blr(ARMRegister(scratch, 64));
+ return CodeOffset(currentOffset());
+}
+
+void MacroAssembler::call(const Address& addr) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+ // This sync has been observed (and is expected) to be necessary.
+ // eg testcase: tests/backup-point-bug1315634.js
+ syncStackPtr();
+ loadPtr(addr, scratch);
+ Blr(ARMRegister(scratch, 64));
+}
+
+void MacroAssembler::call(JitCode* c) {
+ vixl::UseScratchRegisterScope temps(this);
+ const ARMRegister scratch64 = temps.AcquireX();
+ // This sync has been observed (and is expected) to be necessary.
+ // eg testcase: arrays/new-array-undefined-undefined-more-args-2.js
+ syncStackPtr();
+ BufferOffset off = immPool64(scratch64, uint64_t(c->raw()));
+ addPendingJump(off, ImmPtr(c->raw()), RelocationKind::JITCODE);
+ blr(scratch64);
+}
+
+CodeOffset MacroAssembler::callWithPatch() {
+ // This needs to sync. Wasm goes through this one for intramodule calls.
+ //
+ // In other cases, wasm goes through masm.wasmCallImport(),
+ // masm.wasmCallBuiltinInstanceMethod, masm.wasmCallIndirect, all of which
+ // sync.
+ //
+ // This sync is believed to be necessary, although no case in jit-test/tests
+ // has been observed to cause SP != PSP here.
+ syncStackPtr();
+ bl(0, LabelDoc());
+ return CodeOffset(currentOffset());
+}
+void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
+ Instruction* inst = getInstructionAt(BufferOffset(callerOffset - 4));
+ MOZ_ASSERT(inst->IsBL());
+ ptrdiff_t relTarget = (int)calleeOffset - ((int)callerOffset - 4);
+ ptrdiff_t relTarget00 = relTarget >> 2;
+ MOZ_RELEASE_ASSERT((relTarget & 0x3) == 0);
+ MOZ_RELEASE_ASSERT(vixl::IsInt26(relTarget00));
+ bl(inst, relTarget00);
+}
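+
+// Example of the arithmetic above (offsets hypothetical): with
+// callerOffset == 0x1040 and calleeOffset == 0x1000, the BL sits at 0x103c,
+// relTarget == -0x3c bytes and relTarget00 == -0xf instruction slots, well
+// inside BL's signed 26-bit range.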
+
+CodeOffset MacroAssembler::farJumpWithPatch() {
+ vixl::UseScratchRegisterScope temps(this);
+ const ARMRegister scratch = temps.AcquireX();
+ const ARMRegister scratch2 = temps.AcquireX();
+
+ AutoForbidPoolsAndNops afp(this,
+ /* max number of instructions in scope = */ 7);
+
+ mozilla::DebugOnly<uint32_t> before = currentOffset();
+
+ align(8); // At most one nop
+
+ Label branch;
+ adr(scratch2, &branch);
+ ldr(scratch, vixl::MemOperand(scratch2, 4));
+ add(scratch2, scratch2, scratch);
+ CodeOffset offs(currentOffset());
+ bind(&branch);
+ br(scratch2);
+ Emit(UINT32_MAX);
+ Emit(UINT32_MAX);
+
+ mozilla::DebugOnly<uint32_t> after = currentOffset();
+
+ MOZ_ASSERT(after - before == 24 || after - before == 28);
+
+ return offs;
+}
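+
+// The fixed pattern laid down above is, roughly:
+//
+//   [nop]                        ; only if needed for 8-alignment
+//   adr  scratch2, branch
+//   ldr  scratch, [scratch2, #4] ; the 64-bit displacement after the br
+//   add  scratch2, scratch2, scratch
+//   branch:
+//   br   scratch2
+//   .word 0xffffffff             ; low half, patched by patchFarJump
+//   .word 0xffffffff             ; high half, patched by patchFarJump
+//
+// hence the 24-or-28-byte assertion above.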
+
+void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
+ Instruction* inst1 = getInstructionAt(BufferOffset(farJump.offset() + 4));
+ Instruction* inst2 = getInstructionAt(BufferOffset(farJump.offset() + 8));
+
+ int64_t distance = (int64_t)targetOffset - (int64_t)farJump.offset();
+
+ MOZ_ASSERT(inst1->InstructionBits() == UINT32_MAX);
+ MOZ_ASSERT(inst2->InstructionBits() == UINT32_MAX);
+
+ inst1->SetInstructionBits((uint32_t)distance);
+ inst2->SetInstructionBits((uint32_t)(distance >> 32));
+}
+
+CodeOffset MacroAssembler::nopPatchableToCall() {
+ AutoForbidPoolsAndNops afp(this,
+ /* max number of instructions in scope = */ 1);
+ Nop();
+ return CodeOffset(currentOffset());
+}
+
+void MacroAssembler::patchNopToCall(uint8_t* call, uint8_t* target) {
+ uint8_t* inst = call - 4;
+ Instruction* instr = reinterpret_cast<Instruction*>(inst);
+ MOZ_ASSERT(instr->IsBL() || instr->IsNOP());
+ bl(instr, (target - inst) >> 2);
+}
+
+void MacroAssembler::patchCallToNop(uint8_t* call) {
+ uint8_t* inst = call - 4;
+ Instruction* instr = reinterpret_cast<Instruction*>(inst);
+ MOZ_ASSERT(instr->IsBL() || instr->IsNOP());
+ nop(instr);
+}
+
+void MacroAssembler::pushReturnAddress() {
+ MOZ_RELEASE_ASSERT(!sp.Is(GetStackPointer64()), "Not valid");
+ push(lr);
+}
+
+void MacroAssembler::popReturnAddress() {
+ MOZ_RELEASE_ASSERT(!sp.Is(GetStackPointer64()), "Not valid");
+ pop(lr);
+}
+
+// ===============================================================
+// ABI function calls.
+
+void MacroAssembler::setupUnalignedABICall(Register scratch) {
+ // Because wasm operates without the need for dynamic alignment of SP, it is
+ // implied that this routine should never be called when generating wasm.
+ MOZ_ASSERT(!IsCompilingWasm());
+
+ // The following won't work for SP -- needs slightly different logic.
+ MOZ_RELEASE_ASSERT(GetStackPointer64().Is(PseudoStackPointer64));
+
+ setupNativeABICall();
+ dynamicAlignment_ = true;
+
+ int64_t alignment = ~(int64_t(ABIStackAlignment) - 1);
+ ARMRegister scratch64(scratch, 64);
+ MOZ_ASSERT(!scratch64.Is(PseudoStackPointer64));
+
+ // Always save LR -- Baseline ICs assume that LR isn't modified.
+ push(lr);
+
+ // Remember the stack address on entry. This is reloaded in callWithABIPost
+ // below.
+ Mov(scratch64, PseudoStackPointer64);
+
+ // Make alignment, including the effective push of the previous sp.
+ Sub(PseudoStackPointer64, PseudoStackPointer64, Operand(8));
+ And(PseudoStackPointer64, PseudoStackPointer64, Operand(alignment));
+ syncStackPtr();
+
+ // Store previous sp to the top of the stack, aligned. This is also
+ // reloaded in callWithABIPost.
+ Str(scratch64, MemOperand(PseudoStackPointer64, 0));
+}
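+
+// Concretely (address illustrative): with ABIStackAlignment == 16 the mask
+// is ~15, so a PSP of 0x7fff0038 becomes (0x7fff0038 - 8) & ~15 ==
+// 0x7fff0030, and the entry PSP saved in scratch64 is stored at that
+// 16-aligned address.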
+
+void MacroAssembler::callWithABIPre(uint32_t* stackAdjust, bool callFromWasm) {
+ // wasm operates without the need for dynamic alignment of SP.
+ MOZ_ASSERT(!(dynamicAlignment_ && callFromWasm));
+
+ MOZ_ASSERT(inCall_);
+ uint32_t stackForCall = abiArgs_.stackBytesConsumedSoFar();
+
+ // ARM64 *really* wants SP to always be 16-aligned, so ensure this now.
+ if (dynamicAlignment_) {
+ stackForCall += ComputeByteAlignment(stackForCall, StackAlignment);
+ } else {
+ // This can happen when we attach out-of-line stubs for rare cases. For
+ // example CodeGenerator::visitWasmTruncateToInt32 adds an out-of-line
+ // chunk.
+ uint32_t alignmentAtPrologue = callFromWasm ? sizeof(wasm::Frame) : 0;
+ stackForCall += ComputeByteAlignment(
+ stackForCall + framePushed() + alignmentAtPrologue, ABIStackAlignment);
+ }
+
+ *stackAdjust = stackForCall;
+ reserveStack(*stackAdjust);
+ {
+ enoughMemory_ &= moveResolver_.resolve();
+ if (!enoughMemory_) {
+ return;
+ }
+ MoveEmitter emitter(*this);
+ emitter.emit(moveResolver_);
+ emitter.finish();
+ }
+
+ // Call boundaries communicate stack via SP.
+ // (jseward, 2021Mar03) This sync may well be redundant, given that all of
+ // the MacroAssembler::call methods generate a sync before the call.
+  // Removing it does not cause any failures in any of the jit-tests.
+ syncStackPtr();
+
+ assertStackAlignment(ABIStackAlignment);
+}
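+
+// For instance (numbers illustrative), on the non-dynamicAlignment_ path:
+// with framePushed() == 24, no wasm frame and no stack bytes consumed by
+// arguments, ComputeByteAlignment(24, 16) == 8, so 8 bytes are reserved
+// purely to bring SP back to 16-alignment for the call.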
+
+void MacroAssembler::callWithABIPost(uint32_t stackAdjust, MoveOp::Type result,
+ bool callFromWasm) {
+ // wasm operates without the need for dynamic alignment of SP.
+ MOZ_ASSERT(!(dynamicAlignment_ && callFromWasm));
+
+ // Call boundaries communicate stack via SP, so we must resync PSP now.
+ initPseudoStackPtr();
+
+ freeStack(stackAdjust);
+
+ if (dynamicAlignment_) {
+ // This then-clause makes more sense if you first read
+ // setupUnalignedABICall above.
+ //
+ // Restore the stack pointer from entry. The stack pointer will have been
+ // saved by setupUnalignedABICall. This is fragile in that it assumes
+ // that uses of this routine (callWithABIPost) with `dynamicAlignment_ ==
+ // true` are preceded by matching calls to setupUnalignedABICall. But
+  // there's nothing that enforces that mechanically.  If we really want to
+ // enforce this, we could add a debug-only CallWithABIState enum to the
+ // MacroAssembler and assert that setupUnalignedABICall updates it before
+ // we get here, then reset it to its initial state.
+ Ldr(GetStackPointer64(), MemOperand(GetStackPointer64(), 0));
+ syncStackPtr();
+
+ // Restore LR. This restores LR to the value stored by
+ // setupUnalignedABICall, which should have been called just before
+ // callWithABIPre. This is, per the above comment, also fragile.
+ pop(lr);
+
+ // SP may be < PSP now. That is expected from the behaviour of `pop`. It
+ // is not clear why the following `syncStackPtr` is necessary, but it is:
+ // without it, the following test segfaults:
+ // tests/backup-point-bug1315634.js
+ syncStackPtr();
+ }
+
+ // If the ABI's return regs are where ION is expecting them, then
+ // no other work needs to be done.
+
+#ifdef DEBUG
+ MOZ_ASSERT(inCall_);
+ inCall_ = false;
+#endif
+}
+
+void MacroAssembler::callWithABINoProfiler(Register fun, MoveOp::Type result) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+ movePtr(fun, scratch);
+
+ uint32_t stackAdjust;
+ callWithABIPre(&stackAdjust);
+ call(scratch);
+ callWithABIPost(stackAdjust, result);
+}
+
+void MacroAssembler::callWithABINoProfiler(const Address& fun,
+ MoveOp::Type result) {
+ vixl::UseScratchRegisterScope temps(this);
+ const Register scratch = temps.AcquireX().asUnsized();
+ loadPtr(fun, scratch);
+
+ uint32_t stackAdjust;
+ callWithABIPre(&stackAdjust);
+ call(scratch);
+ callWithABIPost(stackAdjust, result);
+}
+
+// ===============================================================
+// Jit Frames.
+
+uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
+ enterNoPool(3);
+ Label fakeCallsite;
+
+ Adr(ARMRegister(scratch, 64), &fakeCallsite);
+ Push(scratch);
+ bind(&fakeCallsite);
+ uint32_t pseudoReturnOffset = currentOffset();
+
+ leaveNoPool();
+ return pseudoReturnOffset;
+}
+
+bool MacroAssemblerCompat::buildOOLFakeExitFrame(void* fakeReturnAddr) {
+ asMasm().PushFrameDescriptor(FrameType::IonJS);
+ asMasm().Push(ImmPtr(fakeReturnAddr));
+ asMasm().Push(FramePointer);
+ return true;
+}
+
+// ===============================================================
+// Move instructions
+
+void MacroAssembler::moveValue(const TypedOrValueRegister& src,
+ const ValueOperand& dest) {
+ if (src.hasValue()) {
+ moveValue(src.valueReg(), dest);
+ return;
+ }
+
+ MIRType type = src.type();
+ AnyRegister reg = src.typedReg();
+
+ if (!IsFloatingPointType(type)) {
+ boxNonDouble(ValueTypeFromMIRType(type), reg.gpr(), dest);
+ return;
+ }
+
+ ScratchDoubleScope scratch(*this);
+ FloatRegister freg = reg.fpu();
+ if (type == MIRType::Float32) {
+ convertFloat32ToDouble(freg, scratch);
+ freg = scratch;
+ }
+ boxDouble(freg, dest, scratch);
+}
+
+void MacroAssembler::moveValue(const ValueOperand& src,
+ const ValueOperand& dest) {
+ if (src == dest) {
+ return;
+ }
+ movePtr(src.valueReg(), dest.valueReg());
+}
+
+void MacroAssembler::moveValue(const Value& src, const ValueOperand& dest) {
+ if (!src.isGCThing()) {
+ movePtr(ImmWord(src.asRawBits()), dest.valueReg());
+ return;
+ }
+
+ BufferOffset load =
+ movePatchablePtr(ImmPtr(src.bitsAsPunboxPointer()), dest.valueReg());
+ writeDataRelocation(src, load);
+}
+
+// ===============================================================
+// Branch functions
+
+void MacroAssembler::loadStoreBuffer(Register ptr, Register buffer) {
+ And(ARMRegister(buffer, 64), ARMRegister(ptr, 64),
+ Operand(int32_t(~gc::ChunkMask)));
+ loadPtr(Address(buffer, gc::ChunkStoreBufferOffset), buffer);
+}
+
+void MacroAssembler::branchPtrInNurseryChunk(Condition cond, Register ptr,
+ Register temp, Label* label) {
+ MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
+ MOZ_ASSERT(ptr != temp);
+ MOZ_ASSERT(ptr != ScratchReg &&
+ ptr != ScratchReg2); // Both may be used internally.
+ MOZ_ASSERT(temp != ScratchReg && temp != ScratchReg2);
+
+ And(ARMRegister(temp, 64), ARMRegister(ptr, 64),
+ Operand(int32_t(~gc::ChunkMask)));
+ branchPtr(InvertCondition(cond), Address(temp, gc::ChunkStoreBufferOffset),
+ ImmWord(0), label);
+}
+
+void MacroAssembler::branchValueIsNurseryCell(Condition cond,
+ const Address& address,
+ Register temp, Label* label) {
+ branchValueIsNurseryCellImpl(cond, address, temp, label);
+}
+
+void MacroAssembler::branchValueIsNurseryCell(Condition cond,
+ ValueOperand value, Register temp,
+ Label* label) {
+ branchValueIsNurseryCellImpl(cond, value, temp, label);
+}
+
+template <typename T>
+void MacroAssembler::branchValueIsNurseryCellImpl(Condition cond,
+ const T& value, Register temp,
+ Label* label) {
+ MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
+ MOZ_ASSERT(temp != ScratchReg &&
+ temp != ScratchReg2); // Both may be used internally.
+
+ Label done;
+ branchTestGCThing(Assembler::NotEqual, value,
+ cond == Assembler::Equal ? &done : label);
+
+ getGCThingValueChunk(value, temp);
+ branchPtr(InvertCondition(cond), Address(temp, gc::ChunkStoreBufferOffset),
+ ImmWord(0), label);
+
+ bind(&done);
+}
+
+void MacroAssembler::branchTestValue(Condition cond, const ValueOperand& lhs,
+ const Value& rhs, Label* label) {
+ MOZ_ASSERT(cond == Equal || cond == NotEqual);
+ vixl::UseScratchRegisterScope temps(this);
+ const ARMRegister scratch64 = temps.AcquireX();
+ MOZ_ASSERT(scratch64.asUnsized() != lhs.valueReg());
+ moveValue(rhs, ValueOperand(scratch64.asUnsized()));
+ Cmp(ARMRegister(lhs.valueReg(), 64), scratch64);
+ B(label, cond);
+}
+
+// ========================================================================
+// Memory access primitives.
+template <typename T>
+void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
+ MIRType valueType, const T& dest) {
+ MOZ_ASSERT(valueType < MIRType::Value);
+
+ if (valueType == MIRType::Double) {
+ boxDouble(value.reg().typedReg().fpu(), dest);
+ return;
+ }
+
+ if (value.constant()) {
+ storeValue(value.value(), dest);
+ } else {
+ storeValue(ValueTypeFromMIRType(valueType), value.reg().typedReg().gpr(),
+ dest);
+ }
+}
+
+template void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
+ MIRType valueType,
+ const Address& dest);
+template void MacroAssembler::storeUnboxedValue(
+ const ConstantOrRegister& value, MIRType valueType,
+ const BaseObjectElementIndex& dest);
+
+void MacroAssembler::comment(const char* msg) { Assembler::comment(msg); }
+
+// ========================================================================
+// wasm support
+
+CodeOffset MacroAssembler::wasmTrapInstruction() {
+ AutoForbidPoolsAndNops afp(this,
+ /* max number of instructions in scope = */ 1);
+ CodeOffset offs(currentOffset());
+ Unreachable();
+ return offs;
+}
+
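+// When Spectre index masking is enabled, the csel in the bounds checks
+// below conditionally zeroes the index based on the flags from the
+// comparison. This adds a data dependency on the bounds check, so that code
+// reached under transient misspeculation cannot observe an unclamped,
+// attacker-controlled index.
+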
+void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
+ Register boundsCheckLimit, Label* ok) {
+ branch32(cond, index, boundsCheckLimit, ok);
+ if (JitOptions.spectreIndexMasking) {
+ csel(ARMRegister(index, 32), vixl::wzr, ARMRegister(index, 32), cond);
+ }
+}
+
+void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
+ Address boundsCheckLimit, Label* ok) {
+ branch32(cond, index, boundsCheckLimit, ok);
+ if (JitOptions.spectreIndexMasking) {
+ csel(ARMRegister(index, 32), vixl::wzr, ARMRegister(index, 32), cond);
+ }
+}
+
+void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
+ Register64 boundsCheckLimit, Label* ok) {
+ branchPtr(cond, index.reg, boundsCheckLimit.reg, ok);
+ if (JitOptions.spectreIndexMasking) {
+ csel(ARMRegister(index.reg, 64), vixl::xzr, ARMRegister(index.reg, 64),
+ cond);
+ }
+}
+
+void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
+ Address boundsCheckLimit, Label* ok) {
+ branchPtr(InvertCondition(cond), boundsCheckLimit, index.reg, ok);
+ if (JitOptions.spectreIndexMasking) {
+ csel(ARMRegister(index.reg, 64), vixl::xzr, ARMRegister(index.reg, 64),
+ cond);
+ }
+}
+
+// FCVTZU behaves as follows:
+//
+// on NaN it produces zero
+// on too large it produces UINT_MAX (for appropriate type)
+// on too small it produces zero
+//
+// FCVTZS behaves as follows:
+//
+// on NaN it produces zero
+// on too large it produces INT_MAX (for appropriate type)
+// on too small it produces INT_MIN (ditto)
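+//
+// The non-saturating truncations below exploit this: after the convert, a
+// result of 0 or of the extreme value may indicate an out-of-range (or NaN)
+// input, so only those values take the out-of-line path. The check is a Cmp
+// against 0 chained with Ccmp: each Ccmp performs its comparison only if the
+// previous comparison was NotEqual, and otherwise forces the Z flag, so the
+// final Equal condition means "output is 0 or an extreme value", e.g.
+// output == 0 || output == UINT_MAX in the unsigned case.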
+
+void MacroAssembler::wasmTruncateDoubleToUInt32(FloatRegister input_,
+ Register output_,
+ bool isSaturating,
+ Label* oolEntry) {
+ ARMRegister output(output_, 32);
+ ARMFPRegister input(input_, 64);
+ Fcvtzu(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, -1, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ }
+}
+
+void MacroAssembler::wasmTruncateFloat32ToUInt32(FloatRegister input_,
+ Register output_,
+ bool isSaturating,
+ Label* oolEntry) {
+ ARMRegister output(output_, 32);
+ ARMFPRegister input(input_, 32);
+ Fcvtzu(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, -1, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ }
+}
+
+void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input_,
+ Register output_,
+ bool isSaturating,
+ Label* oolEntry) {
+ ARMRegister output(output_, 32);
+ ARMFPRegister input(input_, 64);
+ Fcvtzs(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, INT32_MAX, vixl::ZFlag, Assembler::NotEqual);
+ Ccmp(output, INT32_MIN, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ }
+}
+
+void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input_,
+ Register output_,
+ bool isSaturating,
+ Label* oolEntry) {
+ ARMRegister output(output_, 32);
+ ARMFPRegister input(input_, 32);
+ Fcvtzs(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, INT32_MAX, vixl::ZFlag, Assembler::NotEqual);
+ Ccmp(output, INT32_MIN, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ }
+}
+
+void MacroAssembler::wasmTruncateDoubleToUInt64(
+ FloatRegister input_, Register64 output_, bool isSaturating,
+ Label* oolEntry, Label* oolRejoin, FloatRegister tempDouble) {
+ MOZ_ASSERT(tempDouble.isInvalid());
+
+ ARMRegister output(output_.reg, 64);
+ ARMFPRegister input(input_, 64);
+ Fcvtzu(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, -1, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ bind(oolRejoin);
+ }
+}
+
+void MacroAssembler::wasmTruncateFloat32ToUInt64(
+ FloatRegister input_, Register64 output_, bool isSaturating,
+ Label* oolEntry, Label* oolRejoin, FloatRegister tempDouble) {
+ MOZ_ASSERT(tempDouble.isInvalid());
+
+ ARMRegister output(output_.reg, 64);
+ ARMFPRegister input(input_, 32);
+ Fcvtzu(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, -1, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ bind(oolRejoin);
+ }
+}
+
+void MacroAssembler::wasmTruncateDoubleToInt64(
+ FloatRegister input_, Register64 output_, bool isSaturating,
+ Label* oolEntry, Label* oolRejoin, FloatRegister tempDouble) {
+ MOZ_ASSERT(tempDouble.isInvalid());
+
+ ARMRegister output(output_.reg, 64);
+ ARMFPRegister input(input_, 64);
+ Fcvtzs(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, INT64_MAX, vixl::ZFlag, Assembler::NotEqual);
+ Ccmp(output, INT64_MIN, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ bind(oolRejoin);
+ }
+}
+
+void MacroAssembler::wasmTruncateFloat32ToInt64(
+ FloatRegister input_, Register64 output_, bool isSaturating,
+ Label* oolEntry, Label* oolRejoin, FloatRegister tempDouble) {
+ ARMRegister output(output_.reg, 64);
+ ARMFPRegister input(input_, 32);
+ Fcvtzs(output, input);
+ if (!isSaturating) {
+ Cmp(output, 0);
+ Ccmp(output, INT64_MAX, vixl::ZFlag, Assembler::NotEqual);
+ Ccmp(output, INT64_MIN, vixl::ZFlag, Assembler::NotEqual);
+ B(oolEntry, Assembler::Equal);
+ bind(oolRejoin);
+ }
+}
+
+void MacroAssembler::oolWasmTruncateCheckF32ToI32(FloatRegister input,
+ Register output,
+ TruncFlags flags,
+ wasm::BytecodeOffset off,
+ Label* rejoin) {
+ Label notNaN;
+ branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
+ wasmTrap(wasm::Trap::InvalidConversionToInteger, off);
+ bind(&notNaN);
+
+ Label isOverflow;
+ const float two_31 = -float(INT32_MIN);
+ ScratchFloat32Scope fpscratch(*this);
+ if (flags & TRUNC_UNSIGNED) {
+ loadConstantFloat32(two_31 * 2, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantFloat32(-1.0f, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThan, input, fpscratch, rejoin);
+ } else {
+ loadConstantFloat32(two_31, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantFloat32(-two_31, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch, rejoin);
+ }
+ bind(&isOverflow);
+ wasmTrap(wasm::Trap::IntegerOverflow, off);
+}
+
+void MacroAssembler::oolWasmTruncateCheckF64ToI32(FloatRegister input,
+ Register output,
+ TruncFlags flags,
+ wasm::BytecodeOffset off,
+ Label* rejoin) {
+ Label notNaN;
+ branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
+ wasmTrap(wasm::Trap::InvalidConversionToInteger, off);
+ bind(&notNaN);
+
+ Label isOverflow;
+ const double two_31 = -double(INT32_MIN);
+ ScratchDoubleScope fpscratch(*this);
+ if (flags & TRUNC_UNSIGNED) {
+ loadConstantDouble(two_31 * 2, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantDouble(-1.0, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThan, input, fpscratch, rejoin);
+ } else {
+ loadConstantDouble(two_31, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantDouble(-two_31 - 1, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThan, input, fpscratch, rejoin);
+ }
+ bind(&isOverflow);
+ wasmTrap(wasm::Trap::IntegerOverflow, off);
+}
+
+void MacroAssembler::oolWasmTruncateCheckF32ToI64(FloatRegister input,
+ Register64 output,
+ TruncFlags flags,
+ wasm::BytecodeOffset off,
+ Label* rejoin) {
+ Label notNaN;
+ branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
+ wasmTrap(wasm::Trap::InvalidConversionToInteger, off);
+ bind(&notNaN);
+
+ Label isOverflow;
+ const float two_63 = -float(INT64_MIN);
+ ScratchFloat32Scope fpscratch(*this);
+ if (flags & TRUNC_UNSIGNED) {
+ loadConstantFloat32(two_63 * 2, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantFloat32(-1.0f, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThan, input, fpscratch, rejoin);
+ } else {
+ loadConstantFloat32(two_63, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantFloat32(-two_63, fpscratch);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch, rejoin);
+ }
+ bind(&isOverflow);
+ wasmTrap(wasm::Trap::IntegerOverflow, off);
+}
+
+void MacroAssembler::oolWasmTruncateCheckF64ToI64(FloatRegister input,
+ Register64 output,
+ TruncFlags flags,
+ wasm::BytecodeOffset off,
+ Label* rejoin) {
+ Label notNaN;
+ branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
+ wasmTrap(wasm::Trap::InvalidConversionToInteger, off);
+ bind(&notNaN);
+
+ Label isOverflow;
+ const double two_63 = -double(INT64_MIN);
+ ScratchDoubleScope fpscratch(*this);
+ if (flags & TRUNC_UNSIGNED) {
+ loadConstantDouble(two_63 * 2, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantDouble(-1.0, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThan, input, fpscratch, rejoin);
+ } else {
+ loadConstantDouble(two_63, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
+ &isOverflow);
+ loadConstantDouble(-two_63, fpscratch);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch, rejoin);
+ }
+ bind(&isOverflow);
+ wasmTrap(wasm::Trap::IntegerOverflow, off);
+}
+
+void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
+ Register memoryBase, Register ptr,
+ AnyRegister output) {
+ wasmLoadImpl(access, memoryBase, ptr, output, Register64::Invalid());
+}
+
+void MacroAssembler::wasmLoadI64(const wasm::MemoryAccessDesc& access,
+ Register memoryBase, Register ptr,
+ Register64 output) {
+ wasmLoadImpl(access, memoryBase, ptr, AnyRegister(), output);
+}
+
+void MacroAssembler::wasmStore(const wasm::MemoryAccessDesc& access,
+ AnyRegister value, Register memoryBase,
+ Register ptr) {
+ wasmStoreImpl(access, value, Register64::Invalid(), memoryBase, ptr);
+}
+
+void MacroAssembler::wasmStoreI64(const wasm::MemoryAccessDesc& access,
+ Register64 value, Register memoryBase,
+ Register ptr) {
+ wasmStoreImpl(access, AnyRegister(), value, memoryBase, ptr);
+}
+
+void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
+ ExitFrameType type) {
+ // Wasm stubs use the native SP, not the PSP.
+
+ linkExitFrame(cxreg, scratch);
+
+ MOZ_RELEASE_ASSERT(sp.Is(GetStackPointer64()));
+
+ // SP has to be 16-byte aligned when we do a load/store, so push |type| twice
+ // and then add 8 bytes to SP. This leaves SP unaligned.
+ move32(Imm32(int32_t(type)), scratch);
+ push(scratch, scratch);
+ Add(sp, sp, 8);
+
+ // Despite the above assertion, it is possible for control to flow from here
+ // to the code generated by
+ // MacroAssemblerCompat::handleFailureWithHandlerTail without any
+ // intervening assignment to PSP. But handleFailureWithHandlerTail assumes
+ // that PSP is the active stack pointer. Hence the following is necessary
+ // for safety. Note we can't use initPseudoStackPtr here as that would
+ // generate no instructions.
+ Mov(PseudoStackPointer64, sp);
+}
+
+void MacroAssembler::widenInt32(Register r) {
+ move32To64ZeroExtend(r, Register64(r));
+}
+
+// ========================================================================
+// Convert floating point.
+
+bool MacroAssembler::convertUInt64ToDoubleNeedsTemp() { return false; }
+
+void MacroAssembler::convertUInt64ToDouble(Register64 src, FloatRegister dest,
+ Register temp) {
+ MOZ_ASSERT(temp == Register::Invalid());
+ Ucvtf(ARMFPRegister(dest, 64), ARMRegister(src.reg, 64));
+}
+
+void MacroAssembler::convertInt64ToDouble(Register64 src, FloatRegister dest) {
+ Scvtf(ARMFPRegister(dest, 64), ARMRegister(src.reg, 64));
+}
+
+void MacroAssembler::convertUInt64ToFloat32(Register64 src, FloatRegister dest,
+ Register temp) {
+ MOZ_ASSERT(temp == Register::Invalid());
+ Ucvtf(ARMFPRegister(dest, 32), ARMRegister(src.reg, 64));
+}
+
+void MacroAssembler::convertInt64ToFloat32(Register64 src, FloatRegister dest) {
+ Scvtf(ARMFPRegister(dest, 32), ARMRegister(src.reg, 64));
+}
+
+void MacroAssembler::convertIntPtrToDouble(Register src, FloatRegister dest) {
+ convertInt64ToDouble(Register64(src), dest);
+}
+
+// ========================================================================
+// Primitive atomic operations.
+
+// The computed MemOperand must be Reg+0 because the load/store exclusive
+// instructions only take a single pointer register.
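+//
+// For example, Address(x3, 16) materializes x3 + 16 into the scratch
+// register and returns MemOperand(scratch, 0), while Address(x3, 0) can use
+// x3 directly.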
+
+static MemOperand ComputePointerForAtomic(MacroAssembler& masm,
+ const Address& address,
+ Register scratch) {
+ if (address.offset == 0) {
+ return MemOperand(X(masm, address.base), 0);
+ }
+
+ masm.Add(X(scratch), X(masm, address.base), address.offset);
+ return MemOperand(X(scratch), 0);
+}
+
+static MemOperand ComputePointerForAtomic(MacroAssembler& masm,
+ const BaseIndex& address,
+ Register scratch) {
+ masm.Add(X(scratch), X(masm, address.base),
+ Operand(X(address.index), vixl::LSL, address.scale));
+ if (address.offset) {
+ masm.Add(X(scratch), X(scratch), address.offset);
+ }
+ return MemOperand(X(scratch), 0);
+}
+
+// This sign- or zero-extends |src| to targetWidth and leaves any higher
+// bits of |dest| zero.
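+//
+// Sbfm/Ubfm with immr = 0 and imms = 7, 15, or 31 are the canonical
+// encodings of sxtb/uxtb, sxth/uxth, and sxtw/uxtw respectively: they copy
+// bits [imms:0] of the source and sign- or zero-extend from there.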
+
+static void SignOrZeroExtend(MacroAssembler& masm, Scalar::Type srcType,
+ Width targetWidth, Register src, Register dest) {
+ bool signExtend = Scalar::isSignedIntType(srcType);
+
+ switch (Scalar::byteSize(srcType)) {
+ case 1:
+ if (signExtend) {
+ masm.Sbfm(R(dest, targetWidth), R(src, targetWidth), 0, 7);
+ } else {
+ masm.Ubfm(R(dest, targetWidth), R(src, targetWidth), 0, 7);
+ }
+ break;
+ case 2:
+ if (signExtend) {
+ masm.Sbfm(R(dest, targetWidth), R(src, targetWidth), 0, 15);
+ } else {
+ masm.Ubfm(R(dest, targetWidth), R(src, targetWidth), 0, 15);
+ }
+ break;
+ case 4:
+ if (targetWidth == Width::_64) {
+ if (signExtend) {
+ masm.Sbfm(X(dest), X(src), 0, 31);
+ } else {
+ masm.Ubfm(X(dest), X(src), 0, 31);
+ }
+ } else if (src != dest) {
+ masm.Mov(R(dest, targetWidth), R(src, targetWidth));
+ }
+ break;
+ case 8:
+ if (src != dest) {
+ masm.Mov(R(dest, targetWidth), R(src, targetWidth));
+ }
+ break;
+ default:
+ MOZ_CRASH();
+ }
+}
+
+// Exclusive-loads zero-extend their values to the full width of the X
+// register.
+//
+// Note that we've promised to leave the high bits of the 64-bit register
+// clear if the targetWidth is 32.
+
+static void LoadExclusive(MacroAssembler& masm,
+ const wasm::MemoryAccessDesc* access,
+ Scalar::Type srcType, Width targetWidth,
+ MemOperand ptr, Register dest) {
+ bool signExtend = Scalar::isSignedIntType(srcType);
+
+ // With this address form, a single native ldxr* will be emitted, and the
+ // AutoForbidPoolsAndNops ensures that the metadata is emitted at the address
+ // of the ldxr*.
+ MOZ_ASSERT(ptr.IsImmediateOffset() && ptr.offset() == 0);
+
+ switch (Scalar::byteSize(srcType)) {
+ case 1: {
+ {
+ AutoForbidPoolsAndNops afp(
+ &masm,
+ /* max number of instructions in scope = */ 1);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ masm.Ldxrb(W(dest), ptr);
+ }
+ if (signExtend) {
+ masm.Sbfm(R(dest, targetWidth), R(dest, targetWidth), 0, 7);
+ }
+ break;
+ }
+ case 2: {
+ {
+ AutoForbidPoolsAndNops afp(
+ &masm,
+ /* max number of instructions in scope = */ 1);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ masm.Ldxrh(W(dest), ptr);
+ }
+ if (signExtend) {
+ masm.Sbfm(R(dest, targetWidth), R(dest, targetWidth), 0, 15);
+ }
+ break;
+ }
+ case 4: {
+ {
+ AutoForbidPoolsAndNops afp(
+ &masm,
+ /* max number of instructions in scope = */ 1);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ masm.Ldxr(W(dest), ptr);
+ }
+ if (targetWidth == Width::_64 && signExtend) {
+ masm.Sbfm(X(dest), X(dest), 0, 31);
+ }
+ break;
+ }
+ case 8: {
+ {
+ AutoForbidPoolsAndNops afp(
+ &masm,
+ /* max number of instructions in scope = */ 1);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ masm.Ldxr(X(dest), ptr);
+ }
+ break;
+ }
+ default: {
+ MOZ_CRASH();
+ }
+ }
+}
+
+static void StoreExclusive(MacroAssembler& masm, Scalar::Type type,
+ Register status, Register src, MemOperand ptr) {
+ switch (Scalar::byteSize(type)) {
+ case 1:
+ masm.Stxrb(W(status), W(src), ptr);
+ break;
+ case 2:
+ masm.Stxrh(W(status), W(src), ptr);
+ break;
+ case 4:
+ masm.Stxr(W(status), W(src), ptr);
+ break;
+ case 8:
+ masm.Stxr(W(status), X(src), ptr);
+ break;
+ }
+}
+
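+// kAtomics corresponds to the ARMv8.1 LSE extension (FEAT_LSE), which adds
+// single-instruction atomics such as Casal, Swpal, and Ldaddal; without it
+// the code below falls back to exclusive load/store (LL/SC) loops.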
+static bool HasAtomicInstructions(MacroAssembler& masm) {
+ return masm.asVIXL().GetCPUFeatures()->Has(vixl::CPUFeatures::kAtomics);
+}
+
+static inline bool SupportedAtomicInstructionOperands(Scalar::Type type,
+ Width targetWidth) {
+ if (targetWidth == Width::_32) {
+ return byteSize(type) <= 4;
+ }
+ if (targetWidth == Width::_64) {
+ return byteSize(type) == 8;
+ }
+ return false;
+}
+
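+// Schematically, the LL/SC fallback path of CompareExchange below expands
+// to:
+//
+//     <barrier from memoryBarrierBefore, if the access is synchronized>
+//   again:
+//     ldxr  output, [ptr]
+//     cmp   output, oldval        ; oldval extended to the target width
+//     b.ne  done
+//     stxr  status, newval, [ptr]
+//     cbnz  status, again         ; retry if the exclusive store failed
+//   done:
+//     <barrier from memoryBarrierAfter>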
+template <typename T>
+static void CompareExchange(MacroAssembler& masm,
+ const wasm::MemoryAccessDesc* access,
+ Scalar::Type type, Width targetWidth,
+ const Synchronization& sync, const T& mem,
+ Register oldval, Register newval, Register output) {
+ MOZ_ASSERT(oldval != output && newval != output);
+
+ vixl::UseScratchRegisterScope temps(&masm);
+
+ Register ptrScratch = temps.AcquireX().asUnsized();
+ MemOperand ptr = ComputePointerForAtomic(masm, mem, ptrScratch);
+
+ MOZ_ASSERT(ptr.base().asUnsized() != output);
+
+ if (HasAtomicInstructions(masm) &&
+ SupportedAtomicInstructionOperands(type, targetWidth)) {
+ masm.Mov(X(output), X(oldval));
+    // Casal uses the same underlying atomic mechanism as Ldxr/Stxr and
+    // provides the same guarantees within the "Inner Shareable" domain.
+    // gen_cmpxchg in GenerateAtomicOperations.py has not been updated to
+    // use it.
+ masm.memoryBarrierBefore(sync);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ switch (byteSize(type)) {
+ case 1:
+ masm.Casalb(R(output, targetWidth), R(newval, targetWidth), ptr);
+ break;
+ case 2:
+ masm.Casalh(R(output, targetWidth), R(newval, targetWidth), ptr);
+ break;
+ case 4:
+ case 8:
+ masm.Casal(R(output, targetWidth), R(newval, targetWidth), ptr);
+ break;
+ default:
+ MOZ_CRASH("CompareExchange unsupported type");
+ }
+ masm.memoryBarrierAfter(sync);
+ SignOrZeroExtend(masm, type, targetWidth, output, output);
+ return;
+ }
+
+  // The target doesn't support single-instruction (LSE) atomics, so
+  // generate an LL/SC loop, which requires only AArch64 v8.0.
+ Label again;
+ Label done;
+
+ // NOTE: the generated code must match the assembly code in gen_cmpxchg in
+ // GenerateAtomicOperations.py
+ masm.memoryBarrierBefore(sync);
+
+ Register scratch = temps.AcquireX().asUnsized();
+
+ masm.bind(&again);
+ SignOrZeroExtend(masm, type, targetWidth, oldval, scratch);
+ LoadExclusive(masm, access, type, targetWidth, ptr, output);
+ masm.Cmp(R(output, targetWidth), R(scratch, targetWidth));
+ masm.B(&done, MacroAssembler::NotEqual);
+ StoreExclusive(masm, type, scratch, newval, ptr);
+ masm.Cbnz(W(scratch), &again);
+ masm.bind(&done);
+
+ masm.memoryBarrierAfter(sync);
+}
+
+template <typename T>
+static void AtomicExchange(MacroAssembler& masm,
+ const wasm::MemoryAccessDesc* access,
+ Scalar::Type type, Width targetWidth,
+ const Synchronization& sync, const T& mem,
+ Register value, Register output) {
+ MOZ_ASSERT(value != output);
+
+ vixl::UseScratchRegisterScope temps(&masm);
+
+ Register ptrScratch = temps.AcquireX().asUnsized();
+ MemOperand ptr = ComputePointerForAtomic(masm, mem, ptrScratch);
+
+ if (HasAtomicInstructions(masm) &&
+ SupportedAtomicInstructionOperands(type, targetWidth)) {
+    // Swpal uses the same underlying atomic mechanism as Ldxr/Stxr and
+    // provides the same guarantees within the "Inner Shareable" domain.
+    // gen_exchange in GenerateAtomicOperations.py has not been updated to
+    // use it.
+ masm.memoryBarrierBefore(sync);
+ if (access) {
+ masm.append(*access, masm.currentOffset());
+ }
+ switch (byteSize(type)) {
+ case 1:
+ masm.Swpalb(R(value, targetWidth), R(output, targetWidth), ptr);
+ break;
+ case 2:
+ masm.Swpalh(R(value, targetWidth), R(output, targetWidth), ptr);
+ break;
+ case 4:
+ case 8:
+ masm.Swpal(R(value, targetWidth), R(output, targetWidth), ptr);
+ break;
+ default:
+ MOZ_CRASH("AtomicExchange unsupported type");
+ }
+ masm.memoryBarrierAfter(sync);
+ SignOrZeroExtend(masm, type, targetWidth, output, output);
+ return;
+ }
+
+  // The target doesn't support single-instruction (LSE) atomics, so
+  // generate an LL/SC loop, which requires only AArch64 v8.0.
+ Label again;
+
+ // NOTE: the generated code must match the assembly code in gen_exchange in
+ // GenerateAtomicOperations.py
+ masm.memoryBarrierBefore(sync);
+
+ Register scratch = temps.AcquireX().asUnsized();
+
+ masm.bind(&again);
+ LoadExclusive(masm, access, type, targetWidth, ptr, output);
+ StoreExclusive(masm, type, scratch, value, ptr);
+ masm.Cbnz(W(scratch), &again);
+
+ masm.memoryBarrierAfter(sync);
+}
+
+template <bool wantResult, typename T>
+static void AtomicFetchOp(MacroAssembler& masm,
+ const wasm::MemoryAccessDesc* access,
+ Scalar::Type type, Width targetWidth,
+ const Synchronization& sync, AtomicOp op,
+ const T& mem, Register value, Register temp,
+ Register output) {
+ MOZ_ASSERT(value != output);
+ MOZ_ASSERT(value != temp);
+ MOZ_ASSERT_IF(wantResult, output != temp);
+
+ vixl::UseScratchRegisterScope temps(&masm);
+
+ Register ptrScratch = temps.AcquireX().asUnsized();
+ MemOperand ptr = ComputePointerForAtomic(masm, mem, ptrScratch);
+
+ if (HasAtomicInstructions(masm) &&
+ SupportedAtomicInstructionOperands(type, targetWidth) &&
+ !isFloatingType(type)) {
+    // The LdXXXal/StXXXl instructions use the same underlying atomic
+    // mechanism as Ldxr/Stxr and provide the same guarantees within the
+    // "Inner Shareable" domain.
+    // gen_fetchop in GenerateAtomicOperations.py has not been updated to
+    // use them.
+ masm.memoryBarrierBefore(sync);
+
+#define FETCH_OP_CASE(op, arg) \
+ if (access) { \
+ masm.append(*access, masm.currentOffset()); \
+ } \
+ switch (byteSize(type)) { \
+ case 1: \
+ if (wantResult) { \
+ masm.Ld##op##alb(R(arg, targetWidth), R(output, targetWidth), ptr); \
+ } else { \
+ masm.St##op##lb(R(arg, targetWidth), ptr); \
+ } \
+ break; \
+ case 2: \
+ if (wantResult) { \
+ masm.Ld##op##alh(R(arg, targetWidth), R(output, targetWidth), ptr); \
+ } else { \
+ masm.St##op##lh(R(arg, targetWidth), ptr); \
+ } \
+ break; \
+ case 4: \
+ case 8: \
+ if (wantResult) { \
+ masm.Ld##op##al(R(arg, targetWidth), R(output, targetWidth), ptr); \
+ } else { \
+ masm.St##op##l(R(arg, targetWidth), ptr); \
+ } \
+ break; \
+ default: \
+ MOZ_CRASH("AtomicFetchOp unsupported type"); \
+ }
+
+ switch (op) {
+ case AtomicFetchAddOp:
+ FETCH_OP_CASE(add, value);
+ break;
+ case AtomicFetchSubOp: {
+ Register scratch = temps.AcquireX().asUnsized();
+ masm.Neg(X(scratch), X(value));
+ FETCH_OP_CASE(add, scratch);
+ break;
+ }
+ case AtomicFetchAndOp: {
+ Register scratch = temps.AcquireX().asUnsized();
+ masm.Eor(X(scratch), X(value), Operand(~0));
+ FETCH_OP_CASE(clr, scratch);
+ break;
+ }
+ case AtomicFetchOrOp:
+ FETCH_OP_CASE(set, value);
+ break;
+ case AtomicFetchXorOp:
+ FETCH_OP_CASE(eor, value);
+ break;
+ }
+ masm.memoryBarrierAfter(sync);
+ if (wantResult) {
+ SignOrZeroExtend(masm, type, targetWidth, output, output);
+ }
+ return;
+ }
+
+#undef FETCH_OP_CASE
+
+  // The target doesn't support single-instruction (LSE) atomics, so
+  // generate an LL/SC loop, which requires only AArch64 v8.0.
+ Label again;
+
+ // NOTE: the generated code must match the assembly code in gen_fetchop in
+ // GenerateAtomicOperations.py
+ masm.memoryBarrierBefore(sync);
+
+ Register scratch = temps.AcquireX().asUnsized();
+
+ masm.bind(&again);
+ LoadExclusive(masm, access, type, targetWidth, ptr, output);
+ switch (op) {
+ case AtomicFetchAddOp:
+ masm.Add(X(temp), X(output), X(value));
+ break;
+ case AtomicFetchSubOp:
+ masm.Sub(X(temp), X(output), X(value));
+ break;
+ case AtomicFetchAndOp:
+ masm.And(X(temp), X(output), X(value));
+ break;
+ case AtomicFetchOrOp:
+ masm.Orr(X(temp), X(output), X(value));
+ break;
+ case AtomicFetchXorOp:
+ masm.Eor(X(temp), X(output), X(value));
+ break;
+ }
+ StoreExclusive(masm, type, scratch, temp, ptr);
+ masm.Cbnz(W(scratch), &again);
+ if (wantResult) {
+ SignOrZeroExtend(masm, type, targetWidth, output, output);
+ }
+
+ masm.memoryBarrierAfter(sync);
+}
+
+void MacroAssembler::compareExchange(Scalar::Type type,
+ const Synchronization& sync,
+ const Address& mem, Register oldval,
+ Register newval, Register output) {
+ CompareExchange(*this, nullptr, type, Width::_32, sync, mem, oldval, newval,
+ output);
+}
+
+void MacroAssembler::compareExchange(Scalar::Type type,
+ const Synchronization& sync,
+ const BaseIndex& mem, Register oldval,
+ Register newval, Register output) {
+ CompareExchange(*this, nullptr, type, Width::_32, sync, mem, oldval, newval,
+ output);
+}
+
+void MacroAssembler::compareExchange64(const Synchronization& sync,
+ const Address& mem, Register64 expect,
+ Register64 replace, Register64 output) {
+ CompareExchange(*this, nullptr, Scalar::Int64, Width::_64, sync, mem,
+ expect.reg, replace.reg, output.reg);
+}
+
+void MacroAssembler::compareExchange64(const Synchronization& sync,
+ const BaseIndex& mem, Register64 expect,
+ Register64 replace, Register64 output) {
+ CompareExchange(*this, nullptr, Scalar::Int64, Width::_64, sync, mem,
+ expect.reg, replace.reg, output.reg);
+}
+
+void MacroAssembler::atomicExchange64(const Synchronization& sync,
+ const Address& mem, Register64 value,
+ Register64 output) {
+ AtomicExchange(*this, nullptr, Scalar::Int64, Width::_64, sync, mem,
+ value.reg, output.reg);
+}
+
+void MacroAssembler::atomicExchange64(const Synchronization& sync,
+ const BaseIndex& mem, Register64 value,
+ Register64 output) {
+ AtomicExchange(*this, nullptr, Scalar::Int64, Width::_64, sync, mem,
+ value.reg, output.reg);
+}
+
+void MacroAssembler::atomicFetchOp64(const Synchronization& sync, AtomicOp op,
+ Register64 value, const Address& mem,
+ Register64 temp, Register64 output) {
+ AtomicFetchOp<true>(*this, nullptr, Scalar::Int64, Width::_64, sync, op, mem,
+ value.reg, temp.reg, output.reg);
+}
+
+void MacroAssembler::atomicFetchOp64(const Synchronization& sync, AtomicOp op,
+ Register64 value, const BaseIndex& mem,
+ Register64 temp, Register64 output) {
+ AtomicFetchOp<true>(*this, nullptr, Scalar::Int64, Width::_64, sync, op, mem,
+ value.reg, temp.reg, output.reg);
+}
+
+void MacroAssembler::atomicEffectOp64(const Synchronization& sync, AtomicOp op,
+ Register64 value, const Address& mem,
+ Register64 temp) {
+ AtomicFetchOp<false>(*this, nullptr, Scalar::Int64, Width::_64, sync, op, mem,
+ value.reg, temp.reg, temp.reg);
+}
+
+void MacroAssembler::atomicEffectOp64(const Synchronization& sync, AtomicOp op,
+ Register64 value, const BaseIndex& mem,
+ Register64 temp) {
+ AtomicFetchOp<false>(*this, nullptr, Scalar::Int64, Width::_64, sync, op, mem,
+ value.reg, temp.reg, temp.reg);
+}
+
+void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
+ const Address& mem, Register oldval,
+ Register newval, Register output) {
+ CompareExchange(*this, &access, access.type(), Width::_32, access.sync(), mem,
+ oldval, newval, output);
+}
+
+void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem, Register oldval,
+ Register newval, Register output) {
+ CompareExchange(*this, &access, access.type(), Width::_32, access.sync(), mem,
+ oldval, newval, output);
+}
+
+void MacroAssembler::atomicExchange(Scalar::Type type,
+ const Synchronization& sync,
+ const Address& mem, Register value,
+ Register output) {
+ AtomicExchange(*this, nullptr, type, Width::_32, sync, mem, value, output);
+}
+
+void MacroAssembler::atomicExchange(Scalar::Type type,
+ const Synchronization& sync,
+ const BaseIndex& mem, Register value,
+ Register output) {
+ AtomicExchange(*this, nullptr, type, Width::_32, sync, mem, value, output);
+}
+
+void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+ const Address& mem, Register value,
+ Register output) {
+ AtomicExchange(*this, &access, access.type(), Width::_32, access.sync(), mem,
+ value, output);
+}
+
+void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem, Register value,
+ Register output) {
+ AtomicExchange(*this, &access, access.type(), Width::_32, access.sync(), mem,
+ value, output);
+}
+
+void MacroAssembler::atomicFetchOp(Scalar::Type type,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const Address& mem,
+ Register temp, Register output) {
+ AtomicFetchOp<true>(*this, nullptr, type, Width::_32, sync, op, mem, value,
+ temp, output);
+}
+
+void MacroAssembler::atomicFetchOp(Scalar::Type type,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const BaseIndex& mem,
+ Register temp, Register output) {
+ AtomicFetchOp<true>(*this, nullptr, type, Width::_32, sync, op, mem, value,
+ temp, output);
+}
+
+void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register value,
+ const Address& mem, Register temp,
+ Register output) {
+ AtomicFetchOp<true>(*this, &access, access.type(), Width::_32, access.sync(),
+ op, mem, value, temp, output);
+}
+
+void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register value,
+ const BaseIndex& mem, Register temp,
+ Register output) {
+ AtomicFetchOp<true>(*this, &access, access.type(), Width::_32, access.sync(),
+ op, mem, value, temp, output);
+}
+
+void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register value,
+ const Address& mem, Register temp) {
+ AtomicFetchOp<false>(*this, &access, access.type(), Width::_32, access.sync(),
+ op, mem, value, temp, temp);
+}
+
+void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register value,
+ const BaseIndex& mem, Register temp) {
+ AtomicFetchOp<false>(*this, &access, access.type(), Width::_32, access.sync(),
+ op, mem, value, temp, temp);
+}
+
+void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
+ const Address& mem,
+ Register64 expect,
+ Register64 replace,
+ Register64 output) {
+ CompareExchange(*this, &access, Scalar::Int64, Width::_64, access.sync(), mem,
+ expect.reg, replace.reg, output.reg);
+}
+
+void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem,
+ Register64 expect,
+ Register64 replace,
+ Register64 output) {
+ CompareExchange(*this, &access, Scalar::Int64, Width::_64, access.sync(), mem,
+ expect.reg, replace.reg, output.reg);
+}
+
+void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
+ const Address& mem, Register64 value,
+ Register64 output) {
+ AtomicExchange(*this, &access, Scalar::Int64, Width::_64, access.sync(), mem,
+ value.reg, output.reg);
+}
+
+void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem,
+ Register64 value, Register64 output) {
+ AtomicExchange(*this, &access, Scalar::Int64, Width::_64, access.sync(), mem,
+ value.reg, output.reg);
+}
+
+void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register64 value,
+ const Address& mem, Register64 temp,
+ Register64 output) {
+ AtomicFetchOp<true>(*this, &access, Scalar::Int64, Width::_64, access.sync(),
+ op, mem, value.reg, temp.reg, output.reg);
+}
+
+void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register64 value,
+ const BaseIndex& mem, Register64 temp,
+ Register64 output) {
+ AtomicFetchOp<true>(*this, &access, Scalar::Int64, Width::_64, access.sync(),
+ op, mem, value.reg, temp.reg, output.reg);
+}
+
+void MacroAssembler::wasmAtomicEffectOp64(const wasm::MemoryAccessDesc& access,
+ AtomicOp op, Register64 value,
+ const BaseIndex& mem,
+ Register64 temp) {
+ AtomicFetchOp<false>(*this, &access, Scalar::Int64, Width::_64, access.sync(),
+ op, mem, value.reg, temp.reg, temp.reg);
+}
+
+// ========================================================================
+// JS atomic operations.
+
+template <typename T>
+static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
+ const Synchronization& sync, const T& mem,
+ Register oldval, Register newval, Register temp,
+ AnyRegister output) {
+ if (arrayType == Scalar::Uint32) {
+ masm.compareExchange(arrayType, sync, mem, oldval, newval, temp);
+ masm.convertUInt32ToDouble(temp, output.fpu());
+ } else {
+ masm.compareExchange(arrayType, sync, mem, oldval, newval, output.gpr());
+ }
+}
+
+void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
+ const Synchronization& sync,
+ const Address& mem, Register oldval,
+ Register newval, Register temp,
+ AnyRegister output) {
+ CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
+}
+
+void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
+ const Synchronization& sync,
+ const BaseIndex& mem, Register oldval,
+ Register newval, Register temp,
+ AnyRegister output) {
+ CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
+}
+
+template <typename T>
+static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
+ const Synchronization& sync, const T& mem,
+ Register value, Register temp,
+ AnyRegister output) {
+ if (arrayType == Scalar::Uint32) {
+ masm.atomicExchange(arrayType, sync, mem, value, temp);
+ masm.convertUInt32ToDouble(temp, output.fpu());
+ } else {
+ masm.atomicExchange(arrayType, sync, mem, value, output.gpr());
+ }
+}
+
+void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
+ const Synchronization& sync,
+ const Address& mem, Register value,
+ Register temp, AnyRegister output) {
+ AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
+}
+
+void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
+ const Synchronization& sync,
+ const BaseIndex& mem, Register value,
+ Register temp, AnyRegister output) {
+ AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
+}
+
+template <typename T>
+static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const T& mem, Register temp1,
+ Register temp2, AnyRegister output) {
+ if (arrayType == Scalar::Uint32) {
+ masm.atomicFetchOp(arrayType, sync, op, value, mem, temp2, temp1);
+ masm.convertUInt32ToDouble(temp1, output.fpu());
+ } else {
+ masm.atomicFetchOp(arrayType, sync, op, value, mem, temp1, output.gpr());
+ }
+}
+
+void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const Address& mem,
+ Register temp1, Register temp2,
+ AnyRegister output) {
+ AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
+}
+
+void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const BaseIndex& mem,
+ Register temp1, Register temp2,
+ AnyRegister output) {
+ AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
+}
+
+void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const BaseIndex& mem,
+ Register temp) {
+ AtomicFetchOp<false>(*this, nullptr, arrayType, Width::_32, sync, op, mem,
+ value, temp, temp);
+}
+
+void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
+ const Synchronization& sync, AtomicOp op,
+ Register value, const Address& mem,
+ Register temp) {
+ AtomicFetchOp<false>(*this, nullptr, arrayType, Width::_32, sync, op, mem,
+ value, temp, temp);
+}
+
+void MacroAssembler::flexibleQuotient32(Register rhs, Register srcDest,
+ bool isUnsigned,
+ const LiveRegisterSet&) {
+ quotient32(rhs, srcDest, isUnsigned);
+}
+
+void MacroAssembler::flexibleRemainder32(Register rhs, Register srcDest,
+ bool isUnsigned,
+ const LiveRegisterSet&) {
+ remainder32(rhs, srcDest, isUnsigned);
+}
+
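+// AArch64 has no combined divide-and-remainder instruction, so the
+// remainder is recovered as src - quotient * rhs (the Mul/Sub pair below).
+// For example, srcDest = 7 and rhs = 3 yield quotient 2 and remainder
+// 7 - 2 * 3 = 1.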
+void MacroAssembler::flexibleDivMod32(Register rhs, Register srcDest,
+ Register remOutput, bool isUnsigned,
+ const LiveRegisterSet&) {
+ vixl::UseScratchRegisterScope temps(this);
+ ARMRegister scratch = temps.AcquireW();
+ ARMRegister src = temps.AcquireW();
+
+ // Preserve src for remainder computation
+ Mov(src, ARMRegister(srcDest, 32));
+
+ if (isUnsigned) {
+ Udiv(ARMRegister(srcDest, 32), src, ARMRegister(rhs, 32));
+ } else {
+ Sdiv(ARMRegister(srcDest, 32), src, ARMRegister(rhs, 32));
+ }
+ // Compute remainder
+ Mul(scratch, ARMRegister(srcDest, 32), ARMRegister(rhs, 32));
+ Sub(ARMRegister(remOutput, 32), src, scratch);
+}
+
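+// adr forms a PC-relative address from a signed 21-bit offset, so the
+// patchable move below can reach targets within ±1MB of the instruction;
+// patchNearAddressMove asserts this via IsInt21.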
+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
+ AutoForbidPoolsAndNops afp(this,
+ /* max number of instructions in scope = */ 1);
+ CodeOffset offset(currentOffset());
+ adr(ARMRegister(dest, 64), 0, LabelDoc());
+ return offset;
+}
+
+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
+ CodeLocationLabel target) {
+ ptrdiff_t off = target - loc;
+ MOZ_RELEASE_ASSERT(vixl::IsInt21(off));
+
+ Instruction* cur = reinterpret_cast<Instruction*>(loc.raw());
+ MOZ_ASSERT(cur->IsADR());
+
+ vixl::Register rd = vixl::Register::XRegFromCode(cur->Rd());
+ adr(cur, rd, off);
+}
+
+// ========================================================================
+// Spectre Mitigations.
+
+void MacroAssembler::speculationBarrier() {
+ // Conditional speculation barrier.
+ csdb();
+}
+
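+// The rounding helpers below share an idiom for the Int32 range check:
+// after converting into a 64-bit register, Cmp(o64, Operand(o64, SXTW))
+// compares the full 64-bit result against its own low 32 bits
+// sign-extended. The two agree exactly when the value fits in an Int32, so
+// NotEqual branches to the failure path.
+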
+void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister iFlt(src, 32);
+ ARMRegister o64(dest, 64);
+ ARMRegister o32(dest, 32);
+
+ Label handleZero;
+ Label fin;
+
+ // Handle ±0 and NaN first.
+ Fcmp(iFlt, 0.0);
+ B(Assembler::Equal, &handleZero);
+ // NaN is always a bail condition, just bail directly.
+ B(Assembler::Overflow, fail);
+
+ // Round towards negative infinity.
+ Fcvtms(o64, iFlt);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(o64, Operand(o64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(o64, o64);
+ B(&fin);
+
+ bind(&handleZero);
+  // Move the float's bits into the output reg; if they are non-zero, the
+  // original value was -0.0.
+ Fmov(o32, iFlt);
+ Cbnz(o32, fail);
+ bind(&fin);
+}
+
+void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister iDbl(src, 64);
+ ARMRegister o64(dest, 64);
+ ARMRegister o32(dest, 32);
+
+ Label handleZero;
+ Label fin;
+
+ // Handle ±0 and NaN first.
+ Fcmp(iDbl, 0.0);
+ B(Assembler::Equal, &handleZero);
+ // NaN is always a bail condition, just bail directly.
+ B(Assembler::Overflow, fail);
+
+ // Round towards negative infinity.
+ Fcvtms(o64, iDbl);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(o64, Operand(o64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(o64, o64);
+ B(&fin);
+
+ bind(&handleZero);
+  // Move the double's bits into the output reg; if they are non-zero, the
+  // original value was -0.0.
+ Fmov(o64, iDbl);
+ Cbnz(o64, fail);
+ bind(&fin);
+}
+
+void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister iFlt(src, 32);
+ ARMRegister o64(dest, 64);
+ ARMRegister o32(dest, 32);
+
+ Label handleZero;
+ Label fin;
+
+ // Round towards positive infinity.
+ Fcvtps(o64, iFlt);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(o64, Operand(o64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // We have to check for (-1, -0] and NaN when the result is zero.
+ Cbz(o64, &handleZero);
+
+ // Clear upper 32 bits.
+ Uxtw(o64, o64);
+ B(&fin);
+
+ // Bail if the input is in (-1, -0] or NaN.
+ bind(&handleZero);
+  // Move the float's bits into the output reg; if they are non-zero, the
+  // original value wasn't +0.0.
+ Fmov(o32, iFlt);
+ Cbnz(o32, fail);
+ bind(&fin);
+}
+
+void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister iDbl(src, 64);
+ ARMRegister o64(dest, 64);
+ ARMRegister o32(dest, 32);
+
+ Label handleZero;
+ Label fin;
+
+ // Round towards positive infinity.
+ Fcvtps(o64, iDbl);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(o64, Operand(o64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // We have to check for (-1, -0] and NaN when the result is zero.
+ Cbz(o64, &handleZero);
+
+ // Clear upper 32 bits.
+ Uxtw(o64, o64);
+ B(&fin);
+
+ // Bail if the input is in (-1, -0] or NaN.
+ bind(&handleZero);
+  // Move the double's bits into the output reg; if they are non-zero, the
+  // original value wasn't +0.0.
+ Fmov(o64, iDbl);
+ Cbnz(o64, fail);
+ bind(&fin);
+}
+
+void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister src32(src, 32);
+ ARMRegister dest32(dest, 32);
+ ARMRegister dest64(dest, 64);
+
+ Label done, zeroCase;
+
+ // Convert scalar to signed 64-bit fixed-point, rounding toward zero.
+ // In the case of overflow, the output is saturated.
+ // In the case of NaN and -0, the output is zero.
+ Fcvtzs(dest64, src32);
+
+ // If the output was zero, worry about special cases.
+ Cbz(dest64, &zeroCase);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(dest64, Operand(dest64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(dest64, dest64);
+
+ // If the output was non-zero and wasn't saturated, just return it.
+ B(&done);
+
+ // Handle the case of a zero output:
+ // 1. The input may have been NaN, requiring a failure.
+ // 2. The input may have been in (-1,-0], requiring a failure.
+ {
+ bind(&zeroCase);
+
+ // Combine test for negative and NaN values using a single bitwise
+ // operation.
+ //
+ // | Decimal number | Bitwise representation |
+ // |----------------|------------------------|
+ // | -0 | 8000'0000 |
+ // | +0 | 0000'0000 |
+ // | +1 | 3f80'0000 |
+ // | NaN (or +Inf) | 7fyx'xxxx, y >= 8 |
+ // | -NaN (or -Inf) | ffyx'xxxx, y >= 8 |
+ //
+    // If either of the two most significant bits is set, the number isn't
+    // in [0, 1). (Recall that floating-point numbers, except for NaN, are
+    // strictly ordered when comparing their bitwise representations as
+    // signed integers.)
+
+ Fmov(dest32, src32);
+ Lsr(dest32, dest32, 30);
+ Cbnz(dest32, fail);
+ }
+
+ bind(&done);
+}
+
+void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
+ Label* fail) {
+ ARMFPRegister src64(src, 64);
+ ARMRegister dest64(dest, 64);
+ ARMRegister dest32(dest, 32);
+
+ Label done, zeroCase;
+
+ // Convert scalar to signed 64-bit fixed-point, rounding toward zero.
+ // In the case of overflow, the output is saturated.
+ // In the case of NaN and -0, the output is zero.
+ Fcvtzs(dest64, src64);
+
+ // If the output was zero, worry about special cases.
+ Cbz(dest64, &zeroCase);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(dest64, Operand(dest64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(dest64, dest64);
+
+ // If the output was non-zero and wasn't saturated, just return it.
+ B(&done);
+
+ // Handle the case of a zero output:
+ // 1. The input may have been NaN, requiring a failure.
+ // 2. The input may have been in (-1,-0], requiring a failure.
+ {
+ bind(&zeroCase);
+
+ // Combine test for negative and NaN values using a single bitwise
+ // operation.
+ //
+ // | Decimal number | Bitwise representation |
+ // |----------------|------------------------|
+ // | -0 | 8000'0000'0000'0000 |
+ // | +0 | 0000'0000'0000'0000 |
+ // | +1 | 3ff0'0000'0000'0000 |
+ // | NaN (or +Inf) | 7ffx'xxxx'xxxx'xxxx |
+ // | -NaN (or -Inf) | fffx'xxxx'xxxx'xxxx |
+ //
+    // If either of the two most significant bits is set, the number isn't
+    // in [0, 1). (Recall that floating-point numbers, except for NaN, are
+    // strictly ordered when comparing their bitwise representations as
+    // signed integers.)
+
+ Fmov(dest64, src64);
+ Lsr(dest64, dest64, 62);
+ Cbnz(dest64, fail);
+ }
+
+ bind(&done);
+}
+
+void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
+ FloatRegister temp, Label* fail) {
+ ARMFPRegister src32(src, 32);
+ ARMRegister dest32(dest, 32);
+ ARMRegister dest64(dest, 64);
+
+ Label negative, saturated, done;
+
+ // Branch to a slow path if input < 0.0 due to complicated rounding rules.
+ // Note that Fcmp with NaN unsets the negative flag.
+ Fcmp(src32, 0.0);
+ B(&negative, Assembler::Condition::lo);
+
+ // Handle the simple case of a positive input, and also -0 and NaN.
+ // Rounding proceeds with consideration of the fractional part of the input:
+ // 1. If > 0.5, round to integer with higher absolute value (so, up).
+ // 2. If < 0.5, round to integer with lower absolute value (so, down).
+ // 3. If = 0.5, round to +Infinity (so, up).
+ {
+ // Convert to signed 64-bit integer, rounding halfway cases away from zero.
+ // In the case of overflow, the output is saturated.
+ // In the case of NaN and -0, the output is zero.
+ Fcvtas(dest64, src32);
+
+ // In the case of zero, the input may have been NaN or -0, which must bail.
+ Cbnz(dest64, &saturated);
+
+ // Combine test for -0 and NaN values using a single bitwise operation.
+ // See truncFloat32ToInt32 for an explanation.
+ Fmov(dest32, src32);
+ Lsr(dest32, dest32, 30);
+ Cbnz(dest32, fail);
+
+ B(&done);
+ }
+
+ // Handle the complicated case of a negative input.
+ // Rounding proceeds with consideration of the fractional part of the input:
+ // 1. If > 0.5, round to integer with higher absolute value (so, down).
+ // 2. If < 0.5, round to integer with lower absolute value (so, up).
+ // 3. If = 0.5, round to +Infinity (so, up).
+ bind(&negative);
+ {
+ // Inputs in [-0.5, 0) are rounded to -0. Fail.
+ loadConstantFloat32(-0.5f, temp);
+ branchFloat(Assembler::DoubleGreaterThanOrEqual, src, temp, fail);
+
+    // Other negative inputs need the biggest float less than 0.5 added.
+ loadConstantFloat32(GetBiggestNumberLessThan(0.5f), temp);
+ addFloat32(src, temp);
+
+ // Round all values toward -Infinity.
+ // In the case of overflow, the output is saturated.
+ // NaN and -0 are already handled by the "positive number" path above.
+ Fcvtms(dest64, temp);
+ }
+
+ bind(&saturated);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(dest64, Operand(dest64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(dest64, dest64);
+
+ bind(&done);
+}
+
+void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
+ FloatRegister temp, Label* fail) {
+ ARMFPRegister src64(src, 64);
+ ARMRegister dest64(dest, 64);
+ ARMRegister dest32(dest, 32);
+
+ Label negative, saturated, done;
+
+ // Branch to a slow path if input < 0.0 due to complicated rounding rules.
+ // Note that Fcmp with NaN unsets the negative flag.
+ Fcmp(src64, 0.0);
+ B(&negative, Assembler::Condition::lo);
+
+ // Handle the simple case of a positive input, and also -0 and NaN.
+ // Rounding proceeds with consideration of the fractional part of the input:
+ // 1. If > 0.5, round to integer with higher absolute value (so, up).
+ // 2. If < 0.5, round to integer with lower absolute value (so, down).
+ // 3. If = 0.5, round to +Infinity (so, up).
+ {
+ // Convert to signed 64-bit integer, rounding halfway cases away from zero.
+ // In the case of overflow, the output is saturated.
+ // In the case of NaN and -0, the output is zero.
+ Fcvtas(dest64, src64);
+
+ // In the case of zero, the input may have been NaN or -0, which must bail.
+ Cbnz(dest64, &saturated);
+
+ // Combine test for -0 and NaN values using a single bitwise operation.
+ // See truncDoubleToInt32 for an explanation.
+ Fmov(dest64, src64);
+ Lsr(dest64, dest64, 62);
+ Cbnz(dest64, fail);
+
+ B(&done);
+ }
+
+ // Handle the complicated case of a negative input.
+ // Rounding proceeds with consideration of the fractional part of the input:
+ // 1. If > 0.5, round to integer with higher absolute value (so, down).
+ // 2. If < 0.5, round to integer with lower absolute value (so, up).
+ // 3. If = 0.5, round to +Infinity (so, up).
+ bind(&negative);
+ {
+ // Inputs in [-0.5, 0) are rounded to -0. Fail.
+ loadConstantDouble(-0.5, temp);
+ branchDouble(Assembler::DoubleGreaterThanOrEqual, src, temp, fail);
+
+ // Other negative inputs need the biggest double less than 0.5 added.
+ loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
+ addDouble(src, temp);
+
+ // Round all values toward -Infinity.
+ // In the case of overflow, the output is saturated.
+ // NaN and -0 are already handled by the "positive number" path above.
+ Fcvtms(dest64, temp);
+ }
+
+ bind(&saturated);
+
+ // Sign extend lower 32 bits to test if the result isn't an Int32.
+ Cmp(dest64, Operand(dest64, vixl::SXTW));
+ B(NotEqual, fail);
+
+ // Clear upper 32 bits.
+ Uxtw(dest64, dest64);
+
+ bind(&done);
+}
+
+void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
+ FloatRegister dest) {
+ switch (mode) {
+ case RoundingMode::Up:
+ frintp(ARMFPRegister(dest, 64), ARMFPRegister(src, 64));
+ return;
+ case RoundingMode::Down:
+ frintm(ARMFPRegister(dest, 64), ARMFPRegister(src, 64));
+ return;
+ case RoundingMode::NearestTiesToEven:
+ frintn(ARMFPRegister(dest, 64), ARMFPRegister(src, 64));
+ return;
+ case RoundingMode::TowardsZero:
+ frintz(ARMFPRegister(dest, 64), ARMFPRegister(src, 64));
+ return;
+ }
+ MOZ_CRASH("unexpected mode");
+}
+
+void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
+ FloatRegister dest) {
+ switch (mode) {
+ case RoundingMode::Up:
+ frintp(ARMFPRegister(dest, 32), ARMFPRegister(src, 32));
+ return;
+ case RoundingMode::Down:
+ frintm(ARMFPRegister(dest, 32), ARMFPRegister(src, 32));
+ return;
+ case RoundingMode::NearestTiesToEven:
+ frintn(ARMFPRegister(dest, 32), ARMFPRegister(src, 32));
+ return;
+ case RoundingMode::TowardsZero:
+ frintz(ARMFPRegister(dest, 32), ARMFPRegister(src, 32));
+ return;
+ }
+ MOZ_CRASH("unexpected mode");
+}
+
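+// BIT (bitwise insert if true) copies each bit of its second operand into
+// the destination where the corresponding bit of its third operand is set,
+// i.e. dst = (dst & ~mask) | (src & mask). With a mask that has only the
+// sign bit set, the copySign helpers keep the magnitude of |lhs| and take
+// the sign from |rhs|.
+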
+void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister output) {
+ ScratchDoubleScope scratch(*this);
+
+ // Double with only the sign bit set
+ loadConstantDouble(-0.0, scratch);
+
+ if (lhs != output) {
+ moveDouble(lhs, output);
+ }
+
+ bit(ARMFPRegister(output.encoding(), vixl::VectorFormat::kFormat8B),
+ ARMFPRegister(rhs.encoding(), vixl::VectorFormat::kFormat8B),
+ ARMFPRegister(scratch.encoding(), vixl::VectorFormat::kFormat8B));
+}
+
+void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister output) {
+ ScratchFloat32Scope scratch(*this);
+
+ // Float with only the sign bit set
+ loadConstantFloat32(-0.0f, scratch);
+
+ if (lhs != output) {
+ moveFloat32(lhs, output);
+ }
+
+ bit(ARMFPRegister(output.encoding(), vixl::VectorFormat::kFormat8B),
+ ARMFPRegister(rhs.encoding(), vixl::VectorFormat::kFormat8B),
+ ARMFPRegister(scratch.encoding(), vixl::VectorFormat::kFormat8B));
+}
+
+void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
+ Register pointer) {
+ Add(ARMRegister(pointer, 64), ARMRegister(pointer, 64),
+ Operand(ARMRegister(indexTemp32, 64), vixl::LSL, shift));
+}
+
+//}}} check_macroassembler_style
+
+} // namespace jit
+} // namespace js