/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef A64_ASSEMBLER_A64_H_
#define A64_ASSEMBLER_A64_H_

#include <iterator>

#include "jit/arm64/vixl/Assembler-vixl.h"

#include "jit/CompactBuffer.h"
#include "jit/shared/Disassembler-shared.h"
#include "wasm/WasmTypeDecls.h"

namespace js {
namespace jit {

// VIXL imports.
typedef vixl::Register ARMRegister;
typedef vixl::FPRegister ARMFPRegister;
using vixl::ARMBuffer;
using vixl::Instruction;

using LabelDoc = DisassemblerSpew::LabelDoc;
using LiteralDoc = DisassemblerSpew::LiteralDoc;

static const uint32_t AlignmentAtPrologue = 0;
static const uint32_t AlignmentMidPrologue = 8;
static const Scale ScalePointer = TimesEight;

// The MacroAssembler uses scratch registers extensively and unexpectedly.
// For safety, scratch registers should always be acquired using
// vixl::UseScratchRegisterScope.
static constexpr Register ScratchReg{Registers::ip0};
static constexpr ARMRegister ScratchReg64 = {ScratchReg, 64};

static constexpr Register ScratchReg2{Registers::ip1};
static constexpr ARMRegister ScratchReg2_64 = {ScratchReg2, 64};
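// A minimal sketch of the intended acquisition pattern (illustrative only; it
// assumes a vixl-compatible MacroAssembler named `masm` is in scope):
//
//   {
//     vixl::UseScratchRegisterScope temps(&masm);
//     const ARMRegister scratch64 = temps.AcquireX();
//     // ... use scratch64; it is released when `temps` goes out of scope ...
//   }
//
// Going through the scope documents exactly where ip0/ip1 are live and keeps
// the same register from being handed out twice.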

static constexpr FloatRegister ReturnDoubleReg = {FloatRegisters::d0,
                                                  FloatRegisters::Double};
static constexpr FloatRegister ScratchDoubleReg_ = {FloatRegisters::d31,
                                                    FloatRegisters::Double};
struct ScratchDoubleScope : public AutoFloatRegisterScope {
  explicit ScratchDoubleScope(MacroAssembler& masm)
      : AutoFloatRegisterScope(masm, ScratchDoubleReg_) {}
};

static constexpr FloatRegister ReturnFloat32Reg = {FloatRegisters::s0,
                                                   FloatRegisters::Single};
static constexpr FloatRegister ScratchFloat32Reg_ = {FloatRegisters::s31,
                                                     FloatRegisters::Single};
struct ScratchFloat32Scope : public AutoFloatRegisterScope {
  explicit ScratchFloat32Scope(MacroAssembler& masm)
      : AutoFloatRegisterScope(masm, ScratchFloat32Reg_) {}
};

#ifdef ENABLE_WASM_SIMD
static constexpr FloatRegister ReturnSimd128Reg = {FloatRegisters::v0,
                                                   FloatRegisters::Simd128};
static constexpr FloatRegister ScratchSimd128Reg = {FloatRegisters::v31,
                                                    FloatRegisters::Simd128};
struct ScratchSimd128Scope : public AutoFloatRegisterScope {
  explicit ScratchSimd128Scope(MacroAssembler& masm)
      : AutoFloatRegisterScope(masm, ScratchSimd128Reg) {}
};
#else
struct ScratchSimd128Scope : public AutoFloatRegisterScope {
  explicit ScratchSimd128Scope(MacroAssembler& masm)
      : AutoFloatRegisterScope(masm, ScratchDoubleReg_) {
    MOZ_CRASH("SIMD not enabled");
  }
};
#endif

static constexpr Register InvalidReg{Registers::Invalid};
static constexpr FloatRegister InvalidFloatReg = {};

static constexpr Register OsrFrameReg{Registers::x3};
static constexpr Register CallTempReg0{Registers::x9};
static constexpr Register CallTempReg1{Registers::x10};
static constexpr Register CallTempReg2{Registers::x11};
static constexpr Register CallTempReg3{Registers::x12};
static constexpr Register CallTempReg4{Registers::x13};
static constexpr Register CallTempReg5{Registers::x14};

static constexpr Register PreBarrierReg{Registers::x1};

static constexpr Register InterpreterPCReg{Registers::x9};

static constexpr Register ReturnReg{Registers::x0};
static constexpr Register64 ReturnReg64(ReturnReg);
static constexpr Register JSReturnReg{Registers::x2};
static constexpr Register FramePointer{Registers::fp};
static constexpr ARMRegister FramePointer64{FramePointer, 64};
static constexpr Register ZeroRegister{Registers::sp};
static constexpr ARMRegister ZeroRegister64{Registers::sp, 64};
static constexpr ARMRegister ZeroRegister32{Registers::sp, 32};

// [SMDOC] AArch64 Stack Pointer and Pseudo Stack Pointer conventions
//
//                               ================
//
// Stack pointer (SP), PseudoStackPointer (PSP), and RealStackPointer:
//
// The ARM64 real SP has a constraint: it must be 16-byte aligned whenever it
// is used as the base pointer for a memory access.  (SP+offset need not be
// 16-byte aligned, but the SP value itself must be.)  The SP register may
// take on unaligned values but may not be used for a memory access while it
// is unaligned.
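//
// For example, with SP == 0x7ff0 (16-aligned), "ldr x0, [sp, #8]" is legal
// even though the effective address 0x7ff8 is only 8-aligned; but if SP itself
// held 0x7ff8, any SP-based access would fault while alignment checking is
// enabled.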
//
// Stack-alignment checking can be enabled or disabled by a control register;
// however that register cannot be modified by user space.  We have to assume
// stack alignment checking is enabled, and that does usually appear to be the
// case.  See the ARM Architecture Reference Manual, "D1.8.2 SP alignment
// checking", for further details.
//
// A second constraint is forced upon us by the ARM64 ABI, which requires that
// all accesses to the stack be at or above SP.  Accesses below SP are
// strictly forbidden, presumably because the kernel might use that area of
// memory for its own purposes -- in particular, signal delivery -- and hence
// it may get trashed at any time.
//
// Note this doesn't mean that stack accesses must use register SP as the
// base; only that the effective addresses must be >= SP, regardless of how
// the address is formed.
//
// In order to allow word-wise pushes and pops, some of our ARM64 jits
// (JS-Baseline, JS-Ion, and Wasm-Ion, but not Wasm-Baseline) dedicate x28 to
// be used as a PseudoStackPointer (PSP).
//
// Initially the PSP will have the same value as the SP.  Code can, if it
// wants, push a single word by subtracting 8 from the PSP, doing SP := PSP,
// then storing the value at PSP+0.  Given other constraints on the alignment
// of the SP at function call boundaries, this works out OK, at the cost of
// the two extra instructions per push / pop.
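//
// As a sketch, the single-word push just described comes down to a sequence
// like this (the real code is emitted by the MacroAssembler):
//
//     sub x28, x28, #8    // PSP := PSP - 8
//     mov sp, x28         // SP := PSP, restoring the SP <= PSP invariant
//     str x0, [x28]       // store the pushed value at PSP+0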
//
// This is all a bit messy, and is probably not robustly adhered to.  However,
// the following appear to be the intended, and mostly implemented, current
// invariants:
//
// (1) PSP is "primary", SP is "secondary".  Most stack refs are
//     PSP-relative. SP-relative is rare and (obviously) only done when we
//     know that SP is aligned.
//
// (2) At all times, the relationship SP <= PSP is maintained.  The fact that
//     SP may validly be less than PSP means that pushes on the stack force
//     the two values to become equal, by copying PSP into SP.  However, pops
//     behave differently: PSP moves back up and SP stays the same, since that
//     doesn't break the SP <= PSP invariant.
//
// (3) However, immediately before a call instruction, SP and PSP must be the
//     same.  To enforce this, PSP is copied into SP by the arm64-specific
//     MacroAssembler::call routines.
//
// (4) Also, after a function has returned, it is expected that SP holds the
//     "primary" value.  How exactly this is implemented remains not entirely
//     clear and merits further investigation.  The following points are
//     believed to be relevant:
//
//     - For calls to functions observing the system AArch64 ABI, PSP (x28) is
//       callee-saved.  That, combined with (3) above, implies SP == PSP
//       immediately after the call returns.
//
//     - JIT-generated routines return using MacroAssemblerCompat::retn, and
//       that copies PSP into SP (bizarrely; this would make more sense if it
//       copied SP into PSP); but in any case, the point is that they are the
//       same at the point that the return instruction executes.
//
//     - MacroAssembler::callWithABIPost copies PSP into SP after the return
//       of a call requiring dynamic alignment.
//
//     Given the above, it is unclear exactly where in the return sequence it
//     is expected that SP == PSP, and also whether it is the callee or caller
//     that is expected to enforce it.
//
// In general it would be nice to be able to move (at some time in the future,
// not now) to a world where *every* assignment to PSP or SP is followed
// immediately by a copy into the other register.  That would make all
// required correctness proofs trivial in the sense that it would require only
// local inspection of code immediately following (dominated by) any such
// assignment.  For the moment, however, this is a guideline, not a hard
// requirement.
//
//                               ================
//
// Mechanics of keeping the stack pointers in sync:
//
// The following two methods require that the masm's SP has been set to the PSP
// with MacroAssembler::SetStackPointer64(PseudoStackPointer64), or they will be
// no-ops.  The setup is performed manually by the jits after creating the masm.
//
// * MacroAssembler::syncStackPtr() performs SP := PSP, presumably after PSP has
//   been updated, so SP needs to move too.  This is used pretty liberally
//   throughout the code base.
//
// * MacroAssembler::initPseudoStackPtr() performs PSP := SP.  This can be used
//   after calls to non-ABI compliant code; it's not used much.
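//
// Schematically, the expected setup and use look like this (a sketch; the
// real code lives in the individual jits):
//
//   masm.SetStackPointer64(PseudoStackPointer64);  // once, after creating masm
//   ...
//   masm.subFromStackPtr(Imm32(16));               // PSP := PSP - 16
//   masm.syncStackPtr();                           // SP := PSP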
//
// In the ARM64 assembler there is a function Instruction::IsStackPtrSync() that
// recognizes the instruction emitted by syncStackPtr(), and this is used to
// skip that instruction a few places, should it be present, in the JS JIT where
// code is generated to deal with toggled calls.
//
// In various places there are calls to MacroAssembler::syncStackPtr() which
// appear to be redundant.  Investigation shows that they often are redundant,
// but not always.  Finding and removing such redundancies would be quite some
// work, so we live for now with the occasional redundant update.  Perusal of
// the Cortex-A55 and -A72 optimization guides shows no evidence that such
// assignments are any more expensive than assignments between vanilla integer
// registers, so the costs of such redundant updates are assumed to be small.
//
// Invariants on the PSP at function call boundaries:
//
// It *appears* that the following invariants exist:
//
// * On entry to JIT code, PSP == SP, ie the stack pointer is transmitted via
//   both registers.
//
// * On entry to C++ code, PSP == SP.  Certainly it appears that all calls
//   created by the MacroAssembler::call(..) routines perform 'syncStackPtr'
//   immediately before the call, and all ABI calls are routed through the
//   MacroAssembler::call layer.
//
// * The stubs generated by WasmStubs.cpp assume that, on entry, SP is the
//   active stack pointer and that PSP is dead.
//
// * The PSP is non-volatile (callee-saved).  Along a normal return path from
//   JIT code, simply having PSP == SP on exit is correct, since the exit SP is
//   the same as the entry SP by the JIT ABI.
//
// * Call-outs to non-JIT C++ code do not need to set up the PSP (it won't be
//   used), and will not need to restore the PSP on return because x28 is
//   non-volatile in the ARM64 ABI.
//
//                               ================
//
// Future cleanups to the SP-vs-PSP machinery:
//
// Currently we have somewhat unclear invariants, which are not obviously
// always enforced, and which may require complex non-local reasoning.
// Auditing the code to ensure that the invariants always hold, whilst not
// generating duplicate syncs, is close to impossible.  A future rework to
// tidy this might be as follows.  (This suggestion pertains to the entire
// JIT complex: all of the JS compilers, wasm compilers, stub generators,
// regexp compilers, etc.)
//
// Currently we have that, in JIT-generated code, PSP is "primary" and SP is
// "secondary", meaning that PSP has the "real" stack pointer value and SP is
// updated whenever PSP acquires a lower value, so as to ensure that SP <= PSP.
// An exception to this scheme is the stubs code generated by WasmStubs.cpp,
// which assumes that SP is "primary" and PSP is dead.
//
// It might give us an easier incremental path to eventually removing PSP
// entirely if we switched to having SP always be the primary.  That is:
//
// (1) SP is primary, PSP is secondary
// (2) After any assignment to SP, it is copied into PSP
// (3) All (non-frame-pointer-based) stack accesses are PSP-relative
//     (as at present)
//
// This would have the effect that:
//
// * It would reinstate the invariant that on all targets, the "real" SP value
//   is in the ABI-and-or-hardware-mandated stack pointer register.
//
// * It would give us a simple story about calls and returns:
//   - for calls to non-JIT generated code (viz, C++ etc), we need no extra
//     copies, because PSP (x28) is callee-saved
//   - for calls to JIT-generated code, we need no extra copies, because of (2)
//     above
//
// * We could incrementally migrate those parts of the code generator where we
//   know that SP is 16-aligned, to use SP- rather than PSP-relative accesses
//
// * The consistent use of (2) would remove the requirement to have to perform
//   path-dependent reasoning (for paths in the generated code, not in the
//   compiler) when reading/understanding the code.
//
// * x28 would become free for use by stubs and the baseline compiler without
//   having to worry about interoperating with code that expects x28 to hold a
//   valid PSP.
//
// One might ask what mechanical checks we can add to ensure correctness, rather
// than having to verify these invariants by hand indefinitely.  Maybe some
// combination of:
//
// * In debug builds, compiling-in assert(SP == PSP) at critical places.  This
//   can be done using the existing `assertStackPtrsSynced` function.
//
// * In debug builds, scanning sections of generated code to ensure no
//   SP-relative stack accesses have been created -- for some sections, at
//   least every assignment to SP is immediately followed by a copy to x28.
//   This would also facilitate detection of duplicate syncs.
//
//                               ================
//
// Other investigative notes, for the code base at present:
//
// * Some disassembly dumps suggest that we sync the stack pointer too often.
//   This could be the result of various pieces of code working at cross
//   purposes when syncing the stack pointer, or of not paying attention to the
//   precise invariants.
//
// * As documented in RegExpNativeMacroAssembler.cpp, function
//   SMRegExpMacroAssembler::createStackFrame:
//
//   // ARM64 communicates stack address via SP, but uses a pseudo-sp (PSP) for
//   // addressing.  The register we use for PSP may however also be used by
//   // calling code, and it is nonvolatile, so save it.  Do this as a special
//   // case first because the generic save/restore code needs the PSP to be
//   // initialized already.
//
//   and also in function SMRegExpMacroAssembler::exitHandler:
//
//   // Restore the saved value of the PSP register, this value is whatever the
//   // caller had saved in it, not any actual SP value, and it must not be
//   // overwritten subsequently.
//
//   The original source for these comments was a patch for bug 1445907.
//
// * MacroAssembler-arm64.h has an interesting comment in the retn()
//   function:
//
//   syncStackPtr();  // SP is always used to transmit the stack between calls.
//
//   Same comment at abiret() in that file, and in MacroAssembler-arm64.cpp,
//   at callWithABIPre and callWithABIPost.
//
// * In Trampoline-arm64.cpp function JitRuntime::generateVMWrapper we find
//
//   // SP is used to transfer stack across call boundaries.
//   masm.initPseudoStackPtr();
//
//   after the return point of a callWithVMWrapper.  The only reasonable
//   conclusion from all those (assuming they are right) is that SP == PSP.
//
// * Wasm-Baseline does not use the PSP, but as Wasm-Ion code requires SP==PSP
//   and tiered code can have Baseline->Ion calls, Baseline will set PSP=SP
//   before a call to wasm code.
//
//                               ================

// StackPointer is intentionally undefined on ARM64 to prevent misuse: using
// sp as a base register is only valid if sp % 16 == 0.
static constexpr Register RealStackPointer{Registers::sp};

static constexpr Register PseudoStackPointer{Registers::x28};
static constexpr ARMRegister PseudoStackPointer64 = {Registers::x28, 64};
static constexpr ARMRegister PseudoStackPointer32 = {Registers::x28, 32};

static constexpr Register IntArgReg0{Registers::x0};
static constexpr Register IntArgReg1{Registers::x1};
static constexpr Register IntArgReg2{Registers::x2};
static constexpr Register IntArgReg3{Registers::x3};
static constexpr Register IntArgReg4{Registers::x4};
static constexpr Register IntArgReg5{Registers::x5};
static constexpr Register IntArgReg6{Registers::x6};
static constexpr Register IntArgReg7{Registers::x7};
static constexpr Register HeapReg{Registers::x21};

// Define unsized Registers.
#define DEFINE_UNSIZED_REGISTERS(N) \
  static constexpr Register r##N{Registers::x##N};
REGISTER_CODE_LIST(DEFINE_UNSIZED_REGISTERS)
#undef DEFINE_UNSIZED_REGISTERS
static constexpr Register ip0{Registers::x16};
static constexpr Register ip1{Registers::x17};
static constexpr Register fp{Registers::x29};
static constexpr Register lr{Registers::x30};
static constexpr Register rzr{Registers::xzr};

// Import VIXL registers into the js::jit namespace.
#define IMPORT_VIXL_REGISTERS(N)                  \
  static constexpr ARMRegister w##N = vixl::w##N; \
  static constexpr ARMRegister x##N = vixl::x##N;
REGISTER_CODE_LIST(IMPORT_VIXL_REGISTERS)
#undef IMPORT_VIXL_REGISTERS
static constexpr ARMRegister wzr = vixl::wzr;
static constexpr ARMRegister xzr = vixl::xzr;
static constexpr ARMRegister wsp = vixl::wsp;
static constexpr ARMRegister sp = vixl::sp;

// Import VIXL VRegisters into the js::jit namespace.
#define IMPORT_VIXL_VREGISTERS(N)                   \
  static constexpr ARMFPRegister s##N = vixl::s##N; \
  static constexpr ARMFPRegister d##N = vixl::d##N;
REGISTER_CODE_LIST(IMPORT_VIXL_VREGISTERS)
#undef IMPORT_VIXL_VREGISTERS

static constexpr ValueOperand JSReturnOperand = ValueOperand(JSReturnReg);

// Registers used by RegExpMatcher and RegExpExecMatch stubs (do not use
// JSReturnOperand).
static constexpr Register RegExpMatcherRegExpReg = CallTempReg0;
static constexpr Register RegExpMatcherStringReg = CallTempReg1;
static constexpr Register RegExpMatcherLastIndexReg = CallTempReg2;

// Registers used by RegExpExecTest stub (do not use ReturnReg).
static constexpr Register RegExpExecTestRegExpReg = CallTempReg0;
static constexpr Register RegExpExecTestStringReg = CallTempReg1;

// Registers used by RegExpSearcher stub (do not use ReturnReg).
static constexpr Register RegExpSearcherRegExpReg = CallTempReg0;
static constexpr Register RegExpSearcherStringReg = CallTempReg1;
static constexpr Register RegExpSearcherLastIndexReg = CallTempReg2;

static constexpr Register JSReturnReg_Type = r3;
static constexpr Register JSReturnReg_Data = r2;

static constexpr FloatRegister NANReg = {FloatRegisters::d14,
                                         FloatRegisters::Single};
// N.B. r8 isn't listed as an AAPCS64 temp register (it is reserved as the
// indirect result location register), but we can use it as such because we
// never return structs indirectly.
static constexpr Register CallTempNonArgRegs[] = {r8,  r9,  r10, r11,
                                                  r12, r13, r14, r15};
static const uint32_t NumCallTempNonArgRegs = std::size(CallTempNonArgRegs);

static constexpr uint32_t JitStackAlignment = 16;

static constexpr uint32_t JitStackValueAlignment =
    JitStackAlignment / sizeof(Value);
static_assert(JitStackAlignment % sizeof(Value) == 0 &&
                  JitStackValueAlignment >= 1,
              "Stack alignment should be a non-zero multiple of sizeof(Value)");

static constexpr uint32_t SimdMemoryAlignment = 16;

static_assert(CodeAlignment % SimdMemoryAlignment == 0,
              "Code alignment should be larger than any of the alignments "
              "which are used for "
              "the constant sections of the code buffer.  Thus it should be "
              "larger than the "
              "alignment for SIMD constants.");

static const uint32_t WasmStackAlignment = SimdMemoryAlignment;
static const uint32_t WasmTrapInstructionLength = 4;

// See comments in wasm::GenerateFunctionPrologue.  The difference between these
// is the size of the largest callable prologue on the platform.
static constexpr uint32_t WasmCheckedCallEntryOffset = 0u;

class Assembler : public vixl::Assembler {
 public:
  Assembler() : vixl::Assembler() {}

  typedef vixl::Condition Condition;

  void finish();
  bool appendRawCode(const uint8_t* code, size_t numBytes);
  bool reserve(size_t size);
  bool swapBuffer(wasm::Bytes& bytes);

  // Emit the jump table, returning the BufferOffset to the first entry in the
  // table.
  BufferOffset emitExtendedJumpTable();
  BufferOffset ExtendedJumpTable_;
  void executableCopy(uint8_t* buffer);

  BufferOffset immPool(ARMRegister dest, uint8_t* value, vixl::LoadLiteralOp op,
                       const LiteralDoc& doc,
                       ARMBuffer::PoolEntry* pe = nullptr);
  BufferOffset immPool64(ARMRegister dest, uint64_t value,
                         ARMBuffer::PoolEntry* pe = nullptr);
  BufferOffset fImmPool(ARMFPRegister dest, uint8_t* value,
                        vixl::LoadLiteralOp op, const LiteralDoc& doc);
  BufferOffset fImmPool64(ARMFPRegister dest, double value);
  BufferOffset fImmPool32(ARMFPRegister dest, float value);

  uint32_t currentOffset() const { return nextOffset().getOffset(); }

  void bind(Label* label) { bind(label, nextOffset()); }
  void bind(Label* label, BufferOffset boff);
  void bind(CodeLabel* label) { label->target()->bind(currentOffset()); }

  void setUnlimitedBuffer() { armbuffer_.setUnlimited(); }
  bool oom() const {
    return AssemblerShared::oom() || armbuffer_.oom() ||
           jumpRelocations_.oom() || dataRelocations_.oom();
  }

  void copyJumpRelocationTable(uint8_t* dest) const {
    if (jumpRelocations_.length()) {
      memcpy(dest, jumpRelocations_.buffer(), jumpRelocations_.length());
    }
  }
  void copyDataRelocationTable(uint8_t* dest) const {
    if (dataRelocations_.length()) {
      memcpy(dest, dataRelocations_.buffer(), dataRelocations_.length());
    }
  }

  size_t jumpRelocationTableBytes() const { return jumpRelocations_.length(); }
  size_t dataRelocationTableBytes() const { return dataRelocations_.length(); }
  size_t bytesNeeded() const {
    return SizeOfCodeGenerated() + jumpRelocationTableBytes() +
           dataRelocationTableBytes();
  }

  void processCodeLabels(uint8_t* rawCode) {
    for (const CodeLabel& label : codeLabels_) {
      Bind(rawCode, label);
    }
  }

  static void UpdateLoad64Value(Instruction* inst0, uint64_t value);

  static void Bind(uint8_t* rawCode, const CodeLabel& label) {
    auto mode = label.linkMode();
    size_t patchAtOffset = label.patchAt().offset();
    size_t targetOffset = label.target().offset();

    if (mode == CodeLabel::MoveImmediate) {
      Instruction* inst = (Instruction*)(rawCode + patchAtOffset);
      Assembler::UpdateLoad64Value(inst, (uint64_t)(rawCode + targetOffset));
    } else {
      *reinterpret_cast<const void**>(rawCode + patchAtOffset) =
          rawCode + targetOffset;
    }
  }

  void retarget(Label* cur, Label* next);

  // The buffer is about to be linked. Ensure any constant pools or
  // excess bookkeeping has been flushed to the instruction stream.
  void flush() { armbuffer_.flushPool(); }

  void comment(const char* msg) {
#ifdef JS_DISASM_ARM64
    spew_.spew("; %s", msg);
#endif
  }

  void setPrinter(Sprinter* sp) {
#ifdef JS_DISASM_ARM64
    spew_.setPrinter(sp);
#endif
  }

  static bool SupportsFloatingPoint() { return true; }
  static bool SupportsUnalignedAccesses() { return true; }
  static bool SupportsFastUnalignedFPAccesses() { return true; }
  static bool SupportsWasmSimd() { return true; }

  static bool HasRoundInstruction(RoundingMode mode) {
    switch (mode) {
      case RoundingMode::Up:
      case RoundingMode::Down:
      case RoundingMode::NearestTiesToEven:
      case RoundingMode::TowardsZero:
        return true;
    }
    MOZ_CRASH("unexpected mode");
  }

 protected:
  // Add a jump whose target is unknown until finalization.
  // The jump may not be patched at runtime.
  void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind);

 public:
  static uint32_t PatchWrite_NearCallSize() { return 4; }

  static uint32_t NopSize() { return 4; }

  static void PatchWrite_NearCall(CodeLocationLabel start,
                                  CodeLocationLabel toCall);
  static void PatchDataWithValueCheck(CodeLocationLabel label,
                                      PatchedImmPtr newValue,
                                      PatchedImmPtr expected);

  static void PatchDataWithValueCheck(CodeLocationLabel label, ImmPtr newValue,
                                      ImmPtr expected);

  static void PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm) {
    // Raw is going to be the return address.
    uint32_t* raw = (uint32_t*)label.raw();
    // Overwrite the 4 bytes before the return address, which will end up being
    // the call instruction.
    *(raw - 1) = imm.value;
  }
  static uint32_t AlignDoubleArg(uint32_t offset) {
    MOZ_CRASH("AlignDoubleArg()");
  }
  static uintptr_t GetPointer(uint8_t* ptr) {
    Instruction* i = reinterpret_cast<Instruction*>(ptr);
    uint64_t ret = i->Literal64();
    return ret;
  }

  // Toggle a jmp or cmp emitted by toggledJump().
  static void ToggleToJmp(CodeLocationLabel inst_);
  static void ToggleToCmp(CodeLocationLabel inst_);
  static void ToggleCall(CodeLocationLabel inst_, bool enabled);

  static void TraceJumpRelocations(JSTracer* trc, JitCode* code,
                                   CompactBufferReader& reader);
  static void TraceDataRelocations(JSTracer* trc, JitCode* code,
                                   CompactBufferReader& reader);

  void assertNoGCThings() const {
#ifdef DEBUG
    MOZ_ASSERT(dataRelocations_.length() == 0);
    for (auto& j : pendingJumps_) {
      MOZ_ASSERT(j.kind == RelocationKind::HARDCODED);
    }
#endif
  }

 public:
  // A jump table entry is 2 instructions (8 bytes) followed by 8 bytes of raw
  // data: 16 bytes in total.
  static const size_t SizeOfJumpTableEntry = 16;

  struct JumpTableEntry {
    uint32_t ldr;
    uint32_t br;
    void* data;

    Instruction* getLdr() { return reinterpret_cast<Instruction*>(&ldr); }
  };

  // Offset of the patchable target for the given entry.
  static const size_t OffsetOfJumpTableEntryPointer = 8;
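  // (The ldr and br instruction words occupy bytes 0..7 of a JumpTableEntry,
  // so the patchable data pointer lives at byte offset 8 of the 16-byte
  // entry.)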

 public:
  void writeCodePointer(CodeLabel* label) {
    armbuffer_.assertNoPoolAndNoNops();
    uintptr_t x = uintptr_t(-1);
    BufferOffset off = EmitData(&x, sizeof(uintptr_t));
    label->patchAt()->bind(off.getOffset());
  }

  void verifyHeapAccessDisassembly(uint32_t begin, uint32_t end,
                                   const Disassembler::HeapAccess& heapAccess) {
    MOZ_CRASH("verifyHeapAccessDisassembly");
  }

 protected:
  // Structure for fixing up pc-relative loads/jumps when the machine
  // code gets moved (executable copy, gc, etc.).
  struct RelativePatch {
    BufferOffset offset;
    void* target;
    RelocationKind kind;

    RelativePatch(BufferOffset offset, void* target, RelocationKind kind)
        : offset(offset), target(target), kind(kind) {}
  };

  // List of jumps for which the target is either unknown until finalization,
  // or cannot be known due to GC. Each entry here requires a unique entry
  // in the extended jump table, and is patched at finalization.
  js::Vector<RelativePatch, 8, SystemAllocPolicy> pendingJumps_;

  // Final output formatters.
  CompactBufferWriter jumpRelocations_;
  CompactBufferWriter dataRelocations_;
};

static const uint32_t NumIntArgRegs = 8;
static const uint32_t NumFloatArgRegs = 8;

class ABIArgGenerator {
 public:
  ABIArgGenerator()
      : intRegIndex_(0), floatRegIndex_(0), stackOffset_(0), current_() {}

  ABIArg next(MIRType argType);
  ABIArg& current() { return current_; }
  uint32_t stackBytesConsumedSoFar() const { return stackOffset_; }
  void increaseStackOffset(uint32_t bytes) { stackOffset_ += bytes; }

 protected:
  unsigned intRegIndex_;
  unsigned floatRegIndex_;
  uint32_t stackOffset_;
  ABIArg current_;
};

// These registers may be volatile or nonvolatile.
static constexpr Register ABINonArgReg0 = r8;
static constexpr Register ABINonArgReg1 = r9;
static constexpr Register ABINonArgReg2 = r10;
static constexpr Register ABINonArgReg3 = r11;

// This register may be volatile or nonvolatile. Avoid d31, which is the
// ScratchDoubleReg_.
static constexpr FloatRegister ABINonArgDoubleReg = {FloatRegisters::s16,
                                                     FloatRegisters::Single};

// These registers may be volatile or nonvolatile.
// Note: these three registers are all guaranteed to be different.
static constexpr Register ABINonArgReturnReg0 = r8;
static constexpr Register ABINonArgReturnReg1 = r9;
static constexpr Register ABINonVolatileReg{Registers::x19};

// This register is guaranteed to be clobberable during the prologue and
// epilogue of an ABI call which must preserve the ABI argument, return,
// and non-volatile registers.
static constexpr Register ABINonArgReturnVolatileReg = lr;

// Instance pointer argument register for WebAssembly functions. This must not
// alias any other register used for passing function arguments or return
// values. Preserved by WebAssembly functions.  Must be nonvolatile.
static constexpr Register InstanceReg{Registers::x23};

// Registers used for wasm table calls. These registers must be disjoint
// from the ABI argument registers, InstanceReg and each other.
static constexpr Register WasmTableCallScratchReg0 = ABINonArgReg0;
static constexpr Register WasmTableCallScratchReg1 = ABINonArgReg1;
static constexpr Register WasmTableCallSigReg = ABINonArgReg2;
static constexpr Register WasmTableCallIndexReg = ABINonArgReg3;

// Registers used for ref calls.
static constexpr Register WasmCallRefCallScratchReg0 = ABINonArgReg0;
static constexpr Register WasmCallRefCallScratchReg1 = ABINonArgReg1;
static constexpr Register WasmCallRefReg = ABINonArgReg3;

// Register used as a scratch along the return path in the fast js -> wasm stub
// code.  This must not overlap ReturnReg, JSReturnOperand, or InstanceReg.
// It must be a volatile register.
static constexpr Register WasmJitEntryReturnScratch = r9;

static inline bool GetIntArgReg(uint32_t usedIntArgs, uint32_t usedFloatArgs,
                                Register* out) {
  if (usedIntArgs >= NumIntArgRegs) {
    return false;
  }
  *out = Register::FromCode(usedIntArgs);
  return true;
}

static inline bool GetFloatArgReg(uint32_t usedIntArgs, uint32_t usedFloatArgs,
                                  FloatRegister* out) {
  if (usedFloatArgs >= NumFloatArgRegs) {
    return false;
  }
  *out = FloatRegister::FromCode(usedFloatArgs);
  return true;
}

// Get a register in which we plan to put a quantity that will be used as an
// integer argument.  This differs from GetIntArgReg in that, once the actual
// argument registers are exhausted, it falls back on whichever CallTempReg*
// registers don't overlap the argument registers, and only fails once those
// run out too.
static inline bool GetTempRegForIntArg(uint32_t usedIntArgs,
                                       uint32_t usedFloatArgs, Register* out) {
  if (GetIntArgReg(usedIntArgs, usedFloatArgs, out)) {
    return true;
  }
  // Unfortunately, we have to assume things about the point at which
  // GetIntArgReg returns false, because we need to know how many registers it
  // can allocate.
  usedIntArgs -= NumIntArgRegs;
  if (usedIntArgs >= NumCallTempNonArgRegs) {
    return false;
  }
  *out = CallTempNonArgRegs[usedIntArgs];
  return true;
}
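// For example, once all 8 integer argument registers are in use,
// GetTempRegForIntArg(8, 0, &out) yields out == r8, the first entry of
// CallTempNonArgRegs.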

inline Imm32 Imm64::firstHalf() const { return low(); }

inline Imm32 Imm64::secondHalf() const { return hi(); }

// Forbids nop filling for testing purposes. Not nestable.
class AutoForbidNops {
 protected:
  Assembler* asm_;

 public:
  explicit AutoForbidNops(Assembler* asm_) : asm_(asm_) { asm_->enterNoNops(); }
  ~AutoForbidNops() { asm_->leaveNoNops(); }
};

// Forbids pool generation during a specified interval. Not nestable.
class AutoForbidPoolsAndNops : public AutoForbidNops {
 public:
  AutoForbidPoolsAndNops(Assembler* asm_, size_t maxInst)
      : AutoForbidNops(asm_) {
    asm_->enterNoPool(maxInst);
  }
  ~AutoForbidPoolsAndNops() { asm_->leaveNoPool(); }
};

}  // namespace jit
}  // namespace js

#endif  // A64_ASSEMBLER_A64_H_