Diffstat (limited to 'third_party/rust/cranelift-codegen/src/machinst')
11 files changed, 6727 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/machinst/abi.rs b/third_party/rust/cranelift-codegen/src/machinst/abi.rs new file mode 100644 index 0000000000..c72a81dcc2 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/abi.rs @@ -0,0 +1,233 @@ +//! ABI definitions. + +use crate::binemit::StackMap; +use crate::ir::StackSlot; +use crate::isa::CallConv; +use crate::machinst::*; +use crate::settings; + +use regalloc::{Reg, Set, SpillSlot, Writable}; + +/// Trait implemented by an object that tracks ABI-related state (e.g., stack +/// layout) and can generate code while emitting the *body* of a function. +pub trait ABICallee { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Does the ABI-body code need a temp reg? One will be provided to `init()` + /// as the `maybe_tmp` arg if so. + fn temp_needed(&self) -> bool; + + /// Initialize. This is called after the ABICallee is constructed because it + /// may be provided with a temp vreg, which can only be allocated once the + /// lowering context exists. + fn init(&mut self, maybe_tmp: Option<Writable<Reg>>); + + /// Accumulate outgoing arguments. This ensures that at least SIZE bytes + /// are allocated in the prologue to be available for use in function calls + /// to hold arguments and/or return values. If this function is called + /// multiple times, the maximum of all SIZE values will be available. + fn accumulate_outgoing_args_size(&mut self, size: u32); + + /// Get the settings controlling this function's compilation. + fn flags(&self) -> &settings::Flags; + + /// Get the calling convention implemented by this ABI object. + fn call_conv(&self) -> CallConv; + + /// Get the liveins of the function. + fn liveins(&self) -> Set<RealReg>; + + /// Get the liveouts of the function. + fn liveouts(&self) -> Set<RealReg>; + + /// Number of arguments. + fn num_args(&self) -> usize; + + /// Number of return values. + fn num_retvals(&self) -> usize; + + /// Number of stack slots (not spill slots). + fn num_stackslots(&self) -> usize; + + /// Generate an instruction which copies an argument to a destination + /// register. + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I; + + /// Is the given argument needed in the body (as opposed to, e.g., serving + /// only as a special ABI-specific placeholder)? This controls whether + /// lowering will copy it to a virtual reg for use by CLIF instructions. + fn arg_is_needed_in_body(&self, idx: usize) -> bool; + + /// Generate any setup instruction needed to save values to the + /// return-value area. This is usually used when there are multiple return + /// values or an otherwise large return value that must be passed on the + /// stack; typically the ABI specifies an extra hidden argument that is a + /// pointer to that memory. + fn gen_retval_area_setup(&self) -> Option<Self::I>; + + /// Generate an instruction which copies a source register to a return value slot. + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable<Reg>) -> Vec<Self::I>; + + /// Generate a return instruction. + fn gen_ret(&self) -> Self::I; + + /// Generate an epilogue placeholder. The returned instruction should return `true` from + /// `is_epilogue_placeholder()`; this is used to indicate to the lowering driver when + /// the epilogue should be inserted.
+ fn gen_epilogue_placeholder(&self) -> Self::I; + + // ----------------------------------------------------------------- + // Every function above this line may only be called pre-regalloc. + // Every function below this line may only be called post-regalloc. + // `spillslots()` must be called before any other post-regalloc + // function. + // ---------------------------------------------------------------- + + /// Update with the number of spillslots, post-regalloc. + fn set_num_spillslots(&mut self, slots: usize); + + /// Update with the clobbered registers, post-regalloc. + fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>); + + /// Get the address of a stackslot. + fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I; + + /// Load from a stackslot. + fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable<Reg>, + ) -> Self::I; + + /// Store to a stackslot. + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I; + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Self::I; + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I; + + /// Generate a stack map, given a list of spillslots and the emission state + /// at a given program point (prior to emission fo the safepointing + /// instruction). + fn spillslots_to_stack_map( + &self, + slots: &[SpillSlot], + state: &<Self::I as MachInstEmit>::State, + ) -> StackMap; + + /// Generate a prologue, post-regalloc. This should include any stack + /// frame or other setup necessary to use the other methods (`load_arg`, + /// `store_retval`, and spillslot accesses.) `self` is mutable so that we + /// can store information in it which will be useful when creating the + /// epilogue. + fn gen_prologue(&mut self) -> Vec<Self::I>; + + /// Generate an epilogue, post-regalloc. Note that this must generate the + /// actual return instruction (rather than emitting this in the lowering + /// logic), because the epilogue code comes before the return and the two are + /// likely closely related. + fn gen_epilogue(&self) -> Vec<Self::I>; + + /// Returns the full frame size for the given function, after prologue + /// emission has run. This comprises the spill slots and stack-storage slots + /// (but not storage for clobbered callee-save registers, arguments pushed + /// at callsites within this function, or other ephemeral pushes). This is + /// used for ABI variants where the client generates prologue/epilogue code, + /// as in Baldrdash (SpiderMonkey integration). + fn frame_size(&self) -> u32; + + /// Returns the size of arguments expected on the stack. + fn stack_args_size(&self) -> u32; + + /// Get the spill-slot size. + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32; + + /// Generate a spill. The type, if known, is given; this can be used to + /// generate a store instruction optimized for the particular type rather + /// than the RegClass (e.g., only F64 that resides in a V128 register). If + /// no type is given, the implementation should spill the whole register. + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Option<Type>) -> Self::I; + + /// Generate a reload (fill). As for spills, the type may be given to allow + /// a more optimized load instruction to be generated. 
+ fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + ty: Option<Type>, + ) -> Self::I; + + /// Desired unwind info type. + fn unwind_info_kind(&self) -> UnwindInfoKind; +} + +/// Trait implemented by an object that tracks ABI-related state and can +/// generate code while emitting a *call* to a function. +/// +/// An instance of this trait returns information for a *particular* +/// callsite. It will usually be computed from the called function's +/// signature. +/// +/// Unlike `ABICallee` above, methods on this trait are not invoked directly +/// by the machine-independent code. Rather, the machine-specific lowering +/// code will typically create an `ABICaller` when creating machine instructions +/// for an IR call instruction inside `lower()`, directly emit the arg and +/// retval copies, and attach the register use/def info to the call. +/// +/// This trait is thus provided for convenience to the backends. +pub trait ABICaller { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Get the number of arguments expected. + fn num_args(&self) -> usize; + + /// Emit a copy of an argument value from a source register, prior to the call. + fn emit_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + from_reg: Reg, + ); + + /// Emit a copy of a return value into a destination register, after the call returns. + fn emit_copy_retval_to_reg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + into_reg: Writable<Reg>, + ); + + /// Emit code to pre-adjust the stack, prior to argument copies and call. + fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Emit code to post-adjust the stack, after call return and return-value copies. + fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Accumulate outgoing arguments. This ensures that the caller (as + /// identified via the CTX argument) allocates enough space in the + /// prologue to hold all arguments and return values for this call. + /// There is no code emitted at the call site, everything is done + /// in the caller's function prologue. + fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Emit the call itself. + /// + /// The returned instruction should have proper use- and def-sets according + /// to the argument registers, return-value registers, and clobbered + /// registers for this function signature in this ABI. + /// + /// (Arg registers are uses, and retval registers are defs. Clobbered + /// registers are also logically defs, but should never be read; their + /// values are "defined" (to the regalloc) but "undefined" in every other + /// sense.) + /// + /// This function should only be called once, as it is allowed to re-use + /// parts of the ABICaller object in emitting instructions. + fn emit_call<C: LowerCtx<I = Self::I>>(&mut self, ctx: &mut C); +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs b/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs new file mode 100644 index 0000000000..e47379da37 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs @@ -0,0 +1,1320 @@ +//! Implementation of a vanilla ABI, shared between several machines. The +//! implementation here assumes that arguments will be passed in registers +//! first, then additional args on the stack; that the stack grows downward, +//!
contains a standard frame (return address and frame pointer), and the +//! compiler is otherwise free to allocate space below that with its choice of +//! layout; and that the machine has some notion of caller- and callee-save +//! registers. Most modern machines, e.g. x86-64 and AArch64, should fit this +//! mold and thus both of these backends use this shared implementation. +//! +//! See the documentation in specific machine backends for the "instantiation" +//! of this generic ABI, i.e., which registers are caller/callee-save, arguments +//! and return values, and any other special requirements. +//! +//! For now the implementation here assumes a 64-bit machine, but we intend to +//! make this 32/64-bit-generic shortly. +//! +//! # Vanilla ABI +//! +//! First, arguments and return values are passed in registers up to a certain +//! fixed count, after which they overflow onto the stack. Multiple return +//! values either fit in registers, or are returned in a separate return-value +//! area on the stack, given by a hidden extra parameter. +//! +//! Note that the exact stack layout is up to us. We settled on the +//! below design based on several requirements. In particular, we need to be +//! able to generate instructions (or instruction sequences) to access +//! arguments, stack slots, and spill slots before we know how many spill slots +//! or clobber-saves there will be, because of our pass structure. We also +//! prefer positive offsets to negative offsets because of an asymmetry in +//! some machines' addressing modes (e.g., on AArch64, positive offsets have a +//! larger possible range without a long-form sequence to synthesize an +//! arbitrary offset). Finally, it is not allowed to access memory below the +//! current SP value. +//! +//! We assume that a prologue first pushes the frame pointer (and return address +//! above that, if the machine does not do that in hardware). We set FP to point +//! to this two-word frame record. We store all other frame slots below this +//! two-word frame record, with the stack pointer remaining at or below this +//! fixed frame storage for the rest of the function. We can then access frame +//! storage slots using positive offsets from SP. In order to allow codegen for +//! the latter before knowing how many clobber-saves we have, and also allow it +//! while SP is being adjusted to set up a call, we implement a "nominal SP" +//! tracking feature by which a fixup (distance between actual SP and a +//! "nominal" SP) is known at each instruction. +//! +//! # Stack Layout +//! +//! The stack looks like: +//! +//! ```plain +//! (high address) +//! +//! +---------------------------+ +//! | ... | +//! | stack args | +//! | (accessed via FP) | +//! +---------------------------+ +//! SP at function entry -----> | return address | +//! +---------------------------+ +//! FP after prologue --------> | FP (pushed by prologue) | +//! +---------------------------+ +//! | ... | +//! | spill slots | +//! | (accessed via nominal SP) | +//! | ... | +//! | stack slots | +//! | (accessed via nominal SP) | +//! nominal SP ---------------> | (alloc'd by prologue) | +//! +---------------------------+ +//! | ... | +//! | clobbered callee-saves | +//! SP at end of prologue ----> | (pushed by prologue) | +//! +---------------------------+ +//! | [alignment as needed] | +//! | ... | +//! | args for call | +//! SP before making a call --> | (pushed at callsite) | +//! +---------------------------+ +//! +//! (low address) +//! ``` +//! +//! # Multi-value Returns +//! +//! 
Note that we support multi-value returns in two ways. First, we allow for +//! multiple return-value registers. Second, if the appropriate flag is set, we +//! support the SpiderMonkey Wasm ABI. For details of the multi-value return +//! ABI, see: +//! +//! https://searchfox.org/mozilla-central/rev/bc3600def806859c31b2c7ac06e3d69271052a89/js/src/wasm/WasmStubs.h#134 +//! +//! In brief: +//! - Return values are processed in *reverse* order. +//! - The first return value in this order (so the last return) goes into the +//! ordinary return register. +//! - Any further returns go in a struct-return area, allocated upwards (in +//! address order) during the reverse traversal. +//! - This struct-return area is provided by the caller, and a pointer to its +//! start is passed as an invisible last (extra) argument. Normally the caller +//! will allocate this area on the stack. When we generate calls, we place it +//! just above the on-stack argument area. +//! - So, for example, a function returning 4 i64's (v0, v1, v2, v3), with no +//! formal arguments, would: +//! - Accept a pointer `P` to the struct return area as a hidden argument in the +//! first argument register on entry. +//! - Return v3 in the one and only return-value register. +//! - Return v2 in memory at `[P]`. +//! - Return v1 in memory at `[P+8]`. +//! - Return v0 in memory at `[P+16]`. + +use super::abi::*; +use crate::binemit::StackMap; +use crate::ir::types::*; +use crate::ir::{ArgumentExtension, StackSlot}; +use crate::machinst::*; +use crate::settings; +use crate::CodegenResult; +use crate::{ir, isa}; +use alloc::vec::Vec; +use log::{debug, trace}; +use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; +use std::convert::TryFrom; +use std::marker::PhantomData; +use std::mem; + +/// A location for an argument or return value. +#[derive(Clone, Copy, Debug)] +pub enum ABIArg { + /// In a real register. + Reg( + RealReg, + ir::Type, + ir::ArgumentExtension, + ir::ArgumentPurpose, + ), + /// Arguments only: on stack, at given offset from SP at entry. + Stack(i64, ir::Type, ir::ArgumentExtension, ir::ArgumentPurpose), +} + +impl ABIArg { + /// Get the purpose of this arg. + fn get_purpose(self) -> ir::ArgumentPurpose { + match self { + ABIArg::Reg(_, _, _, purpose) => purpose, + ABIArg::Stack(_, _, _, purpose) => purpose, + } + } +} + +/// Are we computing information about arguments or return values? Much of the +/// handling is factored out into common routines; this enum allows us to +/// distinguish which case we're handling. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ArgsOrRets { + /// Arguments. + Args, + /// Return values. + Rets, +} + +/// Is an instruction returned by an ABI machine-specific backend a safepoint, +/// or not? +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InstIsSafepoint { + /// The instruction is a safepoint. + Yes, + /// The instruction is not a safepoint. + No, +} + +/// Abstract location for a machine-specific ABI impl to translate into the +/// appropriate addressing mode. +#[derive(Clone, Copy, Debug)] +pub enum StackAMode { + /// Offset from the frame pointer, possibly making use of a specific type + /// for a scaled indexing operation. + FPOffset(i64, ir::Type), + /// Offset from the nominal stack pointer, possibly making use of a specific + /// type for a scaled indexing operation. + NominalSPOffset(i64, ir::Type), + /// Offset from the real stack pointer, possibly making use of a specific + /// type for a scaled indexing operation.
+ SPOffset(i64, ir::Type), +} + +/// Trait implemented by machine-specific backend to provide information about +/// register assignments and to allow generating the specific instructions for +/// stack loads/saves, prologues/epilogues, etc. +pub trait ABIMachineSpec { + /// The instruction type. + type I: VCodeInst; + + /// Returns the number of bits in a word, that is 32/64 for 32/64-bit architecture. + fn word_bits() -> u32; + + /// Returns the number of bytes in a word. + fn word_bytes() -> u32 { + return Self::word_bits() / 8; + } + + /// Returns word-size integer type. + fn word_type() -> Type { + match Self::word_bits() { + 32 => I32, + 64 => I64, + _ => unreachable!(), + } + } + + /// Returns word register class. + fn word_reg_class() -> RegClass { + match Self::word_bits() { + 32 => RegClass::I32, + 64 => RegClass::I64, + _ => unreachable!(), + } + } + + /// Returns required stack alignment in bytes. + fn stack_align(call_conv: isa::CallConv) -> u32; + + /// Process a list of parameters or return values and allocate them to registers + /// and stack slots. + /// + /// Returns the list of argument locations, the stack-space used (rounded up + /// to as alignment requires), and if `add_ret_area_ptr` was passed, the + /// index of the extra synthetic arg that was added. + fn compute_arg_locs( + call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)>; + + /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return + /// address, and maybe other standard elements depending on ABI (e.g. Wasm TLS reg). + fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64; + + /// Generate a load from the stack. + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I; + + /// Generate a store to the stack. + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I; + + /// Generate a move. + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I; + + /// Generate an integer-extend operation. + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + is_signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Self::I; + + /// Generate a return instruction. + fn gen_ret() -> Self::I; + + /// Generate an "epilogue placeholder" instruction, recognized by lowering + /// when using the Baldrdash ABI. + fn gen_epilogue_placeholder() -> Self::I; + + /// Generate an add-with-immediate. Note that even if this uses a scratch + /// register, it must satisfy two requirements: + /// + /// - The add-imm sequence must only clobber caller-save registers, because + /// it will be placed in the prologue before the clobbered callee-save + /// registers are saved. + /// + /// - The add-imm sequence must work correctly when `from_reg` and/or + /// `into_reg` are the register returned by `get_stacklimit_reg()`. + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]>; + + /// Generate a sequence that traps with a `TrapCode::StackOverflow` code if + /// the stack pointer is less than the given limit register (assuming the + /// stack grows downward). + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]>; + + /// Generate an instruction to compute an address of a stack slot (FP- or + /// SP-based offset). + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I; + + /// Get a fixed register to use to compute a stack limit. 
This is needed for + /// certain sequences generated after the register allocator has already + /// run. This must satisfy two requirements: + /// + /// - It must be a caller-save register, because it will be clobbered in the + /// prologue before the clobbered callee-save registers are saved. + /// + /// - It must be safe to pass as an argument and/or destination to + /// `gen_add_imm()`. This is relevant when an addition with a large + /// immediate needs its own temporary; it cannot use the same fixed + /// temporary as this one. + fn get_stacklimit_reg() -> Reg; + + /// Generate a load from the given [base+offset] address. + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I; + + /// Generate a store to the given [base+offset] address. + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I; + + /// Adjust the stack pointer up or down. + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]>; + + /// Generate a meta-instruction that adjusts the nominal SP offset. + fn gen_nominal_sp_adj(amount: i32) -> Self::I; + + /// Generate the usual frame-setup sequence for this architecture: e.g., + /// `push rbp / mov rbp, rsp` on x86-64, or `stp fp, lr, [sp, #-16]!` on + /// AArch64. + fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]>; + + /// Generate the usual frame-restore sequence for this architecture. + fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]>; + + /// Generate a clobber-save sequence. This takes the list of *all* registers + /// written/modified by the function body. The implementation here is + /// responsible for determining which of these are callee-saved according to + /// the ABI. It should return a sequence of instructions that "push" or + /// otherwise save these values to the stack. The sequence of instructions + /// should adjust the stack pointer downward, and should align as necessary + /// according to ABI requirements. + /// + /// Returns stack bytes used as well as instructions. Does not adjust + /// nominal SP offset; caller will do that. + fn gen_clobber_save( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + outgoing_args_size: u32, + ) -> (u64, SmallVec<[Self::I; 16]>); + + /// Generate a clobber-restore sequence. This sequence should perform the + /// opposite of the clobber-save sequence generated above, assuming that SP + /// going into the sequence is at the same point that it was left when the + /// clobber-save sequence finished. + fn gen_clobber_restore( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + outgoing_args_size: u32, + ) -> SmallVec<[Self::I; 16]>; + + /// Generate a call instruction/sequence. This method is provided one + /// temporary register to use to synthesize the called address, if needed. + fn gen_call( + dest: &CallDest, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: ir::Opcode, + tmp: Writable<Reg>, + callee_conv: isa::CallConv, + caller_conv: isa::CallConv, + ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]>; + + /// Get the number of spillslots required for the given register-class and + /// type. + fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32; + + /// Get the current virtual-SP offset from an instruction-emission state.
+ fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64; + + /// Get the "nominal SP to FP" offset from an instruction-emission state. + fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64; + + /// Get all caller-save registers, that is, registers that we expect + /// not to be saved across a call to a callee with the given ABI. + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>>; +} + +/// ABI information shared between body (callee) and caller. +struct ABISig { + /// Argument locations (regs or stack slots). Stack offsets are relative to + /// SP on entry to function. + args: Vec<ABIArg>, + /// Return-value locations. Stack offsets are relative to the return-area + /// pointer. + rets: Vec<ABIArg>, + /// Space on stack used to store arguments. + stack_arg_space: i64, + /// Space on stack used to store return values. + stack_ret_space: i64, + /// Index in `args` of the stack-return-value-area argument. + stack_ret_arg: Option<usize>, + /// Calling convention used. + call_conv: isa::CallConv, +} + +impl ABISig { + fn from_func_sig<M: ABIMachineSpec>(sig: &ir::Signature) -> CodegenResult<ABISig> { + // Compute args and retvals from signature. Handle retvals first, + // because we may need to add a return-area arg to the args. + let (rets, stack_ret_space, _) = M::compute_arg_locs( + sig.call_conv, + &sig.returns, + ArgsOrRets::Rets, + /* extra ret-area ptr = */ false, + )?; + let need_stack_return_area = stack_ret_space > 0; + let (args, stack_arg_space, stack_ret_arg) = M::compute_arg_locs( + sig.call_conv, + &sig.params, + ArgsOrRets::Args, + need_stack_return_area, + )?; + + trace!( + "ABISig: sig {:?} => args = {:?} rets = {:?} arg stack = {} ret stack = {} stack_ret_arg = {:?}", + sig, + args, + rets, + stack_arg_space, + stack_ret_space, + stack_ret_arg + ); + + Ok(ABISig { + args, + rets, + stack_arg_space, + stack_ret_space, + stack_ret_arg, + call_conv: sig.call_conv, + }) + } +} + +/// ABI object for a function body. +pub struct ABICalleeImpl<M: ABIMachineSpec> { + /// Signature: arg and retval regs. + sig: ABISig, + /// Offsets to each stackslot. + stackslots: Vec<u32>, + /// Total stack size of all stackslots. + stackslots_size: u32, + /// Stack size to be reserved for outgoing arguments. + outgoing_args_size: u32, + /// Clobbered registers, from regalloc. + clobbered: Set<Writable<RealReg>>, + /// Total number of spillslots, from regalloc. + spillslots: Option<usize>, + /// Storage allocated for the fixed part of the stack frame. This is + /// usually the same as the total frame size below, except in the case + /// of the baldrdash calling convention. + fixed_frame_storage_size: u32, + /// "Total frame size", as defined by "distance between FP and nominal SP". + /// Some items are pushed below nominal SP, so the function may actually use + /// more stack than this would otherwise imply. It is simply the initial + /// frame/allocation size needed for stackslots and spillslots. + total_frame_size: Option<u32>, + /// The register holding the return-area pointer, if needed. + ret_area_ptr: Option<Writable<Reg>>, + /// Calling convention this function expects. + call_conv: isa::CallConv, + /// The settings controlling this function's compilation. 
+ flags: settings::Flags, + /// Whether or not this function is a "leaf", meaning it calls no other + /// functions + is_leaf: bool, + /// If this function has a stack limit specified, then `Reg` is where the + /// stack limit will be located after the instructions specified have been + /// executed. + /// + /// Note that this is intended for insertion into the prologue, if + /// present. Also note that because the instructions here execute in the + /// prologue this happens after legalization/register allocation/etc so we + /// need to be extremely careful with each instruction. The instructions are + /// manually register-allocated and carefully only use caller-saved + /// registers and keep nothing live after this sequence of instructions. + stack_limit: Option<(Reg, Vec<M::I>)>, + + _mach: PhantomData<M>, +} + +fn get_special_purpose_param_register( + f: &ir::Function, + abi: &ABISig, + purpose: ir::ArgumentPurpose, +) -> Option<Reg> { + let idx = f.signature.special_param_index(purpose)?; + match abi.args[idx] { + ABIArg::Reg(reg, ..) => Some(reg.to_reg()), + ABIArg::Stack(..) => None, + } +} + +impl<M: ABIMachineSpec> ABICalleeImpl<M> { + /// Create a new body ABI instance. + pub fn new(f: &ir::Function, flags: settings::Flags) -> CodegenResult<Self> { + debug!("ABI: func signature {:?}", f.signature); + + let sig = ABISig::from_func_sig::<M>(&f.signature)?; + + let call_conv = f.signature.call_conv; + // Only these calling conventions are supported. + debug_assert!( + call_conv == isa::CallConv::SystemV + || call_conv == isa::CallConv::Fast + || call_conv == isa::CallConv::Cold + || call_conv.extends_baldrdash(), + "Unsupported calling convention: {:?}", + call_conv + ); + + // Compute stackslot locations and total stackslot size. + let mut stack_offset: u32 = 0; + let mut stackslots = vec![]; + for (stackslot, data) in f.stack_slots.iter() { + let off = stack_offset; + stack_offset += data.size; + let mask = M::word_bytes() - 1; + stack_offset = (stack_offset + mask) & !mask; + debug_assert_eq!(stackslot.as_u32() as usize, stackslots.len()); + stackslots.push(off); + } + + // Figure out what instructions, if any, will be needed to check the + // stack limit. This can either be specified as a special-purpose + // argument or as a global value which often calculates the stack limit + // from the arguments. + let stack_limit = + get_special_purpose_param_register(f, &sig, ir::ArgumentPurpose::StackLimit) + .map(|reg| (reg, Vec::new())) + .or_else(|| f.stack_limit.map(|gv| gen_stack_limit::<M>(f, &sig, gv))); + + Ok(Self { + sig, + stackslots, + stackslots_size: stack_offset, + outgoing_args_size: 0, + clobbered: Set::empty(), + spillslots: None, + fixed_frame_storage_size: 0, + total_frame_size: None, + ret_area_ptr: None, + call_conv, + flags, + is_leaf: f.is_leaf(), + stack_limit, + _mach: PhantomData, + }) + } + + /// Inserts instructions necessary for checking the stack limit into the + /// prologue. + /// + /// This function will generate instructions necessary for perform a stack + /// check at the header of a function. The stack check is intended to trap + /// if the stack pointer goes below a particular threshold, preventing stack + /// overflow in wasm or other code. The `stack_limit` argument here is the + /// register which holds the threshold below which we're supposed to trap. + /// This function is known to allocate `stack_size` bytes and we'll push + /// instructions onto `insts`. 
+ /// + /// Note that the instructions generated here are special because this is + /// happening so late in the pipeline (e.g. after register allocation). This + /// means that we need to do manual register allocation here and also be + /// careful to not clobber any callee-saved or argument registers. For now + /// this routine makes do with the `spilltmp_reg` as one temporary + /// register, and a second register of `tmp2` which is caller-saved. This + /// should be fine for us since no spills should happen in this sequence of + /// instructions, so our register won't get accidentally clobbered. + /// + /// No values can be live after the prologue, but in this case that's ok + /// because we just need to perform a stack check before progressing with + /// the rest of the function. + fn insert_stack_check(&self, stack_limit: Reg, stack_size: u32, insts: &mut Vec<M::I>) { + // With no explicit stack allocated we can just emit the simple check of + // the stack registers against the stack limit register, and trap if + // it's out of bounds. + if stack_size == 0 { + insts.extend(M::gen_stack_lower_bound_trap(stack_limit)); + return; + } + + // Note that the 32k stack size here is pretty special. See the + // documentation in x86/abi.rs for why this is here. The general idea is + // that we're protecting against overflow in the addition that happens + // below. + if stack_size >= 32 * 1024 { + insts.extend(M::gen_stack_lower_bound_trap(stack_limit)); + } + + // Add the `stack_size` to `stack_limit`, placing the result in + // `scratch`. + // + // Note though that `stack_limit`'s register may be the same as + // `scratch`. If our stack size doesn't fit into an immediate this + // means we need a second scratch register for loading the stack size + // into a register. + let scratch = Writable::from_reg(M::get_stacklimit_reg()); + insts.extend(M::gen_add_imm(scratch, stack_limit, stack_size).into_iter()); + insts.extend(M::gen_stack_lower_bound_trap(scratch.to_reg())); + } +} + +/// Generates the instructions necessary for the `gv` to be materialized into a +/// register. +/// +/// This function will return a register that will contain the result of +/// evaluating `gv`. It will also return any instructions necessary to calculate +/// the value of the register. +/// +/// Note that global values are typically lowered to instructions via the +/// standard legalization pass. Unfortunately though prologue generation happens +/// so late in the pipeline that we can't use these legalization passes to +/// generate the instructions for `gv`. As a result we duplicate some lowering +/// of `gv` here and support only some global values. This is similar to what +/// the x86 backend does for now, and hopefully this can be somewhat cleaned up +/// in the future too! +/// +/// Also note that this function will make use of `writable_spilltmp_reg()` as a +/// temporary register to store values in if necessary. Currently after we write +/// to this register there's guaranteed to be no spilled values between where +/// it's used, because we're not participating in register allocation anyway! 
+fn gen_stack_limit<M: ABIMachineSpec>( + f: &ir::Function, + abi: &ABISig, + gv: ir::GlobalValue, +) -> (Reg, Vec<M::I>) { + let mut insts = Vec::new(); + let reg = generate_gv::<M>(f, abi, gv, &mut insts); + return (reg, insts); +} + +fn generate_gv<M: ABIMachineSpec>( + f: &ir::Function, + abi: &ABISig, + gv: ir::GlobalValue, + insts: &mut Vec<M::I>, +) -> Reg { + match f.global_values[gv] { + // Return the direct register the vmcontext is in + ir::GlobalValueData::VMContext => { + get_special_purpose_param_register(f, abi, ir::ArgumentPurpose::VMContext) + .expect("no vmcontext parameter found") + } + // Load our base value into a register, then load from that register + // in to a temporary register. + ir::GlobalValueData::Load { + base, + offset, + global_type: _, + readonly: _, + } => { + let base = generate_gv::<M>(f, abi, base, insts); + let into_reg = Writable::from_reg(M::get_stacklimit_reg()); + insts.push(M::gen_load_base_offset( + into_reg, + base, + offset.into(), + M::word_type(), + )); + return into_reg.to_reg(); + } + ref other => panic!("global value for stack limit not supported: {}", other), + } +} + +/// Return a type either from an optional type hint, or if not, from the default +/// type associated with the given register's class. This is used to generate +/// loads/spills appropriately given the type of value loaded/stored (which may +/// be narrower than the spillslot). We usually have the type because the +/// regalloc usually provides the vreg being spilled/reloaded, and we know every +/// vreg's type. However, the regalloc *can* request a spill/reload without an +/// associated vreg when needed to satisfy a safepoint (which requires all +/// ref-typed values, even those in real registers in the original vcode, to be +/// in spillslots). +fn ty_from_ty_hint_or_reg_class<M: ABIMachineSpec>(r: Reg, ty: Option<Type>) -> Type { + match (ty, r.get_class()) { + // If the type is provided + (Some(t), _) => t, + // If no type is provided, this should be a register spill for a + // safepoint, so we only expect I32/I64 (integer) registers. + (None, rc) if rc == M::word_reg_class() => M::word_type(), + _ => panic!("Unexpected register class!"), + } +} + +impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> { + type I = M::I; + + fn temp_needed(&self) -> bool { + self.sig.stack_ret_arg.is_some() + } + + fn init(&mut self, maybe_tmp: Option<Writable<Reg>>) { + if self.sig.stack_ret_arg.is_some() { + assert!(maybe_tmp.is_some()); + self.ret_area_ptr = maybe_tmp; + } + } + + fn accumulate_outgoing_args_size(&mut self, size: u32) { + if size > self.outgoing_args_size { + self.outgoing_args_size = size; + } + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn call_conv(&self) -> isa::CallConv { + self.sig.call_conv + } + + fn liveins(&self) -> Set<RealReg> { + let mut set: Set<RealReg> = Set::empty(); + for &arg in &self.sig.args { + if let ABIArg::Reg(r, ..) = arg { + set.insert(r); + } + } + set + } + + fn liveouts(&self) -> Set<RealReg> { + let mut set: Set<RealReg> = Set::empty(); + for &ret in &self.sig.rets { + if let ABIArg::Reg(r, ..) 
= ret { + set.insert(r); + } + } + set + } + + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn num_retvals(&self) -> usize { + self.sig.rets.len() + } + + fn num_stackslots(&self) -> usize { + self.stackslots.len() + } + + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I { + match &self.sig.args[idx] { + // Extension mode doesn't matter (we're copying out, not in; we + // ignore high bits by convention). + &ABIArg::Reg(r, ty, ..) => M::gen_move(into_reg, r.to_reg(), ty), + &ABIArg::Stack(off, ty, ..) => M::gen_load_stack( + StackAMode::FPOffset(M::fp_to_arg_offset(self.call_conv, &self.flags) + off, ty), + into_reg, + ty, + ), + } + } + + fn arg_is_needed_in_body(&self, idx: usize) -> bool { + match self.sig.args[idx].get_purpose() { + // Special Baldrdash-specific pseudo-args that are present only to + // fill stack slots. Won't ever be used as ordinary values in the + // body. + ir::ArgumentPurpose::CalleeTLS | ir::ArgumentPurpose::CallerTLS => false, + _ => true, + } + } + + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable<Reg>) -> Vec<Self::I> { + let mut ret = Vec::new(); + let word_bits = M::word_bits() as u8; + match &self.sig.rets[idx] { + &ABIArg::Reg(r, ty, ext, ..) => { + let from_bits = ty_bits(ty) as u8; + let dest_reg = Writable::from_reg(r.to_reg()); + match (ext, from_bits) { + (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) + if n < word_bits => + { + let signed = ext == ArgumentExtension::Sext; + ret.push(M::gen_extend( + dest_reg, + from_reg.to_reg(), + signed, + from_bits, + /* to_bits = */ word_bits, + )); + } + _ => ret.push(M::gen_move(dest_reg, from_reg.to_reg(), ty)), + }; + } + &ABIArg::Stack(off, mut ty, ext, ..) => { + let from_bits = ty_bits(ty) as u8; + // A machine ABI implementation should ensure that stack frames + // have "reasonable" size. All current ABIs for machinst + // backends (aarch64 and x64) enforce a 128MB limit. + let off = i32::try_from(off) + .expect("Argument stack offset greater than 2GB; should hit impl limit first"); + // Trash the from_reg; it should be its last use. + match (ext, from_bits) { + (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) + if n < word_bits => + { + assert_eq!(M::word_reg_class(), from_reg.to_reg().get_class()); + let signed = ext == ArgumentExtension::Sext; + ret.push(M::gen_extend( + from_reg, + from_reg.to_reg(), + signed, + from_bits, + /* to_bits = */ word_bits, + )); + // Store the extended version. + ty = M::word_type(); + } + _ => {} + }; + ret.push(M::gen_store_base_offset( + self.ret_area_ptr.unwrap().to_reg(), + off, + from_reg.to_reg(), + ty, + )); + } + } + ret + } + + fn gen_retval_area_setup(&self) -> Option<Self::I> { + if let Some(i) = self.sig.stack_ret_arg { + let inst = self.gen_copy_arg_to_reg(i, self.ret_area_ptr.unwrap()); + trace!( + "gen_retval_area_setup: inst {:?}; ptr reg is {:?}", + inst, + self.ret_area_ptr.unwrap().to_reg() + ); + Some(inst) + } else { + trace!("gen_retval_area_setup: not needed"); + None + } + } + + fn gen_ret(&self) -> Self::I { + M::gen_ret() + } + + fn gen_epilogue_placeholder(&self) -> Self::I { + M::gen_epilogue_placeholder() + } + + fn set_num_spillslots(&mut self, slots: usize) { + self.spillslots = Some(slots); + } + + fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>) { + self.clobbered = clobbered; + } + + /// Load from a stackslot. 
+ fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable<Reg>, + ) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off); + M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + } + + /// Store to a stackslot. + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off); + M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + } + + /// Produce an instruction that computes a stackslot address. + fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + M::gen_get_stack_addr(StackAMode::NominalSPOffset(sp_off, I8), into_reg, I8) + } + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Self::I { + // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. + let islot = slot.get() as i64; + let spill_off = islot * M::word_bytes() as i64; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + } + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I { + // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. 
+ let islot = slot.get() as i64; + let spill_off = islot * M::word_bytes() as i64; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + } + + fn spillslots_to_stack_map( + &self, + slots: &[SpillSlot], + state: &<Self::I as MachInstEmit>::State, + ) -> StackMap { + let virtual_sp_offset = M::get_virtual_sp_offset_from_state(state); + let nominal_sp_to_fp = M::get_nominal_sp_to_fp(state); + assert!(virtual_sp_offset >= 0); + trace!( + "spillslots_to_stackmap: slots = {:?}, state = {:?}", + slots, + state + ); + let map_size = (virtual_sp_offset + nominal_sp_to_fp) as u32; + let bytes = M::word_bytes(); + let map_words = (map_size + bytes - 1) / bytes; + let mut bits = std::iter::repeat(false) + .take(map_words as usize) + .collect::<Vec<bool>>(); + + let first_spillslot_word = + ((self.stackslots_size + virtual_sp_offset as u32) / bytes) as usize; + for &slot in slots { + let slot = slot.get() as usize; + bits[first_spillslot_word + slot] = true; + } + + StackMap::from_slice(&bits[..]) + } + + fn gen_prologue(&mut self) -> Vec<Self::I> { + let mut insts = vec![]; + if !self.call_conv.extends_baldrdash() { + // set up frame + insts.extend(M::gen_prologue_frame_setup().into_iter()); + } + + let bytes = M::word_bytes(); + let mut total_stacksize = self.stackslots_size + bytes * self.spillslots.unwrap() as u32; + if self.call_conv.extends_baldrdash() { + debug_assert!( + !self.flags.enable_probestack(), + "baldrdash does not expect cranelift to emit stack probes" + ); + total_stacksize += self.flags.baldrdash_prologue_words() as u32 * bytes; + } + let mask = M::stack_align(self.call_conv) - 1; + let total_stacksize = (total_stacksize + mask) & !mask; // 16-align the stack. + + if !self.call_conv.extends_baldrdash() { + // Leaf functions with zero stack don't need a stack check if one's + // specified, otherwise always insert the stack check. + if total_stacksize > 0 || !self.is_leaf { + if let Some((reg, stack_limit_load)) = &self.stack_limit { + insts.extend_from_slice(stack_limit_load); + self.insert_stack_check(*reg, total_stacksize, &mut insts); + } + } + if total_stacksize > 0 { + self.fixed_frame_storage_size += total_stacksize; + } + } + + // N.B.: "nominal SP", which we use to refer to stackslots and + // spillslots, is defined to be equal to the stack pointer at this point + // in the prologue. + // + // If we push any clobbers below, we emit a virtual-SP adjustment + // meta-instruction so that the nominal SP references behave as if SP + // were still at this point. See documentation for + // [crate::machinst::abi_impl](this module) for more details on + // stackframe layout and nominal SP maintenance. + + // Save clobbered registers. + let (clobber_size, clobber_insts) = M::gen_clobber_save( + self.call_conv, + &self.flags, + &self.clobbered, + self.fixed_frame_storage_size, + self.outgoing_args_size, + ); + insts.extend(clobber_insts); + + let sp_adj = self.outgoing_args_size as i32 + clobber_size as i32; + if sp_adj > 0 { + insts.push(M::gen_nominal_sp_adj(sp_adj)); + } + + self.total_frame_size = Some(total_stacksize); + insts + } + + fn gen_epilogue(&self) -> Vec<M::I> { + let mut insts = vec![]; + + // Restore clobbered registers. 
+ insts.extend(M::gen_clobber_restore( + self.call_conv, + &self.flags, + &self.clobbered, + self.fixed_frame_storage_size, + self.outgoing_args_size, + )); + + // N.B.: we do *not* emit a nominal SP adjustment here, because (i) there will be no + // references to nominal SP offsets before the return below, and (ii) the instruction + // emission tracks running SP offset linearly (in straight-line order), not according to + // the CFG, so early returns in the middle of function bodies would cause an incorrect + // offset for the rest of the body. + + if !self.call_conv.extends_baldrdash() { + insts.extend(M::gen_epilogue_frame_restore()); + insts.push(M::gen_ret()); + } + + debug!("Epilogue: {:?}", insts); + insts + } + + fn frame_size(&self) -> u32 { + self.total_frame_size + .expect("frame size not computed before prologue generation") + } + + fn stack_args_size(&self) -> u32 { + self.sig.stack_arg_space as u32 + } + + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 { + M::get_number_of_spillslots_for_value(rc, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Option<Type>) -> Self::I { + let ty = ty_from_ty_hint_or_reg_class::<M>(from_reg.to_reg(), ty); + self.store_spillslot(to_slot, ty, from_reg.to_reg()) + } + + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + ty: Option<Type>, + ) -> Self::I { + let ty = ty_from_ty_hint_or_reg_class::<M>(to_reg.to_reg().to_reg(), ty); + self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg())) + } + + fn unwind_info_kind(&self) -> UnwindInfoKind { + match self.sig.call_conv { + #[cfg(feature = "unwind")] + isa::CallConv::Fast | isa::CallConv::Cold | isa::CallConv::SystemV => { + UnwindInfoKind::SystemV + } + #[cfg(feature = "unwind")] + isa::CallConv::WindowsFastcall => UnwindInfoKind::Windows, + _ => UnwindInfoKind::None, + } + } +} + +fn abisig_to_uses_and_defs<M: ABIMachineSpec>(sig: &ABISig) -> (Vec<Reg>, Vec<Writable<Reg>>) { + // Compute uses: all arg regs. + let mut uses = Vec::new(); + for arg in &sig.args { + match arg { + &ABIArg::Reg(reg, ..) => uses.push(reg.to_reg()), + _ => {} + } + } + + // Compute defs: all retval regs, and all caller-save (clobbered) regs. + let mut defs = M::get_regs_clobbered_by_call(sig.call_conv); + for ret in &sig.rets { + match ret { + &ABIArg::Reg(reg, ..) => defs.push(Writable::from_reg(reg.to_reg())), + _ => {} + } + } + + (uses, defs) +} + +/// ABI object for a callsite. +pub struct ABICallerImpl<M: ABIMachineSpec> { + /// The called function's signature. + sig: ABISig, + /// All uses for the callsite, i.e., function args. + uses: Vec<Reg>, + /// All defs for the callsite, i.e., return values and caller-saves. + defs: Vec<Writable<Reg>>, + /// Call destination. + dest: CallDest, + /// Actual call opcode; used to distinguish various types of calls. + opcode: ir::Opcode, + /// Caller's calling convention. + caller_conv: isa::CallConv, + + _mach: PhantomData<M>, +} + +/// Destination for a call. +#[derive(Debug, Clone)] +pub enum CallDest { + /// Call to an ExtName (named function symbol). + ExtName(ir::ExternalName, RelocDistance), + /// Indirect call to a function pointer in a register. + Reg(Reg), +} + +impl<M: ABIMachineSpec> ABICallerImpl<M> { + /// Create a callsite ABI object for a call directly to the specified function. 
+ pub fn from_func( + sig: &ir::Signature, + extname: &ir::ExternalName, + dist: RelocDistance, + caller_conv: isa::CallConv, + ) -> CodegenResult<ABICallerImpl<M>> { + let sig = ABISig::from_func_sig::<M>(sig)?; + let (uses, defs) = abisig_to_uses_and_defs::<M>(&sig); + Ok(ABICallerImpl { + sig, + uses, + defs, + dest: CallDest::ExtName(extname.clone(), dist), + opcode: ir::Opcode::Call, + caller_conv, + _mach: PhantomData, + }) + } + + /// Create a callsite ABI object for a call to a function pointer with the + /// given signature. + pub fn from_ptr( + sig: &ir::Signature, + ptr: Reg, + opcode: ir::Opcode, + caller_conv: isa::CallConv, + ) -> CodegenResult<ABICallerImpl<M>> { + let sig = ABISig::from_func_sig::<M>(sig)?; + let (uses, defs) = abisig_to_uses_and_defs::<M>(&sig); + Ok(ABICallerImpl { + sig, + uses, + defs, + dest: CallDest::Reg(ptr), + opcode, + caller_conv, + _mach: PhantomData, + }) + } +} + +fn adjust_stack_and_nominal_sp<M: ABIMachineSpec, C: LowerCtx<I = M::I>>( + ctx: &mut C, + off: i32, + is_sub: bool, +) { + if off == 0 { + return; + } + let amt = if is_sub { -off } else { off }; + for inst in M::gen_sp_reg_adjust(amt) { + ctx.emit(inst); + } + ctx.emit(M::gen_nominal_sp_adj(-amt)); +} + +impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> { + type I = M::I; + + fn num_args(&self) -> usize { + if self.sig.stack_ret_arg.is_some() { + self.sig.args.len() - 1 + } else { + self.sig.args.len() + } + } + + fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + ctx.abi().accumulate_outgoing_args_size(off as u32); + } + + fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ true) + } + + fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ false) + } + + fn emit_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + from_reg: Reg, + ) { + let word_rc = M::word_reg_class(); + let word_bits = M::word_bits() as usize; + match &self.sig.args[idx] { + &ABIArg::Reg(reg, ty, ext, _) + if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits => + { + assert_eq!(word_rc, reg.get_class()); + let signed = match ext { + ir::ArgumentExtension::Uext => false, + ir::ArgumentExtension::Sext => true, + _ => unreachable!(), + }; + ctx.emit(M::gen_extend( + Writable::from_reg(reg.to_reg()), + from_reg, + signed, + ty_bits(ty) as u8, + word_bits as u8, + )); + } + &ABIArg::Reg(reg, ty, _, _) => { + ctx.emit(M::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty)); + } + &ABIArg::Stack(off, mut ty, ext, _) => { + if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits { + assert_eq!(word_rc, from_reg.get_class()); + let signed = match ext { + ir::ArgumentExtension::Uext => false, + ir::ArgumentExtension::Sext => true, + _ => unreachable!(), + }; + // Extend in place in the source register. Our convention is to + // treat high bits as undefined for values in registers, so this + // is safe, even for an argument that is nominally read-only. + ctx.emit(M::gen_extend( + Writable::from_reg(from_reg), + from_reg, + signed, + ty_bits(ty) as u8, + word_bits as u8, + )); + // Store the extended version. 
+ ty = M::word_type(); + } + ctx.emit(M::gen_store_stack( + StackAMode::SPOffset(off, ty), + from_reg, + ty, + )); + } + } + } + + fn emit_copy_retval_to_reg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + into_reg: Writable<Reg>, + ) { + match &self.sig.rets[idx] { + // Extension mode doesn't matter because we're copying out, not in, + // and we ignore high bits in our own registers by convention. + &ABIArg::Reg(reg, ty, _, _) => ctx.emit(M::gen_move(into_reg, reg.to_reg(), ty)), + &ABIArg::Stack(off, ty, _, _) => { + let ret_area_base = self.sig.stack_arg_space; + ctx.emit(M::gen_load_stack( + StackAMode::SPOffset(off + ret_area_base, ty), + into_reg, + ty, + )); + } + } + } + + fn emit_call<C: LowerCtx<I = Self::I>>(&mut self, ctx: &mut C) { + let (uses, defs) = ( + mem::replace(&mut self.uses, Default::default()), + mem::replace(&mut self.defs, Default::default()), + ); + let word_rc = M::word_reg_class(); + let word_type = M::word_type(); + if let Some(i) = self.sig.stack_ret_arg { + let rd = ctx.alloc_tmp(word_rc, word_type); + let ret_area_base = self.sig.stack_arg_space; + ctx.emit(M::gen_get_stack_addr( + StackAMode::SPOffset(ret_area_base, I8), + rd, + I8, + )); + self.emit_copy_reg_to_arg(ctx, i, rd.to_reg()); + } + let tmp = ctx.alloc_tmp(word_rc, word_type); + for (is_safepoint, inst) in M::gen_call( + &self.dest, + uses, + defs, + self.opcode, + tmp, + self.sig.call_conv, + self.caller_conv, + ) + .into_iter() + { + match is_safepoint { + InstIsSafepoint::Yes => ctx.emit_safepoint(inst), + InstIsSafepoint::No => ctx.emit(inst), + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/adapter.rs b/third_party/rust/cranelift-codegen/src/machinst/adapter.rs new file mode 100644 index 0000000000..43ff9e51a4 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/adapter.rs @@ -0,0 +1,140 @@ +//! Adapter for a `MachBackend` to implement the `TargetIsa` trait. + +use crate::binemit; +use crate::ir; +use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa}; +use crate::machinst::*; +use crate::regalloc::RegisterSet; +use crate::settings::Flags; + +#[cfg(feature = "testing_hooks")] +use crate::regalloc::RegDiversions; + +use core::any::Any; +use std::borrow::Cow; +use std::fmt; +use target_lexicon::Triple; + +/// A wrapper around a `MachBackend` that provides a `TargetIsa` impl. +pub struct TargetIsaAdapter { + backend: Box<dyn MachBackend + Send + Sync + 'static>, + triple: Triple, +} + +impl TargetIsaAdapter { + /// Create a new `TargetIsa` wrapper around a `MachBackend`. + pub fn new<B: MachBackend + Send + Sync + 'static>(backend: B) -> TargetIsaAdapter { + let triple = backend.triple(); + TargetIsaAdapter { + backend: Box::new(backend), + triple, + } + } +} + +impl fmt::Display for TargetIsaAdapter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("MachBackend") + .field("name", &self.backend.name()) + .field("triple", &self.backend.triple()) + .field("flags", &format!("{}", self.backend.flags())) + .finish() + } +} + +impl TargetIsa for TargetIsaAdapter { + fn name(&self) -> &'static str { + self.backend.name() + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &Flags { + self.backend.flags() + } + + fn register_info(&self) -> RegInfo { + // Called from function's Display impl, so we need a stub here. 
+ RegInfo { + banks: &[], + classes: &[], + } + } + + fn legal_encodings<'a>( + &'a self, + _func: &'a ir::Function, + _inst: &'a ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + panic!("Should not be called when new-style backend is available!") + } + + fn encode( + &self, + _func: &ir::Function, + _inst: &ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Result<Encoding, Legalize> { + panic!("Should not be called when new-style backend is available!") + } + + fn encoding_info(&self) -> EncInfo { + panic!("Should not be called when new-style backend is available!") + } + + fn legalize_signature(&self, _sig: &mut Cow<ir::Signature>, _current: bool) { + panic!("Should not be called when new-style backend is available!") + } + + fn regclass_for_abi_type(&self, _ty: ir::Type) -> RegClass { + panic!("Should not be called when new-style backend is available!") + } + + fn allocatable_registers(&self, _func: &ir::Function) -> RegisterSet { + panic!("Should not be called when new-style backend is available!") + } + + fn prologue_epilogue(&self, _func: &mut ir::Function) -> CodegenResult<()> { + panic!("Should not be called when new-style backend is available!") + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + _func: &ir::Function, + _inst: ir::Inst, + _divert: &mut RegDiversions, + _sink: &mut dyn binemit::CodeSink, + ) { + panic!("Should not be called when new-style backend is available!") + } + + /// Emit a whole function into memory. + fn emit_function_to_memory(&self, _func: &ir::Function, _sink: &mut binemit::MemoryCodeSink) { + panic!("Should not be called when new-style backend is available!") + } + + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + Some(&*self.backend) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_add_overflow_condition() + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_sub_overflow_condition() + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + self.backend.create_systemv_cie() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs b/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs new file mode 100644 index 0000000000..1052d83858 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs @@ -0,0 +1,624 @@ +//! Computation of basic block order in emitted code. +//! +//! This module handles the translation from CLIF BBs to VCode BBs. +//! +//! The basic idea is that we compute a sequence of "lowered blocks" that +//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit +//! block on *every* edge). Conceptually, the lowering pipeline wants to insert +//! moves for phi-nodes on every block-to-block transfer; these blocks always +//! conceptually exist, but may be merged with an "original" CLIF block (and +//! hence not actually exist; this is equivalent to inserting the blocks only on +//! critical edges). +//! +//! In other words, starting from a CFG like this (where each "CLIF block" and +//! "(edge N->M)" is a separate basic block): +//! +//! ```plain +//! +//! CLIF block 0 +//! / \ +//! (edge 0->1) (edge 0->2) +//! | | +//! CLIF block 1 CLIF block 2 +//! \ / +//! (edge 1->3) (edge 2->3) +//! \ / +//! CLIF block 3 +//! ``` +//! +//! We can produce a CFG of lowered blocks like so: +//! +//! ```plain +//! 
+--------------+ +//! | CLIF block 0 | +//! +--------------+ +//! / \ +//! +--------------+ +--------------+ +//! | (edge 0->1) | |(edge 0->2) | +//! | CLIF block 1 | | CLIF block 2 | +//! +--------------+ +--------------+ +//! \ / +//! +-----------+ +-----------+ +//! |(edge 1->3)| |(edge 2->3)| +//! +-----------+ +-----------+ +//! \ / +//! +------------+ +//! |CLIF block 3| +//! +------------+ +//! ``` +//! +//! (note that the edges into CLIF blocks 1 and 2 could be merged with those +//! blocks' original bodies, but the out-edges could not because for simplicity +//! in the successor-function definition, we only ever merge an edge onto one +//! side of an original CLIF block.) +//! +//! Each `LoweredBlock` names just an original CLIF block, an original CLIF +//! block prepended or appended with an edge block (never both, though), or just +//! an edge block. +//! +//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph +//! (never actually materialized, just defined by a "successors" function), and +//! compute the reverse postorder. +//! +//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for +//! example, consider any information about whether edge blocks will actually +//! have content, because this computation happens as part of lowering *before* +//! regalloc, and regalloc may or may not insert moves/spills/reloads on any +//! particular edge. But it works relatively well and is conceptually simple. +//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like +//! branch editing that in practice elides empty blocks and simplifies some of +//! the other redundancies that this scheme produces. + +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::ir::{Block, Function, Inst, Opcode}; +use crate::machinst::lower::visit_block_succs; +use crate::machinst::*; + +use log::debug; +use smallvec::SmallVec; + +/// Mapping from CLIF BBs to VCode BBs. +#[derive(Debug)] +pub struct BlockLoweringOrder { + /// Lowered blocks, in BlockIndex order. Each block is some combination of + /// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after; + /// see [LoweredBlock] for details. + lowered_order: Vec<LoweredBlock>, + /// Successors for all lowered blocks, in one serialized vector. Indexed by + /// the ranges in `lowered_succ_ranges`. + lowered_succs: Vec<(Inst, LoweredBlock)>, + /// BlockIndex values for successors for all lowered blocks, in the same + /// order as `lowered_succs`. + lowered_succ_indices: Vec<(Inst, BlockIndex)>, + /// Ranges in `lowered_succs` giving the successor lists for each lowered + /// block. Indexed by lowering-order index (`BlockIndex`). + lowered_succ_ranges: Vec<(usize, usize)>, + /// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that + /// some CLIF BBs may not be lowered; in particular, we skip unreachable + /// blocks. + orig_map: SecondaryMap<Block, Option<BlockIndex>>, +} + +/// The origin of a block in the lowered block-order: either an original CLIF +/// block, or an inserted edge-block, or a combination of the two if an edge is +/// non-critical. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum LoweredBlock { + /// Block in original CLIF, with no merged edge-blocks. + Orig { + /// Original CLIF block. + block: Block, + }, + /// Block in the original CLIF, plus edge-block to one succ (which is the + /// one successor of the original block). + OrigAndEdge { + /// The original CLIF block contained in this lowered block. 
+ block: Block, + /// The edge (jump) instruction transitioning from this block + /// to the next, i.e., corresponding to the included edge-block. This + /// will be an instruction in `block`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, + /// Block in the original CLIF, preceded by edge-block from one pred (which + /// is the one pred of the original block). + EdgeAndOrig { + /// The previous CLIF block, i.e., the edge block's predecessor. + pred: Block, + /// The edge (jump) instruction corresponding to the included + /// edge-block. This will be an instruction in `pred`. + edge_inst: Inst, + /// The original CLIF block included in this lowered block. + block: Block, + }, + /// Split critical edge between two CLIF blocks. This lowered block does not + /// correspond to any original CLIF blocks; it only serves as an insertion + /// point for work to happen on the transition from `pred` to `succ`. + Edge { + /// The predecessor CLIF block. + pred: Block, + /// The edge (jump) instruction corresponding to this edge's transition. + /// This will be an instruction in `pred`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, +} + +impl LoweredBlock { + /// The associated original (CLIF) block included in this lowered block, if + /// any. + pub fn orig_block(self) -> Option<Block> { + match self { + LoweredBlock::Orig { block, .. } + | LoweredBlock::OrigAndEdge { block, .. } + | LoweredBlock::EdgeAndOrig { block, .. } => Some(block), + LoweredBlock::Edge { .. } => None, + } + } + + /// The associated in-edge, if any. + pub fn in_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::EdgeAndOrig { + pred, + edge_inst, + block, + } => Some((pred, edge_inst, block)), + _ => None, + } + } + + /// the associated out-edge, if any. Also includes edge-only blocks. + pub fn out_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::OrigAndEdge { + block, + edge_inst, + succ, + } => Some((block, edge_inst, succ)), + LoweredBlock::Edge { + pred, + edge_inst, + succ, + } => Some((pred, edge_inst, succ)), + _ => None, + } + } +} + +impl BlockLoweringOrder { + /// Compute and return a lowered block order for `f`. + pub fn new(f: &Function) -> BlockLoweringOrder { + debug!("BlockLoweringOrder: function body {:?}", f); + + // Step 1: compute the in-edge and out-edge count of every block. + let mut block_in_count = SecondaryMap::with_default(0); + let mut block_out_count = SecondaryMap::with_default(0); + + // Cache the block successors to avoid re-examining branches below. + let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new(); + let mut block_succ_range = SecondaryMap::with_default((0, 0)); + let mut fallthrough_return_block = None; + for block in f.layout.blocks() { + let block_succ_start = block_succs.len(); + visit_block_succs(f, block, |inst, succ| { + block_out_count[block] += 1; + block_in_count[succ] += 1; + block_succs.push((inst, succ)); + }); + let block_succ_end = block_succs.len(); + block_succ_range[block] = (block_succ_start, block_succ_end); + + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode() == Opcode::Return { + // Implicit output edge for any return. + block_out_count[block] += 1; + } + if f.dfg[inst].opcode() == Opcode::FallthroughReturn { + // Fallthrough return block must come last. + debug_assert!(fallthrough_return_block == None); + fallthrough_return_block = Some(block); + } + } + } + // Implicit input edge for entry block. 
+ if let Some(entry) = f.layout.entry_block() { + block_in_count[entry] += 1; + } + + // Here we define the implicit CLIF-plus-edges graph. There are + // conceptually two such graphs: the original, with every edge explicit, + // and the merged one, with blocks (represented by `LoweredBlock` + // values) that contain original CLIF blocks, edges, or both. This + // function returns a lowered block's successors as per the latter, with + // consideration to edge-block merging. + // + // Note that there is a property of the block-merging rules below + // that is very important to ensure we don't miss any lowered blocks: + // any block in the implicit CLIF-plus-edges graph will *only* be + // included in one block in the merged graph. + // + // This, combined with the property that every edge block is reachable + // only from one predecessor (and hence cannot be reached by a DFS + // backedge), means that it is sufficient in our DFS below to track + // visited-bits per original CLIF block only, not per edge. This greatly + // simplifies the data structures (no need to keep a sparse hash-set of + // (block, block) tuples). + let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| { + let start_idx = ret.len(); + match block { + LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => { + // At an orig block; successors are always edge blocks, + // possibly with orig blocks following. + let range = block_succ_range[block]; + for &(edge_inst, succ) in &block_succs[range.0..range.1] { + if block_in_count[succ] == 1 { + ret.push(( + edge_inst, + LoweredBlock::EdgeAndOrig { + pred: block, + edge_inst, + block: succ, + }, + )); + } else { + ret.push(( + edge_inst, + LoweredBlock::Edge { + pred: block, + edge_inst, + succ, + }, + )); + } + } + } + LoweredBlock::Edge { + succ, edge_inst, .. + } + | LoweredBlock::OrigAndEdge { + succ, edge_inst, .. + } => { + // At an edge block; successors are always orig blocks, + // possibly with edge blocks following. + if block_out_count[succ] == 1 { + let range = block_succ_range[succ]; + // check if the one succ is a real CFG edge (vs. + // implicit return succ). + if range.1 - range.0 > 0 { + debug_assert!(range.1 - range.0 == 1); + let (succ_edge_inst, succ_succ) = block_succs[range.0]; + ret.push(( + edge_inst, + LoweredBlock::OrigAndEdge { + block: succ, + edge_inst: succ_edge_inst, + succ: succ_succ, + }, + )); + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } + } + let end_idx = ret.len(); + (start_idx, end_idx) + }; + + // Build the explicit LoweredBlock-to-LoweredBlock successors list. + let mut lowered_succs = vec![]; + let mut lowered_succ_indices = vec![]; + + // Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an + // explicit stack so we don't overflow the real stack with a deep DFS. + #[derive(Debug)] + struct StackEntry { + this: LoweredBlock, + succs: (usize, usize), // range in lowered_succs + cur_succ: usize, // index in lowered_succs + } + + let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new(); + let mut visited = FxHashSet::default(); + let mut postorder = vec![]; + if let Some(entry) = f.layout.entry_block() { + // FIXME(cfallin): we might be able to use OrigAndEdge. Find a way + // to not special-case the entry block here. 
+ let block = LoweredBlock::Orig { block: entry }; + visited.insert(block); + let range = compute_lowered_succs(&mut lowered_succs, block); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: block, + succs: range, + cur_succ: range.1, + }); + } + + let mut deferred_last = None; + while !stack.is_empty() { + let stack_entry = stack.last_mut().unwrap(); + let range = stack_entry.succs; + if stack_entry.cur_succ == range.0 { + let orig_block = stack_entry.this.orig_block(); + if orig_block.is_some() && orig_block == fallthrough_return_block { + deferred_last = Some((stack_entry.this, range)); + } else { + postorder.push((stack_entry.this, range)); + } + stack.pop(); + } else { + // Heuristic: chase the children in reverse. This puts the first + // successor block first in RPO, all other things being equal, + // which tends to prioritize loop backedges over out-edges, + // putting the edge-block closer to the loop body and minimizing + // live-ranges in linear instruction space. + let next = lowered_succs[stack_entry.cur_succ - 1].1; + stack_entry.cur_succ -= 1; + if visited.contains(&next) { + continue; + } + visited.insert(next); + let range = compute_lowered_succs(&mut lowered_succs, next); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: next, + succs: range, + cur_succ: range.1, + }); + } + } + + postorder.reverse(); + let mut rpo = postorder; + if let Some(d) = deferred_last { + rpo.push(d); + } + + // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps. + let mut lowered_order = vec![]; + let mut lowered_succ_ranges = vec![]; + let mut lb_to_bindex = FxHashMap::default(); + for (block, succ_range) in rpo.into_iter() { + lb_to_bindex.insert(block, lowered_order.len() as BlockIndex); + lowered_order.push(block); + lowered_succ_ranges.push(succ_range); + } + + let lowered_succ_indices = lowered_succs + .iter() + .map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap())) + .collect(); + + let mut orig_map = SecondaryMap::with_default(None); + for (i, lb) in lowered_order.iter().enumerate() { + let i = i as BlockIndex; + if let Some(b) = lb.orig_block() { + orig_map[b] = Some(i); + } + } + + let result = BlockLoweringOrder { + lowered_order, + lowered_succs, + lowered_succ_indices, + lowered_succ_ranges, + orig_map, + }; + debug!("BlockLoweringOrder: {:?}", result); + result + } + + /// Get the lowered order of blocks. + pub fn lowered_order(&self) -> &[LoweredBlock] { + &self.lowered_order[..] + } + + /// Get the successors for a lowered block, by index in `lowered_order()`'s + /// returned slice. Each successsor is paired with the edge-instruction + /// (branch) corresponding to this edge. + pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succs[range.0..range.1] + } + + /// Get the successor indices for a lowered block. + pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succ_indices[range.0..range.1] + } + + /// Get the lowered block index containing a CLIF block, if any. (May not be + /// present if the original CLIF block was unreachable.) 
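+ ///
+ /// A minimal usage sketch (illustrative only; `func` is assumed to be an
+ /// `ir::Function` built elsewhere, and the block is not compiled as a
+ /// doctest):
+ ///
+ /// ```ignore
+ /// let order = BlockLoweringOrder::new(&func);
+ /// for (i, lb) in order.lowered_order().iter().enumerate() {
+ ///     // Every reachable CLIF block is included in exactly one lowered block.
+ ///     if let Some(bb) = lb.orig_block() {
+ ///         assert_eq!(order.lowered_block_for_bb(bb), Some(i as BlockIndex));
+ ///     }
+ /// }
+ /// ```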
+ pub fn lowered_block_for_bb(&self, bb: Block) -> Option<BlockIndex> { + self.orig_map[bb] + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + + fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function { + assert!(n_blocks > 0); + + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + let blocks = (0..n_blocks) + .map(|i| { + let bb = func.dfg.make_block(); + assert!(bb.as_u32() == i as u32); + bb + }) + .collect::<Vec<_>>(); + + let arg0 = func.dfg.append_block_param(blocks[0], I32); + + let mut pos = FuncCursor::new(&mut func); + + let mut edge = 0; + for i in 0..n_blocks { + pos.insert_block(blocks[i]); + let mut succs = vec![]; + while edge < edges.len() && edges[edge].0 == i { + succs.push(edges[edge].1); + edge += 1; + } + if succs.len() == 0 { + pos.ins().return_(&[arg0]); + } else if succs.len() == 1 { + pos.ins().jump(blocks[succs[0]], &[]); + } else if succs.len() == 2 { + pos.ins().brnz(arg0, blocks[succs[0]], &[]); + pos.ins().jump(blocks[succs[1]], &[]); + } else { + panic!("Too many successors"); + } + } + + func + } + + #[test] + fn test_blockorder_diamond() { + let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 6); + + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + + assert!(order.lowered_order[2].orig_block().is_none()); + assert!(order.lowered_order[2].in_edge().is_none()); + assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[3].out_edge().is_none()); + + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[5].in_edge().is_none()); + assert!(order.lowered_order[5].out_edge().is_none()); + } + + #[test] + fn test_blockorder_critedge() { + // 0 + // / \ + // 1 2 + // / \ \ + // 3 4 | + // |\ _|____| + // | \/ | + // | /\ | + // 5 6 + // + // (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split) + // + let func = build_test_func( + 7, + &[ + (0, 1), + (0, 2), + (1, 3), + (1, 4), + (2, 5), + (3, 5), + (3, 6), + (4, 6), + ], + ); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 11); + println!("ordered = {:?}", order.lowered_order); + + // block 0 + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + 
assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + // edge 0->1 + block 1 + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + assert!(order.lowered_order[1].out_edge().is_none()); + + // edge 1->3 + block 3 + assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3); + assert!(order.lowered_order[2].out_edge().is_none()); + + // edge 3->5 + assert!(order.lowered_order[3].orig_block().is_none()); + assert!(order.lowered_order[3].in_edge().is_none()); + assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5); + + // edge 3->6 + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6); + + // edge 1->4 + block 4 + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4); + assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4); + assert!(order.lowered_order[5].out_edge().is_none()); + + // edge 4->6 + assert!(order.lowered_order[6].orig_block().is_none()); + assert!(order.lowered_order[6].in_edge().is_none()); + assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4); + assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6); + + // block 6 + assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6); + assert!(order.lowered_order[7].in_edge().is_none()); + assert!(order.lowered_order[7].out_edge().is_none()); + + // edge 0->2 + block 2 + assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[8].out_edge().is_none()); + + // edge 2->5 + assert!(order.lowered_order[9].orig_block().is_none()); + assert!(order.lowered_order[9].in_edge().is_none()); + assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5); + + // block 5 + assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5); + assert!(order.lowered_order[10].in_edge().is_none()); + assert!(order.lowered_order[10].out_edge().is_none()); + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/buffer.rs b/third_party/rust/cranelift-codegen/src/machinst/buffer.rs new file mode 100644 index 0000000000..b2187a9b68 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/buffer.rs @@ -0,0 +1,1813 @@ +//! In-memory representation of compiled machine code, with labels and fixups to +//! refer to those labels. Handles constant-pool island insertion and also +//! veneer insertion for out-of-range jumps. +//! +//! This code exists to solve three problems: +//! +//! - Branch targets for forward branches are not known until later, when we +//! emit code in a single pass through the instruction structs. +//! +//! - On many architectures, address references or offsets have limited range. +//! 
For example, on AArch64, conditional branches can only target code +/- 1MB +//! from the branch itself. +//! +//! - The lowering of control flow from the CFG-with-edges produced by +//! [BlockLoweringOrder](super::BlockLoweringOrder), combined with many empty +//! edge blocks when the register allocator does not need to insert any +//! spills/reloads/moves in edge blocks, results in many suboptimal branch +//! patterns. The lowering also pays no attention to block order, and so +//! two-target conditional forms (cond-br followed by uncond-br) can often by +//! avoided because one of the targets is the fallthrough. There are several +//! cases here where we can simplify to use fewer branches. +//! +//! This "buffer" implements a single-pass code emission strategy (with a later +//! "fixup" pass, but only through recorded fixups, not all instructions). The +//! basic idea is: +//! +//! - Emit branches as they are, including two-target (cond/uncond) compound +//! forms, but with zero offsets and optimistically assuming the target will be +//! in range. Record the "fixup" for later. Targets are denoted instead by +//! symbolic "labels" that are then bound to certain offsets in the buffer as +//! we emit code. (Nominally, there is a label at the start of every basic +//! block.) +//! +//! - As we do this, track the offset in the buffer at which the first label +//! reference "goes out of range". We call this the "deadline". If we reach the +//! deadline and we still have not bound the label to which an unresolved branch +//! refers, we have a problem! +//! +//! - To solve this problem, we emit "islands" full of "veneers". An island is +//! simply a chunk of code inserted in the middle of the code actually produced +//! by the emitter (e.g., vcode iterating over instruction structs). The emitter +//! has some awareness of this: it either asks for an island between blocks, so +//! it is not accidentally executed, or else it emits a branch around the island +//! when all other options fail (see `Inst::EmitIsland` meta-instruction). +//! +//! - A "veneer" is an instruction (or sequence of instructions) in an "island" +//! that implements a longer-range reference to a label. The idea is that, for +//! example, a branch with a limited range can branch to a "veneer" instead, +//! which is simply a branch in a form that can use a longer-range reference. On +//! AArch64, for example, conditionals have a +/- 1 MB range, but a conditional +//! can branch to an unconditional branch which has a +/- 128 MB range. Hence, a +//! conditional branch's label reference can be fixed up with a "veneer" to +//! achieve a longer range. +//! +//! - To implement all of this, we require the backend to provide a `LabelUse` +//! type that implements a trait. This is nominally an enum that records one of +//! several kinds of references to an offset in code -- basically, a relocation +//! type -- and will usually correspond to different instruction formats. The +//! `LabelUse` implementation specifies the maximum range, how to patch in the +//! actual label location when known, and how to generate a veneer to extend the +//! range. +//! +//! That satisfies label references, but we still may have suboptimal branch +//! patterns. To clean up the branches, we do a simple "peephole"-style +//! optimization on the fly. To do so, the emitter (e.g., `Inst::emit()`) +//! informs the buffer of branches in the code and, in the case of conditionals, +//! the code that would have been emitted to invert this branch's condition. We +//! 
track the "latest branches": these are branches that are contiguous up to +//! the current offset. (If any code is emitted after a branch, that branch or +//! run of contiguous branches is no longer "latest".) The latest branches are +//! those that we can edit by simply truncating the buffer and doing something +//! else instead. +//! +//! To optimize branches, we implement several simple rules, and try to apply +//! them to the "latest branches" when possible: +//! +//! - A branch with a label target, when that label is bound to the ending +//! offset of the branch (the fallthrough location), can be removed altogether, +//! because the branch would have no effect). +//! +//! - An unconditional branch that starts at a label location, and branches to +//! another label, results in a "label alias": all references to the label bound +//! *to* this branch instruction are instead resolved to the *target* of the +//! branch instruction. This effectively removes empty blocks that just +//! unconditionally branch to the next block. We call this "branch threading". +//! +//! - A conditional followed by an unconditional, when the conditional branches +//! to the unconditional's fallthrough, results in (i) the truncation of the +//! unconditional, (ii) the inversion of the condition's condition, and (iii) +//! replacement of the conditional's target (using the original target of the +//! unconditional). This is a fancy way of saying "we can flip a two-target +//! conditional branch's taken/not-taken targets if it works better with our +//! fallthrough". To make this work, the emitter actually gives the buffer +//! *both* forms of every conditional branch: the true form is emitted into the +//! buffer, and the "inverted" machine-code bytes are provided as part of the +//! branch-fixup metadata. +//! +//! - An unconditional B preceded by another unconditional P, when B's label(s) have +//! been redirected to target(B), can be removed entirely. This is an extension +//! of the branch-threading optimization, and is valid because if we know there +//! will be no fallthrough into this branch instruction (the prior instruction +//! is an unconditional jump), and if we know we have successfully redirected +//! all labels, then this branch instruction is unreachable. Note that this +//! works because the redirection happens before the label is ever resolved +//! (fixups happen at island emission time, at which point latest-branches are +//! cleared, or at the end of emission), so we are sure to catch and redirect +//! all possible paths to this instruction. +//! +//! # Branch-optimization Correctness +//! +//! The branch-optimization mechanism depends on a few data structures with +//! invariants, which are always held outside the scope of top-level public +//! methods: +//! +//! - The latest-branches list. Each entry describes a span of the buffer +//! (start/end offsets), the label target, the corresponding fixup-list entry +//! index, and the bytes (must be the same length) for the inverted form, if +//! conditional. The list of labels that are bound to the start-offset of this +//! branch is *complete* (if any label has a resolved offset equal to `start` +//! and is not an alias, it must appear in this list) and *precise* (no label +//! in this list can be bound to another offset). No label in this list should +//! be an alias. No two branch ranges can overlap, and branches are in +//! ascending-offset order. +//! +//! - The labels-at-tail list. This contains all MachLabels that have been bound +//! 
to (whose resolved offsets are equal to) the tail offset of the buffer. +//! No label in this list should be an alias. +//! +//! - The label_offsets array, containing the bound offset of a label or +//! UNKNOWN. No label can be bound at an offset greater than the current +//! buffer tail. +//! +//! - The label_aliases array, containing another label to which a label is +//! bound or UNKNOWN. A label's resolved offset is the resolved offset +//! of the label it is aliased to, if this is set. +//! +//! We argue below, at each method, how the invariants in these data structures +//! are maintained (grep for "Post-invariant"). +//! +//! Given these invariants, we argue why each optimization preserves execution +//! semantics below (grep for "Preserves execution semantics"). + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc, StackMap}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; +use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeConstant, VCodeConstants, VCodeInst}; +use crate::timing; +use cranelift_entity::{entity_impl, SecondaryMap}; + +use log::trace; +use smallvec::SmallVec; +use std::mem; +use std::string::String; + +/// A buffer of output to be produced, fixed up, and then emitted to a CodeSink +/// in bulk. +/// +/// This struct uses `SmallVec`s to support small-ish function bodies without +/// any heap allocation. As such, it will be several kilobytes large. This is +/// likely fine as long as it is stack-allocated for function emission then +/// thrown away; but beware if many buffer objects are retained persistently. +pub struct MachBuffer<I: VCodeInst> { + /// The buffer contents, as raw bytes. + data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// Any stack maps referring to this code. + stack_maps: SmallVec<[MachStackMap; 8]>, + /// The current source location in progress (after `start_srcloc()` and + /// before `end_srcloc()`). This is a (start_offset, src_loc) tuple. + cur_srcloc: Option<(CodeOffset, SourceLoc)>, + /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. + label_offsets: SmallVec<[CodeOffset; 16]>, + /// Label aliases: when one label points to an unconditional jump, and that + /// jump points to another label, we can redirect references to the first + /// label immediately to the second. + /// + /// Invariant: we don't have label-alias cycles. We ensure this by, + /// before setting label A to alias label B, resolving B's alias + /// target (iteratively until a non-aliased label); if B is already + /// aliased to A, then we cannot alias A back to B. + label_aliases: SmallVec<[MachLabel; 16]>, + /// Constants that must be emitted at some point. + pending_constants: SmallVec<[MachLabelConstant; 16]>, + /// Fixups that must be performed after all code is emitted. + fixup_records: SmallVec<[MachLabelFixup<I>; 16]>, + /// Current deadline at which all constants are flushed and all code labels + /// are extended by emitting long-range jumps in an island. 
This flush + /// should be rare (e.g., on AArch64, the shortest-range PC-rel references + /// are +/- 1MB for conditional jumps and load-literal instructions), so + /// it's acceptable to track a minimum and flush-all rather than doing more + /// detailed "current minimum" / sort-by-deadline trickery. + island_deadline: CodeOffset, + /// How many bytes are needed in the worst case for an island, given all + /// pending constants and fixups. + island_worst_case_size: CodeOffset, + /// Latest branches, to facilitate in-place editing for better fallthrough + /// behavior and empty-block removal. + latest_branches: SmallVec<[MachBranch; 4]>, + /// All labels at the current offset (emission tail). This is lazily + /// cleared: it is actually accurate as long as the current offset is + /// `labels_at_tail_off`, but if `cur_offset()` has grown larger, it should + /// be considered as empty. + /// + /// For correctness, this *must* be complete (i.e., the vector must contain + /// all labels whose offsets are resolved to the current tail), because we + /// rely on it to update labels when we truncate branches. + labels_at_tail: SmallVec<[MachLabel; 4]>, + /// The last offset at which `labels_at_tail` is valid. It is conceptually + /// always describing the tail of the buffer, but we do not clear + /// `labels_at_tail` eagerly when the tail grows, rather we lazily clear it + /// when the offset has grown past this (`labels_at_tail_off`) point. + /// Always <= `cur_offset()`. + labels_at_tail_off: CodeOffset, + /// Map used constants to their [MachLabel]. + constant_labels: SecondaryMap<VCodeConstant, MachLabel>, +} + +/// A `MachBuffer` once emission is completed: holds generated code and records, +/// without fixups. This allows the type to be independent of the backend. +pub struct MachBufferFinalized { + /// The buffer contents, as raw bytes. + pub data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// Any stack maps referring to this code. + stack_maps: SmallVec<[MachStackMap; 8]>, +} + +static UNKNOWN_LABEL_OFFSET: CodeOffset = 0xffff_ffff; +static UNKNOWN_LABEL: MachLabel = MachLabel(0xffff_ffff); + +/// A label refers to some offset in a `MachBuffer`. It may not be resolved at +/// the point at which it is used by emitted code; the buffer records "fixups" +/// for references to the label, and will come back and patch the code +/// appropriately when the label's location is eventually known. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MachLabel(u32); +entity_impl!(MachLabel); + +impl MachLabel { + /// Get a label for a block. (The first N MachLabels are always reseved for + /// the N blocks in the vcode.) + pub fn from_block(bindex: BlockIndex) -> MachLabel { + MachLabel(bindex) + } + + /// Get the numeric label index. + pub fn get(self) -> u32 { + self.0 + } + + /// Creates a string representing this label, for convenience. 
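+ ///
+ /// For example (purely illustrative), `MachLabel::from_block(3).to_string()`
+ /// produces `"label3"`.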
+ pub fn to_string(&self) -> String { + format!("label{}", self.0) + } +} + +impl Default for MachLabel { + fn default() -> Self { + UNKNOWN_LABEL + } +} + +/// A stack map extent, when creating a stack map. +pub enum StackMapExtent { + /// The stack map starts at this instruction, and ends after the number of upcoming bytes + /// (note: this is a code offset diff). + UpcomingBytes(CodeOffset), + + /// The stack map started at the given offset and ends at the current one. This helps + /// architectures where the instruction size has not a fixed length. + StartedAtOffset(CodeOffset), +} + +impl<I: VCodeInst> MachBuffer<I> { + /// Create a new section, known to start at `start_offset` and with a size limited to + /// `length_limit`. + pub fn new() -> MachBuffer<I> { + MachBuffer { + data: SmallVec::new(), + relocs: SmallVec::new(), + traps: SmallVec::new(), + call_sites: SmallVec::new(), + srclocs: SmallVec::new(), + stack_maps: SmallVec::new(), + cur_srcloc: None, + label_offsets: SmallVec::new(), + label_aliases: SmallVec::new(), + pending_constants: SmallVec::new(), + fixup_records: SmallVec::new(), + island_deadline: UNKNOWN_LABEL_OFFSET, + island_worst_case_size: 0, + latest_branches: SmallVec::new(), + labels_at_tail: SmallVec::new(), + labels_at_tail_off: 0, + constant_labels: SecondaryMap::new(), + } + } + + /// Debug-only: check invariants of labels and branch-records described + /// under "Branch-optimization Correctness" above. + #[cfg(debug)] + fn check_label_branch_invariants(&self) { + let cur_off = self.cur_offset(); + // Check that every entry in latest_branches has *correct* + // labels_at_this_branch lists. We do not check completeness because + // that would require building a reverse index, which is too slow even + // for a debug invariant check. + let mut last_end = 0; + for b in &self.latest_branches { + debug_assert!(b.start < b.end); + debug_assert!(b.end <= cur_off); + debug_assert!(b.start >= last_end); + last_end = b.end; + for &l in &b.labels_at_this_branch { + debug_assert_eq!(self.resolve_label_offset(l), b.start); + debug_assert_eq!(self.label_aliases[l.0 as usize], UNKNOWN_LABEL); + } + } + + // Check that every label is unresolved, or resolved at or before + // cur_offset. If at cur_offset, must be in `labels_at_tail`. + for (i, &off) in self.label_offsets.iter().enumerate() { + let label = MachLabel(i as u32); + debug_assert!(off == UNKNOWN_LABEL_OFFSET || off <= cur_off); + if off == cur_off { + debug_assert!( + self.labels_at_tail_off == cur_off && self.labels_at_tail.contains(&label) + ); + } + } + + // Check that every label in `labels_at_tail_off` is precise, i.e., + // resolves to the cur offset. + debug_assert!(self.labels_at_tail_off <= cur_off); + if self.labels_at_tail_off == cur_off { + for &l in &self.labels_at_tail { + debug_assert_eq!(self.resolve_label_offset(l), cur_off); + debug_assert_eq!(self.label_aliases[l.0 as usize], UNKNOWN_LABEL); + } + } + } + + #[cfg(not(debug))] + fn check_label_branch_invariants(&self) { + // Nothing. + } + + /// Current offset from start of buffer. + pub fn cur_offset(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Add a byte. + pub fn put1(&mut self, value: u8) { + trace!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + self.data.push(value); + + // Post-invariant: conceptual-labels_at_tail contains a complete and + // precise list of labels bound at `cur_offset()`. 
We have advanced + // `cur_offset()`, hence if it had been equal to `labels_at_tail_off` + // before, it is not anymore (and it cannot become equal, because + // `labels_at_tail_off` is always <= `cur_offset()`). Thus the list is + // conceptually empty (even though it is only lazily cleared). No labels + // can be bound at this new offset (by invariant on `label_offsets`). + // Hence the invariant holds. + } + + /// Add 2 bytes. + pub fn put2(&mut self, value: u16) { + trace!( + "MachBuffer: put 16-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add 4 bytes. + pub fn put4(&mut self, value: u32) { + trace!( + "MachBuffer: put 32-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add 8 bytes. + pub fn put8(&mut self, value: u64) { + trace!( + "MachBuffer: put 64-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add a slice of bytes. + pub fn put_data(&mut self, data: &[u8]) { + trace!( + "MachBuffer: put data @ {}: len {}", + self.cur_offset(), + data.len() + ); + self.data.extend_from_slice(data); + + // Post-invariant: as for `put1()`. + } + + /// Reserve appended space and return a mutable slice referring to it. + pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { + trace!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + let off = self.data.len(); + let new_len = self.data.len() + len; + self.data.resize(new_len, 0); + &mut self.data[off..] + + // Post-invariant: as for `put1()`. + } + + /// Align up to the given alignment. + pub fn align_to(&mut self, align_to: CodeOffset) { + trace!("MachBuffer: align to {}", align_to); + assert!(align_to.is_power_of_two()); + while self.cur_offset() & (align_to - 1) != 0 { + self.put1(0); + } + + // Post-invariant: as for `put1()`. + } + + /// Allocate a `Label` to refer to some offset. May not be bound to a fixed + /// offset yet. + pub fn get_label(&mut self) -> MachLabel { + let l = self.label_offsets.len() as u32; + self.label_offsets.push(UNKNOWN_LABEL_OFFSET); + self.label_aliases.push(UNKNOWN_LABEL); + trace!("MachBuffer: new label -> {:?}", MachLabel(l)); + MachLabel(l) + + // Post-invariant: the only mutation is to add a new label; it has no + // bound offset yet, so it trivially satisfies all invariants. + } + + /// Reserve the first N MachLabels for blocks. + pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { + trace!("MachBuffer: first {} labels are for blocks", blocks); + debug_assert!(self.label_offsets.is_empty()); + self.label_offsets + .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); + self.label_aliases.resize(blocks as usize, UNKNOWN_LABEL); + + // Post-invariant: as for `get_label()`. + } + + /// Reserve the next N MachLabels for constants. + pub fn reserve_labels_for_constants(&mut self, constants: &VCodeConstants) { + trace!( + "MachBuffer: next {} labels are for constants", + constants.len() + ); + for c in constants.keys() { + self.constant_labels[c] = self.get_label(); + } + + // Post-invariant: as for `get_label()`. + } + + /// Retrieve the reserved label for a constant. 
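+ ///
+ /// A hedged usage sketch (illustrative only; `pcrel_kind` stands for some
+ /// backend-specific `I::LabelUse` value and `placeholder_word` for the
+ /// not-yet-patched instruction bytes; neither is defined in this module):
+ ///
+ /// ```ignore
+ /// let label = buffer.get_label_for_constant(constant);
+ /// // Record a reference to the constant's eventual location; the fixup is
+ /// // patched once the constant is emitted in an island or at finish time.
+ /// buffer.use_label_at_offset(buffer.cur_offset(), label, pcrel_kind);
+ /// buffer.put4(placeholder_word);
+ /// ```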
+ pub fn get_label_for_constant(&self, constant: VCodeConstant) -> MachLabel { + self.constant_labels[constant] + } + + /// Bind a label to the current offset. A label can only be bound once. + pub fn bind_label(&mut self, label: MachLabel) { + trace!( + "MachBuffer: bind label {:?} at offset {}", + label, + self.cur_offset() + ); + debug_assert_eq!(self.label_offsets[label.0 as usize], UNKNOWN_LABEL_OFFSET); + debug_assert_eq!(self.label_aliases[label.0 as usize], UNKNOWN_LABEL); + let offset = self.cur_offset(); + self.label_offsets[label.0 as usize] = offset; + self.lazily_clear_labels_at_tail(); + self.labels_at_tail.push(label); + + // Invariants hold: bound offset of label is <= cur_offset (in fact it + // is equal). If the `labels_at_tail` list was complete and precise + // before, it is still, because we have bound this label to the current + // offset and added it to the list (which contains all labels at the + // current offset). + + self.check_label_branch_invariants(); + self.optimize_branches(); + + // Post-invariant: by `optimize_branches()` (see argument there). + self.check_label_branch_invariants(); + } + + /// Lazily clear `labels_at_tail` if the tail offset has moved beyond the + /// offset that it applies to. + fn lazily_clear_labels_at_tail(&mut self) { + let offset = self.cur_offset(); + if offset > self.labels_at_tail_off { + self.labels_at_tail_off = offset; + self.labels_at_tail.clear(); + } + + // Post-invariant: either labels_at_tail_off was at cur_offset, and + // state is untouched, or was less than cur_offset, in which case the + // labels_at_tail list was conceptually empty, and is now actually + // empty. + } + + /// Resolve a label to an offset, if known. May return `UNKNOWN_LABEL_OFFSET`. + fn resolve_label_offset(&self, mut label: MachLabel) -> CodeOffset { + let mut iters = 0; + while self.label_aliases[label.0 as usize] != UNKNOWN_LABEL { + label = self.label_aliases[label.0 as usize]; + // To protect against an infinite loop (despite our assurances to + // ourselves that the invariants make this impossible), assert out + // after 1M iterations. The number of basic blocks is limited + // in most contexts anyway so this should be impossible to hit with + // a legitimate input. + iters += 1; + assert!(iters < 1_000_000, "Unexpected cycle in label aliases"); + } + self.label_offsets[label.0 as usize] + + // Post-invariant: no mutations. + } + + /// Emit a reference to the given label with the given reference type (i.e., + /// branch-instruction format) at the current offset. This is like a + /// relocation, but handled internally. + /// + /// This can be called before the branch is actually emitted; fixups will + /// not happen until an island is emitted or the buffer is finished. + pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { + trace!( + "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", + offset, + label, + kind + ); + + // Add the fixup, and update the worst-case island size based on a + // veneer for this label use. + self.fixup_records.push(MachLabelFixup { + label, + offset, + kind, + }); + if kind.supports_veneer() { + self.island_worst_case_size += kind.veneer_size(); + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + } + let deadline = offset + kind.max_pos_range(); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + + // Post-invariant: no mutations to branches/labels data structures. 
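+
+ // Illustrative note (numbers are examples only, not derived from any
+ // particular ISA definition in this file): with a label use whose
+ // `max_pos_range()` is 1 MB, a reference recorded at offset 0x1_0000
+ // yields `deadline = 0x1_0000 + 0x10_0000 = 0x11_0000`; if that is
+ // smaller than the current `island_deadline`, the deadline is tightened
+ // so that `island_needed()` will request an island (and, if necessary,
+ // a veneer) before the reference can go out of range.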
+ self.check_label_branch_invariants(); + } + + /// Inform the buffer of an unconditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + /// This must be called when the current offset is equal to `start`; i.e., + /// before actually emitting the branch. This implies that for a branch that + /// uses a label and is eligible for optimizations by the MachBuffer, the + /// proper sequence is: + /// + /// - Call `use_label_at_offset()` to emit the fixup record. + /// - Call `add_uncond_branch()` to make note of the branch. + /// - Emit the bytes for the branch's machine code. + /// + /// Additional requirement: no labels may be bound between `start` and `end` + /// (exclusive on both ends). + pub fn add_uncond_branch(&mut self, start: CodeOffset, end: CodeOffset, target: MachLabel) { + assert!(self.cur_offset() == start); + debug_assert!(end > start); + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + self.lazily_clear_labels_at_tail(); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted: None, + labels_at_this_branch: self.labels_at_tail.clone(), + }); + + // Post-invariant: we asserted branch start is current tail; the list of + // labels at branch is cloned from list of labels at current tail. + self.check_label_branch_invariants(); + } + + /// Inform the buffer of a conditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + /// + /// Additional requirement: no labels may be bound between `start` and `end` + /// (exclusive on both ends). + pub fn add_cond_branch( + &mut self, + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + inverted: &[u8], + ) { + assert!(self.cur_offset() == start); + debug_assert!(end > start); + assert!(!self.fixup_records.is_empty()); + debug_assert!(inverted.len() == (end - start) as usize); + let fixup = self.fixup_records.len() - 1; + let inverted = Some(SmallVec::from(inverted)); + self.lazily_clear_labels_at_tail(); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted, + labels_at_this_branch: self.labels_at_tail.clone(), + }); + + // Post-invariant: we asserted branch start is current tail; labels at + // branch list is cloned from list of labels at current tail. + self.check_label_branch_invariants(); + } + + fn truncate_last_branch(&mut self) { + self.lazily_clear_labels_at_tail(); + // Invariants hold at this point. + + let b = self.latest_branches.pop().unwrap(); + assert!(b.end == self.cur_offset()); + + // State: + // [PRE CODE] + // Offset b.start, b.labels_at_this_branch: + // [BRANCH CODE] + // cur_off, self.labels_at_tail --> + // (end of buffer) + self.data.truncate(b.start as usize); + self.fixup_records.truncate(b.fixup); + // State: + // [PRE CODE] + // cur_off, Offset b.start, b.labels_at_this_branch: + // (end of buffer) + // + // self.labels_at_tail --> (past end of buffer) + let cur_off = self.cur_offset(); + self.labels_at_tail_off = cur_off; + // State: + // [PRE CODE] + // cur_off, Offset b.start, b.labels_at_this_branch, + // self.labels_at_tail: + // (end of buffer) + // + // resolve_label_offset(l) for l in labels_at_tail: + // (past end of buffer) + + trace!( + "truncate_last_branch: truncated {:?}; off now {}", + b, + cur_off + ); + + // Fix up resolved label offsets for labels at tail. 
+ for &l in &self.labels_at_tail { + self.label_offsets[l.0 as usize] = cur_off; + } + // Old labels_at_this_branch are now at cur_off. + self.labels_at_tail + .extend(b.labels_at_this_branch.into_iter()); + + // Post-invariant: this operation is defined to truncate the buffer, + // which moves cur_off backward, and to move labels at the end of the + // buffer back to the start-of-branch offset. + // + // latest_branches satisfies all invariants: + // - it has no branches past the end of the buffer (branches are in + // order, we removed the last one, and we truncated the buffer to just + // before the start of that branch) + // - no labels were moved to lower offsets than the (new) cur_off, so + // the labels_at_this_branch list for any other branch need not change. + // + // labels_at_tail satisfies all invariants: + // - all labels that were at the tail after the truncated branch are + // moved backward to just before the branch, which becomes the new tail; + // thus every element in the list should remain (ensured by `.extend()` + // above). + // - all labels that refer to the new tail, which is the start-offset of + // the truncated branch, must be present. The `labels_at_this_branch` + // list in the truncated branch's record is a complete and precise list + // of exactly these labels; we append these to labels_at_tail. + // - labels_at_tail_off is at cur_off after truncation occurs, so the + // list is valid (not to be lazily cleared). + // + // The stated operation was performed: + // - For each label at the end of the buffer prior to this method, it + // now resolves to the new (truncated) end of the buffer: it must have + // been in `labels_at_tail` (this list is precise and complete, and + // the tail was at the end of the truncated branch on entry), and we + // iterate over this list and set `label_offsets` to the new tail. + // None of these labels could have been an alias (by invariant), so + // `label_offsets` is authoritative for each. + // - No other labels will be past the end of the buffer, because of the + // requirement that no labels be bound to the middle of branch ranges + // (see comments to `add_{cond,uncond}_branch()`). + // - The buffer is truncated to just before the last branch, and the + // fixup record referring to that last branch is removed. + self.check_label_branch_invariants(); + } + + fn optimize_branches(&mut self) { + self.lazily_clear_labels_at_tail(); + // Invariants valid at this point. + + trace!( + "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, + self.labels_at_tail, + self.fixup_records + ); + + // We continue to munch on branches at the tail of the buffer until no + // more rules apply. Note that the loop only continues if a branch is + // actually truncated (or if labels are redirected away from a branch), + // so this always makes progress. + while let Some(b) = self.latest_branches.last() { + let cur_off = self.cur_offset(); + trace!("optimize_branches: last branch {:?} at off {}", b, cur_off); + // If there has been any code emission since the end of the last branch or + // label definition, then there's nothing we can edit (because we + // don't move code once placed, only back up and overwrite), so + // clear the records and finish. + if b.end < cur_off { + break; + } + + // Invariant: we are looking at a branch that ends at the tail of + // the buffer. 
+
+ // For any branch, conditional or unconditional:
+ // - If the target is a label at the current offset, then remove
+ // the branch, and reset all labels that targeted
+ // the current offset (end of branch) to the truncated
+ // end-of-code.
+ //
+ // Preserves execution semantics: a branch to its own fallthrough
+ // address is equivalent to a no-op; in both cases, nextPC is the
+ // fallthrough.
+ if self.resolve_label_offset(b.target) == cur_off {
+ trace!("branch with target == cur off; truncating");
+ self.truncate_last_branch();
+ continue;
+ }
+
+ // If latest is an unconditional branch:
+ //
+ // - If the branch's target is not its own start address, then for
+ // each label at the start of the branch, make the label an alias of the
+ // branch target, and remove the label from the "labels at this
+ // branch" list.
+ //
+ // - Preserves execution semantics: an unconditional branch's
+ // only effect is to set PC to a new PC; this change simply
+ // collapses one step in the step-semantics.
+ //
+ // - Post-invariant: the labels that were bound to the start of
+ // this branch become aliases, so they must not be present in any
+ // labels-at-this-branch list or the labels-at-tail list. The
+ // labels are removed from the latest-branch record's
+ // labels-at-this-branch list, and are never placed in the
+ // labels-at-tail list. Furthermore, it is correct that they are
+ // not in either list, because they are now aliases, and labels
+ // that are aliases remain aliases forever.
+ //
+ // - If there is a prior unconditional branch that ends just before
+ // this one begins, and this branch has no labels bound to its
+ // start, then we can truncate this branch, because it is entirely
+ // unreachable (we have redirected all labels that make it
+ // reachable otherwise). Do so and continue around the loop.
+ //
+ // - Preserves execution semantics: the branch is unreachable,
+ // because execution can only flow into an instruction from the
+ // prior instruction's fallthrough or from a branch bound to that
+ // instruction's start offset. Unconditional branches have no
+ // fallthrough, so if the prior instruction is an unconditional
+ // branch, no fallthrough entry can happen. The
+ // labels-at-this-branch list is complete (by invariant), so if it
+ // is empty, then the instruction is entirely unreachable. Thus,
+ // it can be removed.
+ //
+ // - Post-invariant: ensured by truncate_last_branch().
+ //
+ // - If there is a prior conditional branch whose target label
+ // resolves to the current offset (branches around the
+ // unconditional branch), then remove the unconditional branch,
+ // and make the target of the unconditional the target of the
+ // conditional instead.
+ //
+ // - Preserves execution semantics: previously we had:
+ //
+ // L1:
+ // cond_br L2
+ // br L3
+ // L2:
+ // (end of buffer)
+ //
+ // by removing the last branch, we have:
+ //
+ // L1:
+ // cond_br L2
+ // L2:
+ // (end of buffer)
+ //
+ // we then fix up the records for the conditional branch to
+ // have:
+ //
+ // L1:
+ // cond_br.inverted L3
+ // L2:
+ //
+ // In the original code, control flow reaches L2 when the
+ // conditional branch's predicate is true, and L3 otherwise. In
+ // the optimized code, the same is true.
+ //
+ // - Post-invariant: all edits to latest_branches and
+ // labels_at_tail are performed by `truncate_last_branch()`,
+ // which maintains the invariants at each step.
+ + if b.is_uncond() { + // Set any label equal to current branch's start as an alias of + // the branch's target, if the target is not the branch itself + // (i.e., an infinite loop). + // + // We cannot perform this aliasing if the target of this branch + // ultimately aliases back here; if so, we need to keep this + // branch, so break out of this loop entirely (and clear the + // latest-branches list below). + // + // Note that this check is what prevents cycles from forming in + // `self.label_aliases`. To see why, consider an arbitrary start + // state: + // + // label_aliases[L1] = L2, label_aliases[L2] = L3, ..., up to + // Ln, which is not aliased. + // + // We would create a cycle if we assigned label_aliases[Ln] + // = L1. Note that the below assignment is the only write + // to label_aliases. + // + // By our other invariants, we have that Ln (`l` below) + // resolves to the offset `b.start`, because it is in the + // set `b.labels_at_this_branch`. + // + // If L1 were already aliased, through some arbitrarily deep + // chain, to Ln, then it must also resolve to this offset + // `b.start`. + // + // By checking the resolution of `L1` against this offset, + // and aborting this branch-simplification if they are + // equal, we prevent the below assignment from ever creating + // a cycle. + if self.resolve_label_offset(b.target) != b.start { + let redirected = b.labels_at_this_branch.len(); + for &l in &b.labels_at_this_branch { + trace!( + " -> label at start of branch {:?} redirected to target {:?}", + l, + b.target + ); + self.label_aliases[l.0 as usize] = b.target; + // NOTE: we continue to ensure the invariant that labels + // pointing to tail of buffer are in `labels_at_tail` + // because we already ensured above that the last branch + // cannot have a target of `cur_off`; so we never have + // to put the label into `labels_at_tail` when moving it + // here. + } + // Maintain invariant: all branches have been redirected + // and are no longer pointing at the start of this branch. + let mut_b = self.latest_branches.last_mut().unwrap(); + mut_b.labels_at_this_branch.clear(); + + if redirected > 0 { + trace!(" -> after label redirects, restarting loop"); + continue; + } + } else { + break; + } + + let b = self.latest_branches.last().unwrap(); + + // Examine any immediately preceding branch. + if self.latest_branches.len() > 1 { + let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; + trace!(" -> more than one branch; prev_b = {:?}", prev_b); + // This uncond is immediately after another uncond; we + // should have already redirected labels to this uncond away + // (but check to be sure); so we can truncate this uncond. + if prev_b.is_uncond() + && prev_b.end == b.start + && b.labels_at_this_branch.is_empty() + { + trace!(" -> uncond follows another uncond; truncating"); + self.truncate_last_branch(); + continue; + } + + // This uncond is immediately after a conditional, and the + // conditional's target is the end of this uncond, and we've + // already redirected labels to this uncond away; so we can + // truncate this uncond, flip the sense of the conditional, and + // set the conditional's target (in `latest_branches` and in + // `fixup_records`) to the uncond's target. 
+ if prev_b.is_cond() + && prev_b.end == b.start + && self.resolve_label_offset(prev_b.target) == cur_off + { + trace!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + // Save the target of the uncond (this becomes the + // target of the cond), and truncate the uncond. + let target = b.target; + let data = prev_b.inverted.clone().unwrap(); + self.truncate_last_branch(); + + // Mutate the code and cond branch. + let off_before_edit = self.cur_offset(); + let prev_b = self.latest_branches.last_mut().unwrap(); + let not_inverted = SmallVec::from( + &self.data[(prev_b.start as usize)..(prev_b.end as usize)], + ); + + // Low-level edit: replaces bytes of branch with + // inverted form. cur_off remains the same afterward, so + // we do not need to modify label data structures. + self.data.truncate(prev_b.start as usize); + self.data.extend_from_slice(&data[..]); + + // Save the original code as the inversion of the + // inverted branch, in case we later edit this branch + // again. + prev_b.inverted = Some(not_inverted); + self.fixup_records[prev_b.fixup].label = target; + trace!(" -> reassigning target of condbr to {:?}", target); + prev_b.target = target; + debug_assert_eq!(off_before_edit, self.cur_offset()); + continue; + } + } + } + + // If we couldn't do anything with the last branch, then break. + break; + } + + self.purge_latest_branches(); + + trace!( + "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, + self.labels_at_tail, + self.fixup_records + ); + } + + fn purge_latest_branches(&mut self) { + // All of our branch simplification rules work only if a branch ends at + // the tail of the buffer, with no following code; and branches are in + // order in latest_branches; so if the last entry ends prior to + // cur_offset, then clear all entries. + let cur_off = self.cur_offset(); + if let Some(l) = self.latest_branches.last() { + if l.end < cur_off { + trace!("purge_latest_branches: removing branch {:?}", l); + self.latest_branches.clear(); + } + } + + // Post-invariant: no invariant requires any branch to appear in + // `latest_branches`; it is always optional. The list-clear above thus + // preserves all semantics. + } + + /// Emit a constant at some point in the future, binding the given label to + /// its offset. The constant will be placed at most `max_distance` from the + /// current offset. + pub fn defer_constant( + &mut self, + label: MachLabel, + align: CodeOffset, + data: &[u8], + max_distance: CodeOffset, + ) { + trace!( + "defer_constant: eventually emit {} bytes aligned to {} at label {:?}", + data.len(), + align, + label + ); + let deadline = self.cur_offset().saturating_add(max_distance); + self.island_worst_case_size += data.len() as CodeOffset; + self.island_worst_case_size = + (self.island_worst_case_size + I::LabelUse::ALIGN - 1) & !(I::LabelUse::ALIGN - 1); + self.pending_constants.push(MachLabelConstant { + label, + align, + data: SmallVec::from(data), + }); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Is an island needed within the next N bytes? + pub fn island_needed(&self, distance: CodeOffset) -> bool { + let worst_case_end_of_island = self.cur_offset() + distance + self.island_worst_case_size; + worst_case_end_of_island > self.island_deadline + } + + /// Emit all pending constants and veneers. 
Should only be called if + /// `island_needed()` returns true, i.e., if we actually reach a deadline: + /// otherwise, unnecessary veneers may be inserted. + pub fn emit_island(&mut self) { + // We're going to purge fixups, so no latest-branch editing can happen + // anymore. + self.latest_branches.clear(); + + let pending_constants = mem::replace(&mut self.pending_constants, SmallVec::new()); + for MachLabelConstant { label, align, data } in pending_constants.into_iter() { + self.align_to(align); + self.bind_label(label); + self.put_data(&data[..]); + } + + let fixup_records = mem::replace(&mut self.fixup_records, SmallVec::new()); + let mut new_fixups = SmallVec::new(); + for MachLabelFixup { + label, + offset, + kind, + } in fixup_records.into_iter() + { + trace!( + "emit_island: fixup for label {:?} at offset {} kind {:?}", + label, + offset, + kind + ); + // We eagerly perform fixups whose label targets are known, if not out + // of range, to avoid unnecessary veneers. + let label_offset = self.resolve_label_offset(label); + let known = label_offset != UNKNOWN_LABEL_OFFSET; + let in_range = if known { + if label_offset >= offset { + (label_offset - offset) <= kind.max_pos_range() + } else { + (offset - label_offset) <= kind.max_neg_range() + } + } else { + false + }; + + trace!( + " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", + label_offset, + known, + in_range, + kind.max_pos_range(), + kind.max_neg_range() + ); + + let start = offset as usize; + let end = (offset + kind.patch_size()) as usize; + if in_range { + debug_assert!(known); // implied by in_range. + let slice = &mut self.data[start..end]; + trace!("patching in-range!"); + kind.patch(slice, offset, label_offset); + } else if !known && !kind.supports_veneer() { + // Nothing for now. Keep it for next round. + new_fixups.push(MachLabelFixup { + label, + offset, + kind, + }); + } else if !in_range && kind.supports_veneer() { + // Allocate space for a veneer in the island. + self.align_to(I::LabelUse::ALIGN); + let veneer_offset = self.cur_offset(); + trace!("making a veneer at {}", veneer_offset); + let slice = &mut self.data[start..end]; + // Patch the original label use to refer to the veneer. + trace!( + "patching original at offset {} to veneer offset {}", + offset, + veneer_offset + ); + kind.patch(slice, offset, veneer_offset); + // Generate the veneer. + let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); + let (veneer_fixup_off, veneer_label_use) = + kind.generate_veneer(veneer_slice, veneer_offset); + trace!( + "generated veneer; fixup offset {}, label_use {:?}", + veneer_fixup_off, + veneer_label_use + ); + // If the label is known (but was just out of range), do the + // veneer label-use fixup now too; otherwise, save it for later. + if known { + let start = veneer_fixup_off as usize; + let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; + let veneer_slice = &mut self.data[start..end]; + trace!("doing veneer fixup right away too"); + veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); + } else { + new_fixups.push(MachLabelFixup { + label, + offset: veneer_fixup_off, + kind: veneer_label_use, + }); + } + } else { + panic!( + "Cannot support label-use {:?} (known = {}, in-range = {})", + kind, known, in_range + ); + } + } + + self.fixup_records = new_fixups; + self.island_deadline = UNKNOWN_LABEL_OFFSET; + } + + /// Finish any deferred emissions and/or fixups. 
+ pub fn finish(mut self) -> MachBufferFinalized { + let _tt = timing::vcode_emit_finish(); + + while !self.pending_constants.is_empty() || !self.fixup_records.is_empty() { + // `emit_island()` will emit any pending veneers and constants, and + // as a side-effect, will also take care of any fixups with resolved + // labels eagerly. + self.emit_island(); + } + + // Ensure that all labels have been fixed up after the last island is emitted. This is a + // full (release-mode) assert because an unresolved label means the emitted code is + // incorrect. + assert!(self.fixup_records.is_empty()); + + MachBufferFinalized { + data: self.data, + relocs: self.relocs, + traps: self.traps, + call_sites: self.call_sites, + srclocs: self.srclocs, + stack_maps: self.stack_maps, + } + } + + /// Add an external relocation at the current offset. + pub fn add_reloc( + &mut self, + srcloc: SourceLoc, + kind: Reloc, + name: &ExternalName, + addend: Addend, + ) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + /// Add a trap record at the current offset. + pub fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + }); + } + + /// Add a call-site record at the current offset. + pub fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } + + /// Set the `SourceLoc` for code from this offset until the offset at the + /// next call to `end_srcloc()`. + pub fn start_srcloc(&mut self, loc: SourceLoc) { + self.cur_srcloc = Some((self.cur_offset(), loc)); + } + + /// Mark the end of the `SourceLoc` segment started at the last + /// `start_srcloc()` call. + pub fn end_srcloc(&mut self) { + let (start, loc) = self + .cur_srcloc + .take() + .expect("end_srcloc() called without start_srcloc()"); + let end = self.cur_offset(); + // Skip zero-length extends. + debug_assert!(end >= start); + if end > start { + self.srclocs.push(MachSrcLoc { start, end, loc }); + } + } + + /// Add stack map metadata for this program point: a set of stack offsets + /// (from SP upward) that contain live references. + /// + /// The `offset_to_fp` value is the offset from the nominal SP (at which the `stack_offsets` + /// are based) and the FP value. By subtracting `offset_to_fp` from each `stack_offsets` + /// element, one can obtain live-reference offsets from FP instead. + pub fn add_stack_map(&mut self, extent: StackMapExtent, stack_map: StackMap) { + let (start, end) = match extent { + StackMapExtent::UpcomingBytes(insn_len) => { + let start_offset = self.cur_offset(); + (start_offset, start_offset + insn_len) + } + StackMapExtent::StartedAtOffset(start_offset) => { + let end_offset = self.cur_offset(); + debug_assert!(end_offset >= start_offset); + (start_offset, end_offset) + } + }; + self.stack_maps.push(MachStackMap { + offset: start, + offset_end: end, + stack_map, + }); + } +} + +impl MachBufferFinalized { + /// Get a list of source location mapping tuples in sorted-by-start-offset order. + pub fn get_srclocs_sorted(&self) -> &[MachSrcLoc] { + &self.srclocs[..] + } + + /// Get the total required size for the code. + pub fn total_size(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Emit this buffer to the given CodeSink. 
+ pub fn emit<CS: CodeSink>(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. + + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } + + /// Get the stack map metadata for this code. + pub fn stack_maps(&self) -> &[MachStackMap] { + &self.stack_maps[..] + } +} + +/// A constant that is deferred to the next constant-pool opportunity. +struct MachLabelConstant { + /// This label will refer to the constant's offset. + label: MachLabel, + /// Required alignment. + align: CodeOffset, + /// This data will be emitted when able. + data: SmallVec<[u8; 16]>, +} + +/// A fixup to perform on the buffer once code is emitted. Fixups always refer +/// to labels and patch the code based on label offsets. Hence, they are like +/// relocations, but internal to one buffer. +#[derive(Debug)] +struct MachLabelFixup<I: VCodeInst> { + /// The label whose offset controls this fixup. + label: MachLabel, + /// The offset to fix up / patch to refer to this label. + offset: CodeOffset, + /// The kind of fixup. This is architecture-specific; each architecture may have, + /// e.g., several types of branch instructions, each with differently-sized + /// offset fields and different places within the instruction to place the + /// bits. + kind: I::LabelUse, +} + +/// A relocation resulting from a compilation. +struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The kind of relocation. + kind: Reloc, + /// The external symbol / name to which this relocation refers. + name: ExternalName, + /// The addend to add to the symbol value. + addend: i64, +} + +/// A trap record resulting from a compilation. +struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The trap code. + code: TrapCode, +} + +/// A call site record resulting from a compilation. 
+struct MachCallSite {
+    /// The offset of the call's return address, *relative to the containing section*.
+    ret_addr: CodeOffset,
+    /// The original source location.
+    srcloc: SourceLoc,
+    /// The call's opcode.
+    opcode: Opcode,
+}
+
+/// A source-location mapping resulting from a compilation.
+#[derive(Clone, Debug)]
+pub struct MachSrcLoc {
+    /// The start of the region of code corresponding to a source location.
+    /// This is relative to the start of the function, not to the start of the
+    /// section.
+    pub start: CodeOffset,
+    /// The end of the region of code corresponding to a source location.
+    /// This is relative to the start of the function, not to the start of the
+    /// section.
+    pub end: CodeOffset,
+    /// The source location.
+    pub loc: SourceLoc,
+}
+
+/// Record of stack map metadata: stack offsets containing references.
+#[derive(Clone, Debug)]
+pub struct MachStackMap {
+    /// The code offset at which this stack map applies.
+    pub offset: CodeOffset,
+    /// The code offset just past the "end" of the instruction: that is, the
+    /// offset of the first byte of the following instruction, or equivalently,
+    /// the start offset plus the instruction length.
+    pub offset_end: CodeOffset,
+    /// The stack map itself.
+    pub stack_map: StackMap,
+}
+
+/// Record of a branch instruction in the buffer, to facilitate editing.
+#[derive(Clone, Debug)]
+struct MachBranch {
+    /// The buffer offset at which the branch instruction starts.
+    start: CodeOffset,
+    /// The buffer offset just past the end of the branch instruction.
+    end: CodeOffset,
+    /// The label this branch targets.
+    target: MachLabel,
+    /// Index of this branch's entry in `fixup_records`.
+    fixup: usize,
+    /// For a conditional branch, the machine-code bytes of the same branch
+    /// with its condition inverted; `None` for an unconditional branch.
+    inverted: Option<SmallVec<[u8; 8]>>,
+    /// All labels pointing to the start of this branch. For correctness, this
+    /// *must* be complete (i.e., must contain all labels whose resolved offsets
+    /// are at the start of this branch): we rely on being able to redirect all
+    /// labels that could jump to this branch before removing it, if it is
+    /// otherwise unreachable.
+    labels_at_this_branch: SmallVec<[MachLabel; 4]>,
+}
+
+impl MachBranch {
+    fn is_cond(&self) -> bool {
+        self.inverted.is_some()
+    }
+    fn is_uncond(&self) -> bool {
+        self.inverted.is_none()
+    }
+}
+
+// We use an actual instruction definition to do tests, so we depend on the `arm64` feature here.
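+//
+// A rough sketch of how to run just these tests (assuming a standalone
+// checkout of the crate rather than this vendored copy, and assuming the
+// module path is `machinst::buffer`):
+//
+//     cargo test -p cranelift-codegen --features arm64 machinst::buffer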
+#[cfg(all(test, feature = "arm64"))] +mod test { + use super::*; + use crate::isa::aarch64::inst::xreg; + use crate::isa::aarch64::inst::{BranchTarget, CondBrKind, EmitInfo, Inst}; + use crate::machinst::MachInstEmit; + use crate::settings; + use std::default::Default; + + fn label(n: u32) -> MachLabel { + MachLabel::from_block(n) + } + fn target(n: u32) -> BranchTarget { + BranchTarget::Label(label(n)) + } + + #[test] + fn test_elide_jump_to_next() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(2); + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + buf.bind_label(label(1)); + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_elide_trivial_jump_blocks() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_flip_cond() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Udf { + trap_code: TrapCode::Interrupt, + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::TrapIf { + kind: CondBrKind::NotZero(xreg(0)), + trap_code: TrapCode::Interrupt, + }; + inst.emit(&mut buf2, &info, &mut state); + let inst = Inst::Nop4; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(buf.data, buf2.data); + } + + #[test] + fn test_island() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(2), + not_taken: target(3), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + while buf.cur_offset() < 2000000 { + if buf.island_needed(0) { + buf.emit_island(); + } + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + } + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 8, buf.total_size()); + + let mut buf2 = 
MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(1048576 - 4), + not_taken: BranchTarget::ResolvedOffset(2000000 + 4 - 4), + }; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[0..8], &buf2.data[..]); + } + + #[test] + fn test_island_backward() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + while buf.cur_offset() < 2000000 { + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + } + + buf.bind_label(label(3)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(0), + not_taken: target(1), + }; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 12, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(8), + not_taken: BranchTarget::ResolvedOffset(4 - (2000000 + 4)), + }; + inst.emit(&mut buf2, &info, &mut state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(-(2000000 + 8)), + }; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[2000000..], &buf2.data[..]); + } + + #[test] + fn test_multiple_redirect() { + // label0: + // cbz x0, label1 + // b label2 + // label1: + // b label3 + // label2: + // nop + // nop + // b label0 + // label3: + // b label4 + // label4: + // b label5 + // label5: + // b label7 + // label6: + // nop + // label7: + // ret + // + // -- should become: + // + // label0: + // cbz x0, label7 + // label2: + // nop + // nop + // b label0 + // label6: + // nop + // label7: + // ret + + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(8); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::Zero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + inst.emit(&mut buf, &info, &mut state); + let inst = Inst::Jump { dest: target(0) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Jump { dest: target(4) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(4)); + let inst = Inst::Jump { dest: target(5) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(5)); + let inst = Inst::Jump { dest: target(7) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(6)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(7)); + let inst = Inst::Ret; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + let golden_data = vec![ + 0xa0, 0x00, 0x00, 0xb4, // cbz x0, 0x14 + 0x1f, 0x20, 0x03, 0xd5, // nop + 0x1f, 0x20, 0x03, 0xd5, // 
nop + 0xfd, 0xff, 0xff, 0x17, // b 0 + 0x1f, 0x20, 0x03, 0xd5, // nop + 0xc0, 0x03, 0x5f, 0xd6, // ret + ]; + + assert_eq!(&golden_data[..], &buf.data[..]); + } + + #[test] + fn test_handle_branch_cycle() { + // label0: + // b label1 + // label1: + // b label2 + // label2: + // b label3 + // label3: + // b label4 + // label4: + // b label1 // note: not label0 (to make it interesting). + // + // -- should become: + // + // label0, label1, ..., label4: + // b label0 + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(5); + + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(2) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Jump { dest: target(4) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(4)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + let golden_data = vec![ + 0x00, 0x00, 0x00, 0x14, // b 0 + ]; + + assert_eq!(&golden_data[..], &buf.data[..]); + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/compile.rs b/third_party/rust/cranelift-codegen/src/machinst/compile.rs new file mode 100644 index 0000000000..9a00cee805 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/compile.rs @@ -0,0 +1,110 @@ +//! Compilation backend pipeline: optimized IR to VCode / binemit. + +use crate::ir::Function; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use log::debug; +use regalloc::{allocate_registers_with_opts, Algorithm, Options, PrettyPrint}; + +/// Compile the given function down to VCode with allocated registers, ready +/// for binary emission. +pub fn compile<B: LowerBackend + MachBackend>( + f: &Function, + b: &B, + abi: Box<dyn ABICallee<I = B::MInst>>, + emit_info: <B::MInst as MachInstEmit>::Info, +) -> CodegenResult<VCode<B::MInst>> +where + B::MInst: PrettyPrint, +{ + // Compute lowered block order. + let block_order = BlockLoweringOrder::new(f); + // Build the lowering context. + let lower = Lower::new(f, abi, emit_info, block_order)?; + // Lower the IR. + let (mut vcode, stack_map_request_info) = { + let _tt = timing::vcode_lower(); + lower.lower(b)? + }; + + debug!( + "vcode from lowering: \n{}", + vcode.show_rru(Some(b.reg_universe())) + ); + + // Perform register allocation. 
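+    // For reference, the allocator variant matched on below is selected by
+    // the `regalloc` Cranelift setting; a caller could opt into the checked
+    // backtracking allocator roughly as follows (setting-value string assumed
+    // to mirror the variant name; `Configurable` must be in scope for `set`):
+    //
+    //     let mut builder = settings::builder();
+    //     builder.set("regalloc", "backtracking_checked").unwrap();
+    //     let flags = settings::Flags::new(builder);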
+ let (run_checker, algorithm) = match vcode.flags().regalloc() { + settings::Regalloc::Backtracking => (false, Algorithm::Backtracking(Default::default())), + settings::Regalloc::BacktrackingChecked => { + (true, Algorithm::Backtracking(Default::default())) + } + settings::Regalloc::ExperimentalLinearScan => { + (false, Algorithm::LinearScan(Default::default())) + } + settings::Regalloc::ExperimentalLinearScanChecked => { + (true, Algorithm::LinearScan(Default::default())) + } + }; + + #[cfg(feature = "regalloc-snapshot")] + { + use std::fs; + use std::path::Path; + if let Some(path) = std::env::var("SERIALIZE_REGALLOC").ok() { + let snapshot = regalloc::IRSnapshot::from_function(&vcode, b.reg_universe()); + let serialized = bincode::serialize(&snapshot).expect("couldn't serialize snapshot"); + + let file_path = Path::new(&path).join(Path::new(&format!("ir{}.bin", f.name))); + fs::write(file_path, &serialized).expect("couldn't write IR snapshot file"); + } + } + + // If either there are no reference-typed values, or else there are + // but there are no safepoints at which we need to know about them, + // then we don't need stack maps. + let sri = if stack_map_request_info.reftyped_vregs.len() > 0 + && stack_map_request_info.safepoint_insns.len() > 0 + { + Some(&stack_map_request_info) + } else { + None + }; + + let result = { + let _tt = timing::regalloc(); + allocate_registers_with_opts( + &mut vcode, + b.reg_universe(), + sri, + Options { + run_checker, + algorithm, + }, + ) + .map_err(|err| { + debug!( + "Register allocation error for vcode\n{}\nError: {:?}", + vcode.show_rru(Some(b.reg_universe())), + err + ); + err + }) + .expect("register allocation") + }; + + // Reorder vcode into final order and copy out final instruction sequence + // all at once. This also inserts prologues/epilogues. + { + let _tt = timing::vcode_post_ra(); + vcode.replace_insns_from_regalloc(result); + } + + debug!( + "vcode after regalloc: final version:\n{}", + vcode.show_rru(Some(b.reg_universe())) + ); + + Ok(vcode) +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/helpers.rs b/third_party/rust/cranelift-codegen/src/machinst/helpers.rs new file mode 100644 index 0000000000..0138da7670 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/helpers.rs @@ -0,0 +1,28 @@ +//! Miscellaneous helpers for machine backends. + +use super::{InsnOutput, LowerCtx, VCodeInst}; +use crate::ir::Type; +use regalloc::{Reg, Writable}; + +/// Returns the size (in bits) of a given type. +pub fn ty_bits(ty: Type) -> usize { + usize::from(ty.bits()) +} + +/// Is the type represented by an integer (not float) at the machine level? +pub(crate) fn ty_has_int_representation(ty: Type) -> bool { + ty.is_int() || ty.is_bool() || ty.is_ref() +} + +/// Is the type represented by a float or vector value at the machine level? +pub(crate) fn ty_has_float_or_vec_representation(ty: Type) -> bool { + ty.is_vector() || ty.is_float() +} + +/// Allocate a register for an instruction output and return it. +pub(crate) fn get_output_reg<I: VCodeInst, C: LowerCtx<I = I>>( + ctx: &mut C, + spec: InsnOutput, +) -> Writable<Reg> { + ctx.get_output(spec.insn, spec.output) +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs b/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs new file mode 100644 index 0000000000..1ff6faedd4 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs @@ -0,0 +1,59 @@ +//! 
A place to park MachInst::Inst fragments which are common across multiple architectures. + +use crate::ir::{self, Inst as IRInst}; + +//============================================================================ +// Instruction input "slots". +// +// We use these types to refer to operand numbers, and result numbers, together +// with the associated instruction, in a type-safe way. + +/// Identifier for a particular input of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) struct InsnInput { + pub(crate) insn: IRInst, + pub(crate) input: usize, +} + +/// Identifier for a particular output of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) struct InsnOutput { + pub(crate) insn: IRInst, + pub(crate) output: usize, +} + +//============================================================================ +// Atomic instructions. + +/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64 +/// targets. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum AtomicRmwOp { + /// Add + Add, + /// Sub + Sub, + /// And + And, + /// Or + Or, + /// Exclusive Or + Xor, + /// Exchange (swap operands) + Xchg, +} + +impl AtomicRmwOp { + /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`. + pub fn from(ir_op: ir::AtomicRmwOp) -> Self { + match ir_op { + ir::AtomicRmwOp::Add => AtomicRmwOp::Add, + ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub, + ir::AtomicRmwOp::And => AtomicRmwOp::And, + ir::AtomicRmwOp::Or => AtomicRmwOp::Or, + ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor, + ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg, + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/lower.rs b/third_party/rust/cranelift-codegen/src/machinst/lower.rs new file mode 100644 index 0000000000..531085ae4e --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/lower.rs @@ -0,0 +1,1084 @@ +//! This module implements lowering (instruction selection) from Cranelift IR +//! to machine instructions with virtual registers. This is *almost* the final +//! machine code, except for register allocation. + +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit}; +use crate::ir::instructions::BranchInfo; +use crate::ir::types::I64; +use crate::ir::{ + ArgumentPurpose, Block, Constant, ConstantData, ExternalName, Function, GlobalValueData, Inst, + InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, +}; +use crate::machinst::{ + ABICallee, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder, + VCodeConstant, VCodeConstantData, VCodeConstants, VCodeInst, +}; +use crate::CodegenResult; + +use regalloc::{Reg, RegClass, StackmapRequestInfo, VirtualReg, Writable}; + +use crate::data_value::DataValue; +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::convert::TryInto; +use log::debug; +use smallvec::SmallVec; + +/// An "instruction color" partitions CLIF instructions by side-effecting ops. +/// All instructions with the same "color" are guaranteed not to be separated by +/// any side-effecting op (for this purpose, loads are also considered +/// side-effecting, to avoid subtle questions w.r.t. the memory model), and +/// furthermore, it is guaranteed that for any two instructions A and B such +/// that color(A) == color(B), either A dominates B and B postdominates A, or +/// vice-versa. 
(For now, in practice, only ops in the same basic block can ever +/// have the same color, trivially providing the second condition.) Intuitively, +/// this means that the ops of the same color must always execute "together", as +/// part of one atomic contiguous section of the dynamic execution trace, and +/// they can be freely permuted (modulo true dataflow dependencies) without +/// affecting program behavior. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct InstColor(u32); +impl InstColor { + fn new(n: u32) -> InstColor { + InstColor(n) + } + + /// Get an arbitrary index representing this color. The index is unique + /// *within a single function compilation*, but indices may be reused across + /// functions. + pub fn get(self) -> u32 { + self.0 + } +} + +/// A context that machine-specific lowering code can use to emit lowered +/// instructions. This is the view of the machine-independent per-function +/// lowering context that is seen by the machine backend. +pub trait LowerCtx { + /// The instruction type for which this lowering framework is instantiated. + type I: VCodeInst; + + // Function-level queries: + + /// Get the `ABICallee`. + fn abi(&mut self) -> &mut dyn ABICallee<I = Self::I>; + /// Get the (virtual) register that receives the return value. A return + /// instruction should lower into a sequence that fills this register. (Why + /// not allow the backend to specify its own result register for the return? + /// Because there may be multiple return points.) + fn retval(&self, idx: usize) -> Writable<Reg>; + /// Returns the vreg containing the VmContext parameter, if there's one. + fn get_vm_context(&self) -> Option<Reg>; + + // General instruction queries: + + /// Get the instdata for a given IR instruction. + fn data(&self, ir_inst: Inst) -> &InstructionData; + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type; + /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple + /// providing this name and the "relocation distance", i.e., whether the backend + /// can assume the target will be "nearby" (within some small offset) or an + /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name, relocation distance estimate, and offset for a + /// symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option<MemFlags>; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Get the side-effect color of the given instruction (specifically, at the + /// program point just prior to the instruction). The "color" changes at + /// every side-effecting op; the backend should not try to merge across + /// side-effect colors unless the op being merged is known to be pure. + fn inst_color(&self, ir_inst: Inst) -> InstColor; + + // Instruction input/output queries: + + /// Get the number of inputs to the given IR instruction. + fn num_inputs(&self, ir_inst: Inst) -> usize; + /// Get the number of outputs to the given IR instruction. 
+ fn num_outputs(&self, ir_inst: Inst) -> usize; + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the value of a constant instruction (`iconst`, etc.) as a 64-bit + /// value, if possible. + fn get_constant(&self, ir_inst: Inst) -> Option<u64>; + /// Get the input in any combination of three forms: + /// + /// - An instruction, if the same color as this instruction or if the + /// producing instruction has no side effects (thus in both cases + /// mergeable); + /// - A constant, if the value is a constant; + /// - A register. + /// + /// The instruction input may be available in some or all of these + /// forms. More than one is possible: e.g., it may be produced by an + /// instruction in the same block, but may also have been forced into a + /// register already by an earlier op. It will *always* be available + /// in a register, at least. + /// + /// If the backend uses the register, rather than one of the other + /// forms (constant or merging of the producing op), it must call + /// `use_input_reg()` to ensure the producing inst is actually lowered + /// as well. Failing to do so may result in the instruction that generates + /// this value never being generated, thus resulting in incorrect execution. + /// For correctness, backends should thus wrap `get_input()` and + /// `use_input_regs()` with helpers that return a register only after + /// ensuring it is marked as used. + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; + /// Get the `idx`th output register of the given IR instruction. When + /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that + /// the backend will write results to these output register(s). This + /// register will always be "fresh"; it is guaranteed not to overlap with + /// any of the inputs, and can be freely used as a scratch register within + /// the lowered instruction sequence, as long as its final value is the + /// result of the computation. + fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg>; + + // Codegen primitives: allocate temps, emit instructions, set result registers, + // ask for an input to be gen'd into a register. + + /// Get a new temp. + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg>; + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: Self::I); + /// Emit a machine instruction that is a safepoint. + fn emit_safepoint(&mut self, mach_inst: Self::I); + /// Indicate that the given input uses the register returned by + /// `get_input()`. Codegen may not happen otherwise for the producing + /// instruction if it has no side effects and no uses. + fn use_input_reg(&mut self, input: LowerInput); + /// Is the given register output needed after the given instruction? Allows + /// instructions with multiple outputs to make fine-grained decisions on + /// which outputs to actually generate. + fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool; + /// Retrieve constant data given a handle. + fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData; + /// Indicate that a constant should be emitted. + fn use_constant(&mut self, constant: VCodeConstantData) -> VCodeConstant; + /// Retrieve the value immediate from an instruction. This will perform necessary lookups on the + /// `DataFlowGraph` to retrieve even large immediates. 
+ fn get_immediate(&self, ir_inst: Inst) -> Option<DataValue>; + /// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg + /// if `reg` is a real reg. `ty` describes the type of the value in `reg`. + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg; +} + +/// A representation of all of the ways in which an instruction input is +/// available: as a producing instruction (in the same color-partition), as a +/// constant, and/or in an existing register. See [LowerCtx::get_input] for more +/// details. +#[derive(Clone, Copy, Debug)] +pub struct LowerInput { + /// The value is live in a register. This option is always available. Call + /// [LowerCtx::use_input_reg()] if the register is used. + pub reg: Reg, + /// An instruction produces this value; the instruction's result index that + /// produces this value is given. + pub inst: Option<(Inst, usize)>, + /// The value is a known constant. + pub constant: Option<u64>, +} + +/// A machine backend. +pub trait LowerBackend { + /// The machine instruction type. + type MInst: VCodeInst; + + /// Lower a single instruction. + /// + /// For a branch, this function should not generate the actual branch + /// instruction. However, it must force any values it needs for the branch + /// edge (block-param actuals) into registers, because the actual branch + /// generation (`lower_branch_group()`) happens *after* any possible merged + /// out-edge. + fn lower<C: LowerCtx<I = Self::MInst>>(&self, ctx: &mut C, inst: Inst) -> CodegenResult<()>; + + /// Lower a block-terminating group of branches (which together can be seen + /// as one N-way branch), given a vcode MachLabel for each target. + fn lower_branch_group<C: LowerCtx<I = Self::MInst>>( + &self, + ctx: &mut C, + insts: &[Inst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()>; + + /// A bit of a hack: give a fixed register that always holds the result of a + /// `get_pinned_reg` instruction, if known. This allows elision of moves + /// into the associated vreg, instead using the real reg directly. + fn maybe_pinned_reg(&self) -> Option<Reg> { + None + } +} + +/// A pending instruction to insert and auxiliary information about it: its source location and +/// whether it is a safepoint. +struct InstTuple<I: VCodeInst> { + loc: SourceLoc, + is_safepoint: bool, + inst: I, +} + +/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence +/// from original Inst to MachInsts. +pub struct Lower<'func, I: VCodeInst> { + /// The function to lower. + f: &'func Function, + + /// Lowered machine instructions. + vcode: VCodeBuilder<I>, + + /// Mapping from `Value` (SSA value in IR) to virtual register. + value_regs: SecondaryMap<Value, Reg>, + + /// Return-value vregs. + retval_regs: Vec<Reg>, + + /// Instruction colors. + inst_colors: SecondaryMap<Inst, InstColor>, + + /// Instruction constant values, if known. + inst_constants: FxHashMap<Inst, u64>, + + /// Instruction has a side-effect and must be codegen'd. + inst_needed: SecondaryMap<Inst, bool>, + + /// Value (vreg) is needed and producer must be codegen'd. + vreg_needed: Vec<bool>, + + /// Next virtual register number to allocate. + next_vreg: u32, + + /// Insts in reverse block order, before final copy to vcode. + block_insts: Vec<InstTuple<I>>, + + /// Ranges in `block_insts` constituting BBs. + block_ranges: Vec<(usize, usize)>, + + /// Instructions collected for the BB in progress, in reverse order, with + /// source-locs attached. 
+ bb_insts: Vec<InstTuple<I>>, + + /// Instructions collected for the CLIF inst in progress, in forward order. + ir_insts: Vec<InstTuple<I>>, + + /// The register to use for GetPinnedReg, if any, on this architecture. + pinned_reg: Option<Reg>, + + /// The vreg containing the special VmContext parameter, if it is present in the current + /// function's signature. + vm_context: Option<Reg>, +} + +/// Notion of "relocation distance". This gives an estimate of how far away a symbol will be from a +/// reference. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RelocDistance { + /// Target of relocation is "nearby". The threshold for this is fuzzy but should be interpreted + /// as approximately "within the compiled output of one module"; e.g., within AArch64's +/- + /// 128MB offset. If unsure, use `Far` instead. + Near, + /// Target of relocation could be anywhere in the address space. + Far, +} + +fn alloc_vreg( + value_regs: &mut SecondaryMap<Value, Reg>, + regclass: RegClass, + value: Value, + next_vreg: &mut u32, +) -> VirtualReg { + if value_regs[value].is_invalid() { + // default value in map. + let v = *next_vreg; + *next_vreg += 1; + value_regs[value] = Reg::new_virtual(regclass, v); + debug!("value {} gets vreg {:?}", value, v); + } + value_regs[value].as_virtual_reg().unwrap() +} + +enum GenerateReturn { + Yes, + No, +} + +impl<'func, I: VCodeInst> Lower<'func, I> { + /// Prepare a new lowering context for the given IR function. + pub fn new( + f: &'func Function, + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + ) -> CodegenResult<Lower<'func, I>> { + let constants = VCodeConstants::with_capacity(f.dfg.constants.len()); + let mut vcode = VCodeBuilder::new(abi, emit_info, block_order, constants); + + let mut next_vreg: u32 = 0; + + let mut value_regs = SecondaryMap::with_default(Reg::invalid()); + + // Assign a vreg to each block param and each inst result. + for bb in f.layout.blocks() { + for ¶m in f.dfg.block_params(bb) { + let ty = f.dfg.value_type(param); + let vreg = alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, param, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!("bb {} param {}: vreg {:?}", bb, param, vreg); + } + for inst in f.layout.block_insts(bb) { + for &result in f.dfg.inst_results(inst) { + let ty = f.dfg.value_type(result); + let vreg = + alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, result, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!( + "bb {} inst {} ({:?}): result vreg {:?}", + bb, inst, f.dfg[inst], vreg + ); + } + } + } + + let vm_context = f + .signature + .special_param_index(ArgumentPurpose::VMContext) + .map(|vm_context_index| { + let entry_block = f.layout.entry_block().unwrap(); + let param = f.dfg.block_params(entry_block)[vm_context_index]; + value_regs[param] + }); + + // Assign a vreg to each return value. + let mut retval_regs = vec![]; + for ret in &f.signature.returns { + let v = next_vreg; + next_vreg += 1; + let regclass = I::rc_for_type(ret.value_type)?; + let vreg = Reg::new_virtual(regclass, v); + retval_regs.push(vreg); + vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); + } + + // Compute instruction colors, find constant instructions, and find instructions with + // side-effects, in one combined pass. 
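+        // Illustrative example of the coloring scheme implemented below
+        // (CLIF shown schematically; concrete color numbers are arbitrary
+        // per-function counters):
+        //
+        //     v1 = iadd v0, v0     ; color 1
+        //     v2 = load.i64 v1     ; color 1, side-effecting -> bump color
+        //     v3 = iadd v2, v2     ; color 2
+        //     store v3, v1         ; color 2, side-effecting -> bump color
+        //     v4 = iadd v3, v3     ; color 3
+        //
+        // Pure producers (the `iadd`s here) may be merged into any consumer;
+        // side-effecting producers may only be merged into a consumer of the
+        // same color (see `get_input_for_val` below).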
+ let mut cur_color = 0; + let mut inst_colors = SecondaryMap::with_default(InstColor::new(0)); + let mut inst_constants = FxHashMap::default(); + let mut inst_needed = SecondaryMap::with_default(false); + for bb in f.layout.blocks() { + cur_color += 1; + for inst in f.layout.block_insts(bb) { + let side_effect = has_lowering_side_effect(f, inst); + + // Assign colors. A new color is chosen *after* any side-effecting instruction. + inst_colors[inst] = InstColor::new(cur_color); + debug!("bb {} inst {} has color {}", bb, inst, cur_color); + if side_effect { + debug!(" -> side-effecting"); + inst_needed[inst] = true; + cur_color += 1; + } + + // Determine if this is a constant; if so, add to the table. + if let Some(c) = is_constant_64bit(f, inst) { + debug!(" -> constant: {}", c); + inst_constants.insert(inst, c); + } + } + } + + let vreg_needed = std::iter::repeat(false).take(next_vreg as usize).collect(); + + Ok(Lower { + f, + vcode, + value_regs, + retval_regs, + inst_colors, + inst_constants, + inst_needed, + vreg_needed, + next_vreg, + block_insts: vec![], + block_ranges: vec![], + bb_insts: vec![], + ir_insts: vec![], + pinned_reg: None, + vm_context, + }) + } + + fn gen_arg_setup(&mut self) { + if let Some(entry_bb) = self.f.layout.entry_block() { + debug!( + "gen_arg_setup: entry BB {} args are:\n{:?}", + entry_bb, + self.f.dfg.block_params(entry_bb) + ); + for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { + if !self.vcode.abi().arg_is_needed_in_body(i) { + continue; + } + let reg = Writable::from_reg(self.value_regs[*param]); + let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); + self.emit(insn); + } + if let Some(insn) = self.vcode.abi().gen_retval_area_setup() { + self.emit(insn); + } + } + } + + fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { + let retval_regs = self.retval_regs.clone(); + for (i, reg) in retval_regs.into_iter().enumerate() { + let reg = Writable::from_reg(reg); + let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg); + for insn in insns { + self.emit(insn); + } + } + let inst = match gen_ret_inst { + GenerateReturn::Yes => self.vcode.abi().gen_ret(), + GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), + }; + self.emit(inst); + } + + fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> { + debug!("lower_edge: pred {} succ {}", pred, succ); + + let num_args = self.f.dfg.block_params(succ).len(); + debug_assert!(num_args == self.f.dfg.inst_variable_args(inst).len()); + + // Most blocks have no params, so skip all the hoop-jumping below and make an early exit. + if num_args == 0 { + return Ok(()); + } + + // Make up two vectors of info: + // + // * one for dsts which are to be assigned constants. We'll deal with those second, so + // as to minimise live ranges. + // + // * one for dsts whose sources are non-constants. 
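+        // Worked example (SSA names are illustrative): for an edge such as
+        //
+        //     brnz v9, block5(v1, v2, v3)    ; where v3 is defined by an iconst
+        //     ...
+        //   block5(v10: i64, v11: i64, v12: i64):
+        //
+        // the moves (v10 <- v1) and (v11 <- v2) go into the non-constant
+        // bundle and (v12 <- constant) goes into the constant bundle. If no
+        // destination vreg also appears as a source, the non-constant moves
+        // are emitted directly; otherwise they are staged through fresh
+        // temporaries so that no source is overwritten before it is read.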
+ + let mut const_bundles = SmallVec::<[(Type, Writable<Reg>, u64); 16]>::new(); + let mut var_bundles = SmallVec::<[(Type, Writable<Reg>, Reg); 16]>::new(); + + let mut i = 0; + for (dst_val, src_val) in self + .f + .dfg + .block_params(succ) + .iter() + .zip(self.f.dfg.inst_variable_args(inst).iter()) + { + let src_val = self.f.dfg.resolve_aliases(*src_val); + let ty = self.f.dfg.value_type(src_val); + + debug_assert!(ty == self.f.dfg.value_type(*dst_val)); + let dst_reg = self.value_regs[*dst_val]; + + let input = self.get_input_for_val(inst, src_val); + debug!("jump arg {} is {}, reg {:?}", i, src_val, input.reg); + i += 1; + + if let Some(c) = input.constant { + const_bundles.push((ty, Writable::from_reg(dst_reg), c)); + } else { + self.use_input_reg(input); + let src_reg = input.reg; + // Skip self-assignments. Not only are they pointless, they falsely trigger the + // overlap-check below and hence can cause a lot of unnecessary copying through + // temporaries. + if dst_reg != src_reg { + var_bundles.push((ty, Writable::from_reg(dst_reg), src_reg)); + } + } + } + + // Deal first with the moves whose sources are variables. + + // FIXME: use regalloc.rs' SparseSetU here. This would avoid all heap allocation + // for cases of up to circa 16 args. Currently not possible because regalloc.rs + // does not export it. + let mut src_reg_set = FxHashSet::<Reg>::default(); + for (_, _, src_reg) in &var_bundles { + src_reg_set.insert(*src_reg); + } + let mut overlaps = false; + for (_, dst_reg, _) in &var_bundles { + if src_reg_set.contains(&dst_reg.to_reg()) { + overlaps = true; + break; + } + } + + // If, as is mostly the case, the source and destination register sets are non + // overlapping, then we can copy directly, so as to save the register allocator work. + if !overlaps { + for (ty, dst_reg, src_reg) in &var_bundles { + self.emit(I::gen_move(*dst_reg, *src_reg, *ty)); + } + } else { + // There's some overlap, so play safe and copy via temps. + let mut tmp_regs = SmallVec::<[Writable<Reg>; 16]>::new(); + for (ty, _, _) in &var_bundles { + tmp_regs.push(self.alloc_tmp(I::rc_for_type(*ty)?, *ty)); + } + for ((ty, _, src_reg), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { + self.emit(I::gen_move(*tmp_reg, *src_reg, *ty)); + } + for ((ty, dst_reg, _), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { + self.emit(I::gen_move(*dst_reg, (*tmp_reg).to_reg(), *ty)); + } + } + + // Now, finally, deal with the moves whose sources are constants. + for (ty, dst_reg, const_u64) in &const_bundles { + for inst in I::gen_constant(*dst_reg, *const_u64, *ty, |reg_class, ty| { + self.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + self.emit(inst); + } + } + + Ok(()) + } + + fn lower_clif_block<B: LowerBackend<MInst = I>>( + &mut self, + backend: &B, + block: Block, + ) -> CodegenResult<()> { + // Lowering loop: + // - For each non-branch instruction, in reverse order: + // - If side-effecting (load, store, branch/call/return, possible trap), or if + // used outside of this block, or if demanded by another inst, then lower. + // + // That's it! Lowering of side-effecting ops will force all *needed* + // (live) non-side-effecting ops to be lowered at the right places, via + // the `use_input_reg()` callback on the `LowerCtx` (that's us). That's + // because `use_input_reg()` sets the eager/demand bit for any insts + // whose result registers are used. + // + // We build up the BB in reverse instruction order in `bb_insts`. 
+ // Because the machine backend calls `ctx.emit()` in forward order, we + // collect per-IR-inst lowered instructions in `ir_insts`, then reverse + // these and append to `bb_insts` as we go backward through the block. + // `bb_insts` are then reversed again and appended to the VCode at the + // end of the BB (in the toplevel driver `lower()`). + for inst in self.f.layout.block_insts(block).rev() { + let data = &self.f.dfg[inst]; + let value_needed = self + .f + .dfg + .inst_results(inst) + .iter() + .any(|&result| self.vreg_needed[self.value_regs[result].get_index()]); + debug!( + "lower_clif_block: block {} inst {} ({:?}) is_branch {} inst_needed {} value_needed {}", + block, + inst, + data, + data.opcode().is_branch(), + self.inst_needed[inst], + value_needed, + ); + if self.f.dfg[inst].opcode().is_branch() { + continue; + } + // Normal instruction: codegen if eager bit is set. (Other instructions may also be + // codegened if not eager when they are used by another instruction.) + if self.inst_needed[inst] || value_needed { + debug!("lowering: inst {}: {:?}", inst, self.f.dfg[inst]); + backend.lower(self, inst)?; + } + if data.opcode().is_return() { + // Return: handle specially, using ABI-appropriate sequence. + let gen_ret = if data.opcode() == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(data.opcode() == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + } + + let loc = self.srcloc(inst); + self.finish_ir_inst(loc); + } + Ok(()) + } + + fn finish_ir_inst(&mut self, loc: SourceLoc) { + // `bb_insts` is kept in reverse order, so emit the instructions in + // reverse order. + for mut tuple in self.ir_insts.drain(..).rev() { + tuple.loc = loc; + self.bb_insts.push(tuple); + } + } + + fn finish_bb(&mut self) { + let start = self.block_insts.len(); + for tuple in self.bb_insts.drain(..).rev() { + self.block_insts.push(tuple); + } + let end = self.block_insts.len(); + self.block_ranges.push((start, end)); + } + + fn copy_bbs_to_vcode(&mut self) { + for &(start, end) in self.block_ranges.iter().rev() { + for &InstTuple { + loc, + is_safepoint, + ref inst, + } in &self.block_insts[start..end] + { + self.vcode.set_srcloc(loc); + self.vcode.push(inst.clone(), is_safepoint); + } + self.vcode.end_bb(); + } + } + + fn lower_clif_branches<B: LowerBackend<MInst = I>>( + &mut self, + backend: &B, + block: Block, + branches: &SmallVec<[Inst; 2]>, + targets: &SmallVec<[MachLabel; 2]>, + maybe_fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + debug!( + "lower_clif_branches: block {} branches {:?} targets {:?} maybe_fallthrough {:?}", + block, branches, targets, maybe_fallthrough + ); + backend.lower_branch_group(self, branches, targets, maybe_fallthrough)?; + let loc = self.srcloc(branches[0]); + self.finish_ir_inst(loc); + Ok(()) + } + + fn collect_branches_and_targets( + &self, + bindex: BlockIndex, + _bb: Block, + branches: &mut SmallVec<[Inst; 2]>, + targets: &mut SmallVec<[MachLabel; 2]>, + ) { + branches.clear(); + targets.clear(); + let mut last_inst = None; + for &(inst, succ) in self.vcode.block_order().succ_indices(bindex) { + // Avoid duplicates: this ensures a br_table is only inserted once. + if last_inst != Some(inst) { + branches.push(inst); + } else { + debug_assert!(self.f.dfg[inst].opcode() == Opcode::BrTable); + debug_assert!(branches.len() == 1); + } + last_inst = Some(inst); + targets.push(MachLabel::from_block(succ)); + } + } + + /// Lower the function. 
+ pub fn lower<B: LowerBackend<MInst = I>>( + mut self, + backend: &B, + ) -> CodegenResult<(VCode<I>, StackmapRequestInfo)> { + debug!("about to lower function: {:?}", self.f); + + // Initialize the ABI object, giving it a temp if requested. + let maybe_tmp = if self.vcode.abi().temp_needed() { + Some(self.alloc_tmp(RegClass::I64, I64)) + } else { + None + }; + self.vcode.abi().init(maybe_tmp); + + // Get the pinned reg here (we only parameterize this function on `B`, + // not the whole `Lower` impl). + self.pinned_reg = backend.maybe_pinned_reg(); + + self.vcode.set_entry(0); + + // Reused vectors for branch lowering. + let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); + let mut targets: SmallVec<[MachLabel; 2]> = SmallVec::new(); + + // get a copy of the lowered order; we hold this separately because we + // need a mut ref to the vcode to mutate it below. + let lowered_order: SmallVec<[LoweredBlock; 64]> = self + .vcode + .block_order() + .lowered_order() + .iter() + .cloned() + .collect(); + + // Main lowering loop over lowered blocks. + for (bindex, lb) in lowered_order.iter().enumerate().rev() { + let bindex = bindex as BlockIndex; + + // Lower the block body in reverse order (see comment in + // `lower_clif_block()` for rationale). + + // End branches. + if let Some(bb) = lb.orig_block() { + self.collect_branches_and_targets(bindex, bb, &mut branches, &mut targets); + if branches.len() > 0 { + let maybe_fallthrough = if (bindex + 1) < (lowered_order.len() as BlockIndex) { + Some(MachLabel::from_block(bindex + 1)) + } else { + None + }; + self.lower_clif_branches(backend, bb, &branches, &targets, maybe_fallthrough)?; + self.finish_ir_inst(self.srcloc(branches[0])); + } + } else { + // If no orig block, this must be a pure edge block; get the successor and + // emit a jump. + let (_, succ) = self.vcode.block_order().succ_indices(bindex)[0]; + self.emit(I::gen_jump(MachLabel::from_block(succ))); + self.finish_ir_inst(SourceLoc::default()); + } + + // Out-edge phi moves. + if let Some((pred, inst, succ)) = lb.out_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + // Original block body. + if let Some(bb) = lb.orig_block() { + self.lower_clif_block(backend, bb)?; + } + // In-edge phi moves. + if let Some((pred, inst, succ)) = lb.in_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + + if bindex == 0 { + // Set up the function with arg vreg inits. + self.gen_arg_setup(); + self.finish_ir_inst(SourceLoc::default()); + } + + self.finish_bb(); + } + + self.copy_bbs_to_vcode(); + + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. + let (vcode, stack_map_info) = self.vcode.build(); + debug!("built vcode: {:?}", vcode); + + Ok((vcode, stack_map_info)) + } + + /// Get the actual inputs for a value. This is the implementation for + /// `get_input()` but starting from the SSA value, which is not exposed to + /// the backend. + fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { + debug!("get_input_for_val: val {} at inst {}", val, at_inst); + let mut reg = self.value_regs[val]; + debug!(" -> reg {:?}", reg); + assert!(reg.is_valid()); + let mut inst = match self.f.dfg.value_def(val) { + // OK to merge source instruction if (i) we have a source + // instruction, and either (ii-a) it has no side effects, or (ii-b) + // it has the same color as this instruction. 
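+            // Backend-side view of this decision (sketch; `ctx` is a
+            // `LowerCtx` and `materialize_constant` is a hypothetical
+            // helper): a lowering rule that does not merge the producer must
+            // mark the register as used so the producer is still generated:
+            //
+            //     let input = ctx.get_input(ir_inst, 0);
+            //     let reg = if let Some(c) = input.constant {
+            //         materialize_constant(ctx, c)   // hypothetical helper
+            //     } else {
+            //         ctx.use_input_reg(input);      // keep the producer alive
+            //         input.reg
+            //     };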
+ ValueDef::Result(src_inst, result_idx) => { + debug!(" -> src inst {}", src_inst); + debug!( + " -> has lowering side effect: {}", + has_lowering_side_effect(self.f, src_inst) + ); + debug!( + " -> our color is {:?}, src inst is {:?}", + self.inst_color(at_inst), + self.inst_color(src_inst) + ); + if !has_lowering_side_effect(self.f, src_inst) + || self.inst_color(at_inst) == self.inst_color(src_inst) + { + Some((src_inst, result_idx)) + } else { + None + } + } + _ => None, + }; + let constant = inst.and_then(|(inst, _)| self.get_constant(inst)); + + // Pinned-reg hack: if backend specifies a fixed pinned register, use it + // directly when we encounter a GetPinnedReg op, rather than lowering + // the actual op, and do not return the source inst to the caller; the + // value comes "out of the ether" and we will not force generation of + // the superfluous move. + if let Some((i, _)) = inst { + if self.f.dfg[i].opcode() == Opcode::GetPinnedReg { + if let Some(pr) = self.pinned_reg { + reg = pr; + } + inst = None; + } + } + + LowerInput { + reg, + inst, + constant, + } + } +} + +impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { + type I = I; + + fn abi(&mut self) -> &mut dyn ABICallee<I = I> { + self.vcode.abi() + } + + fn retval(&self, idx: usize) -> Writable<Reg> { + Writable::from_reg(self.retval_regs[idx]) + } + + fn get_vm_context(&self) -> Option<Reg> { + self.vm_context + } + + fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } + | &InstructionData::FuncAddr { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + let dist = funcdata.reloc_distance(); + Some((&funcdata.name, dist)) + } + _ => None, + } + } + + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&self.f.dfg.signatures[funcdata.signature]) + } + &InstructionData::CallIndirect { sig_ref, .. } => Some(&self.f.dfg.signatures[sig_ref]), + _ => None, + } + } + + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)> { + match &self.f.dfg[ir_inst] { + &InstructionData::UnaryGlobalValue { global_value, .. } => { + let gvdata = &self.f.global_values[global_value]; + match gvdata { + &GlobalValueData::Symbol { + ref name, + ref offset, + .. + } => { + let offset = offset.bits(); + let dist = gvdata.maybe_reloc_distance().unwrap(); + Some((name, dist, offset)) + } + _ => None, + } + } + _ => None, + } + } + + fn memflags(&self, ir_inst: Inst) -> Option<MemFlags> { + match &self.f.dfg[ir_inst] { + &InstructionData::AtomicCas { flags, .. } => Some(flags), + &InstructionData::AtomicRmw { flags, .. } => Some(flags), + &InstructionData::Load { flags, .. } + | &InstructionData::LoadComplex { flags, .. } + | &InstructionData::LoadNoOffset { flags, .. } + | &InstructionData::Store { flags, .. } + | &InstructionData::StoreComplex { flags, .. } => Some(flags), + &InstructionData::StoreNoOffset { flags, .. 
} => Some(flags), + _ => None, + } + } + + fn srcloc(&self, ir_inst: Inst) -> SourceLoc { + self.f.srclocs[ir_inst] + } + + fn inst_color(&self, ir_inst: Inst) -> InstColor { + self.inst_colors[ir_inst] + } + + fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + fn get_constant(&self, ir_inst: Inst) -> Option<u64> { + self.inst_constants.get(&ir_inst).cloned() + } + + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.get_input_for_val(ir_inst, val) + } + + fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg> { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + Writable::from_reg(self.value_regs[val]) + } + + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg> { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + fn emit(&mut self, mach_inst: I) { + self.ir_insts.push(InstTuple { + loc: SourceLoc::default(), + is_safepoint: false, + inst: mach_inst, + }); + } + + fn emit_safepoint(&mut self, mach_inst: I) { + self.ir_insts.push(InstTuple { + loc: SourceLoc::default(), + is_safepoint: true, + inst: mach_inst, + }); + } + + fn use_input_reg(&mut self, input: LowerInput) { + debug!("use_input_reg: vreg {:?} is needed", input.reg); + // We may directly return a real (machine) register when we know that register holds the + // result of an opcode (e.g. GetPinnedReg). + if input.reg.is_virtual() { + self.vreg_needed[input.reg.get_index()] = true; + } + } + + fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool { + self.inst_needed[ir_inst] || self.vreg_needed[reg.get_index()] + } + + fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData { + self.f.dfg.constants.get(constant_handle) + } + + fn use_constant(&mut self, constant: VCodeConstantData) -> VCodeConstant { + self.vcode.constants().insert(constant) + } + + fn get_immediate(&self, ir_inst: Inst) -> Option<DataValue> { + let inst_data = self.data(ir_inst); + match inst_data { + InstructionData::Shuffle { mask, .. } => { + let buffer = self.f.dfg.immediates.get(mask.clone()).unwrap().as_slice(); + let value = DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")); + Some(value) + } + InstructionData::UnaryConst { + constant_handle, .. + } => { + let buffer = self.f.dfg.constants.get(constant_handle.clone()).as_slice(); + let value = DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")); + Some(value) + } + _ => inst_data.imm_value(), + } + } + + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg { + if reg.is_virtual() { + reg + } else { + let rc = reg.get_class(); + let new_reg = self.alloc_tmp(rc, ty); + self.emit(I::gen_move(new_reg, reg, ty)); + new_reg.to_reg() + } + } +} + +/// Visit all successors of a block with a given visitor closure. 
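+/// A small, hedged usage sketch (`f` is assumed to be an `ir::Function` and
+/// `block` one of its blocks; marked `ignore` because this is a crate-private
+/// helper):
+///
+/// ```ignore
+/// let mut succs = vec![];
+/// visit_block_succs(&f, block, |_branch_inst, succ| succs.push(succ));
+/// // `succs` now holds every successor reached by `block`'s branches,
+/// // including all jump-table targets.
+/// ```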
+pub(crate) fn visit_block_succs<F: FnMut(Inst, Block)>(f: &Function, block: Block, mut visit: F) { + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode().is_branch() { + visit_branch_targets(f, block, inst, &mut visit); + } + } +} + +fn visit_branch_targets<F: FnMut(Inst, Block)>( + f: &Function, + block: Block, + inst: Inst, + visit: &mut F, +) { + if f.dfg[inst].opcode() == Opcode::Fallthrough { + visit(inst, f.layout.next_block(block).unwrap()); + } else { + match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { + BranchInfo::NotABranch => {} + BranchInfo::SingleDest(dest, _) => { + visit(inst, dest); + } + BranchInfo::Table(table, maybe_dest) => { + if let Some(dest) = maybe_dest { + visit(inst, dest); + } + for &dest in f.jump_tables[table].as_slice() { + visit(inst, dest); + } + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/mod.rs b/third_party/rust/cranelift-codegen/src/machinst/mod.rs new file mode 100644 index 0000000000..4b12f2fd1d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/mod.rs @@ -0,0 +1,421 @@ +//! This module exposes the machine-specific backend definition pieces. +//! +//! The MachInst infrastructure is the compiler backend, from CLIF +//! (ir::Function) to machine code. The purpose of this infrastructure is, at a +//! high level, to do instruction selection/lowering (to machine instructions), +//! register allocation, and then perform all the fixups to branches, constant +//! data references, etc., needed to actually generate machine code. +//! +//! The container for machine instructions, at various stages of construction, +//! is the `VCode` struct. We refer to a sequence of machine instructions organized +//! into basic blocks as "vcode". This is short for "virtual-register code", though +//! it's a bit of a misnomer because near the end of the pipeline, vcode has all +//! real registers. Nevertheless, the name is catchy and we like it. +//! +//! The compilation pipeline, from an `ir::Function` (already optimized as much as +//! you like by machine-independent optimization passes) onward, is as follows. +//! (N.B.: though we show the VCode separately at each stage, the passes +//! mutate the VCode in place; these are not separate copies of the code.) +//! +//! ```plain +//! +//! ir::Function (SSA IR, machine-independent opcodes) +//! | +//! | [lower] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - mostly virtual registers. +//! | - cond branches in two-target form. +//! | - branch targets are block indices. +//! | - in-memory constants held by insns, +//! | with unknown offsets. +//! | - critical edges (actually all edges) +//! | are split.) +//! | [regalloc] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all real registers. +//! | - new instruction sequence returned +//! | out-of-band in RegAllocResult. +//! | - instruction sequence has spills, +//! | reloads, and moves inserted. +//! | - other invariants same as above.) +//! | +//! | [preamble/postamble] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - stack-frame size known. +//! | - out-of-band instruction sequence +//! | has preamble prepended to entry +//! | block, and postamble injected before +//! | every return instruction. +//! | - all symbolic stack references to +//! | stackslots and spillslots are resolved +//! | to concrete FP-offset mem addresses.) +//! | [block/insn ordering] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! 
| - vcode.final_block_order is filled in. +//! | - new insn sequence from regalloc is +//! | placed back into vcode and block +//! | boundaries are updated.) +//! | [redundant branch/block +//! | removal] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all blocks that were just an +//! | unconditional branch are removed.) +//! | +//! | [branch finalization +//! | (fallthroughs)] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all branches are in lowered one- +//! | target form, but targets are still +//! | block indices.) +//! | +//! | [branch finalization +//! | (offsets)] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all branch offsets from start of +//! | function are known, and all branches +//! | have resolved-offset targets.) +//! | +//! | [MemArg finalization] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all MemArg references to the constant +//! | pool are replaced with offsets. +//! | - all constant-pool data is collected +//! | in the VCode.) +//! | +//! | [binary emission] +//! | +//! Vec<u8> (machine code!) +//! +//! ``` + +use crate::binemit::{CodeInfo, CodeOffset, StackMap}; +use crate::ir::condcodes::IntCC; +use crate::ir::{Function, SourceLoc, Type}; +use crate::isa::unwind::input as unwind_input; +use crate::result::CodegenResult; +use crate::settings::Flags; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::fmt::Debug; +use core::ops::Range; +use regalloc::RegUsageCollector; +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable, +}; +use smallvec::SmallVec; +use std::string::String; +use target_lexicon::Triple; + +pub mod lower; +pub use lower::*; +pub mod vcode; +pub use vcode::*; +pub mod compile; +pub use compile::*; +pub mod blockorder; +pub use blockorder::*; +pub mod abi; +pub use abi::*; +pub mod abi_impl; +pub use abi_impl::*; +pub mod buffer; +pub use buffer::*; +pub mod adapter; +pub use adapter::*; +pub mod helpers; +pub use helpers::*; +pub mod inst_common; +pub use inst_common::*; + +/// A machine instruction. +pub trait MachInst: Clone + Debug { + /// Return the registers referenced by this machine instruction along with + /// the modes of reference (use, def, modify). + fn get_regs(&self, collector: &mut RegUsageCollector); + + /// Map virtual registers to physical registers using the given virt->phys + /// maps corresponding to the program points prior to, and after, this instruction. + fn map_regs<RUM: RegUsageMapper>(&mut self, maps: &RUM); + + /// If this is a simple move, return the (source, destination) tuple of registers. + fn is_move(&self) -> Option<(Writable<Reg>, Reg)>; + + /// Is this a terminator (branch or ret)? If so, return its type + /// (ret/uncond/cond) and target if applicable. + fn is_term<'a>(&'a self) -> MachTerminator<'a>; + + /// Returns true if the instruction is an epilogue placeholder. + fn is_epilogue_placeholder(&self) -> bool; + + /// Should this instruction be included in the clobber-set? + fn is_included_in_clobbers(&self) -> bool { + true + } + + /// Generate a move. + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self; + + /// Generate a constant into a reg. + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + alloc_tmp: F, + ) -> SmallVec<[Self; 4]>; + + /// Generate a zero-length no-op. 
+ fn gen_zero_len_nop() -> Self; + + /// Possibly operate on a value directly in a spill-slot rather than a + /// register. Useful if the machine has register-memory instruction forms + /// (e.g., add directly from or directly to memory), like x86. + fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option<Self>; + + /// Determine a register class to store the given Cranelift type. + /// May return an error if the type isn't supported by this backend. + fn rc_for_type(ty: Type) -> CodegenResult<RegClass>; + + /// Generate a jump to another target. Used during lowering of + /// control flow. + fn gen_jump(target: MachLabel) -> Self; + + /// Generate a NOP. The `preferred_size` parameter allows the caller to + /// request a NOP of that size, or as close to it as possible. The machine + /// backend may return a NOP whose binary encoding is smaller than the + /// preferred size, but must not return a NOP that is larger. However, + /// the instruction must have a nonzero size. + fn gen_nop(preferred_size: usize) -> Self; + + /// Get the register universe for this backend. + fn reg_universe(flags: &Flags) -> RealRegUniverse; + + /// Align a basic block offset (from start of function). By default, no + /// alignment occurs. + fn align_basic_block(offset: CodeOffset) -> CodeOffset { + offset + } + + /// What is the worst-case instruction size emitted by this instruction type? + fn worst_case_size() -> CodeOffset; + + /// What is the register class used for reference types (GC-observable pointers)? Can + /// be dependent on compilation flags. + fn ref_type_regclass(_flags: &Flags) -> RegClass; + + /// A label-use kind: a type that describes the types of label references that + /// can occur in an instruction. + type LabelUse: MachInstLabelUse; +} + +/// A descriptor of a label reference (use) in an instruction set. +pub trait MachInstLabelUse: Clone + Copy + Debug + Eq { + /// Required alignment for any veneer. Usually the required instruction + /// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86). + const ALIGN: CodeOffset; + + /// What is the maximum PC-relative range (positive)? E.g., if `1024`, a + /// label-reference fixup at offset `x` is valid if the label resolves to `x + /// + 1024`. + fn max_pos_range(self) -> CodeOffset; + /// What is the maximum PC-relative range (negative)? This is the absolute + /// value; i.e., if `1024`, then a label-reference fixup at offset `x` is + /// valid if the label resolves to `x - 1024`. + fn max_neg_range(self) -> CodeOffset; + /// What is the size of code-buffer slice this label-use needs to patch in + /// the label's value? + fn patch_size(self) -> CodeOffset; + /// Perform a code-patch, given the offset into the buffer of this label use + /// and the offset into the buffer of the label's definition. + /// It is guaranteed that, given `delta = offset - label_offset`, we will + /// have `offset >= -self.max_neg_range()` and `offset <= + /// self.max_pos_range()`. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset); + /// Can the label-use be patched to a veneer that supports a longer range? + /// Usually valid for jumps (a short-range jump can jump to a longer-range + /// jump), but not for e.g. constant pool references, because the constant + /// load would require different code (one more level of indirection). + fn supports_veneer(self) -> bool; + /// How many bytes are needed for a veneer? + fn veneer_size(self) -> CodeOffset; + /// Generate a veneer. 
The given code-buffer slice is `self.veneer_size()` + /// bytes long at offset `veneer_offset` in the buffer. The original + /// label-use will be patched to refer to this veneer's offset. A new + /// (offset, LabelUse) is returned that allows the veneer to use the actual + /// label. For veneers to work properly, it is expected that the new veneer + /// has a larger range; on most platforms this probably means either a + /// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that + /// stage, a jump that supports a full 32-bit range, for example. + fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self); +} + +/// Describes a block terminator (not call) in the vcode, when its branches +/// have not yet been finalized (so a branch may have two targets). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MachTerminator<'a> { + /// Not a terminator. + None, + /// A return instruction. + Ret, + /// An unconditional branch to another block. + Uncond(MachLabel), + /// A conditional branch to one of two other blocks. + Cond(MachLabel, MachLabel), + /// An indirect branch with known possible targets. + Indirect(&'a [MachLabel]), +} + +/// A trait describing the ability to encode a MachInst into binary machine code. +pub trait MachInstEmit: MachInst { + /// Persistent state carried across `emit` invocations. + type State: MachInstEmitState<Self>; + /// Constant information used in `emit` invocations. + type Info: MachInstEmitInfo; + /// Unwind info generator. + type UnwindInfo: UnwindInfoGenerator<Self>; + /// Emit the instruction. + fn emit(&self, code: &mut MachBuffer<Self>, info: &Self::Info, state: &mut Self::State); + /// Pretty-print the instruction. + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut Self::State) -> String; +} + +/// Constant information used to emit an instruction. +pub trait MachInstEmitInfo { + /// Return the target-independent settings used for the compilation of this + /// particular function. + fn flags(&self) -> &Flags; +} + +/// A trait describing the emission state carried between MachInsts when +/// emitting a function body. +pub trait MachInstEmitState<I: MachInst>: Default + Clone + Debug { + /// Create a new emission state given the ABI object. + fn new(abi: &dyn ABICallee<I = I>) -> Self; + /// Update the emission state before emitting an instruction that is a + /// safepoint. + fn pre_safepoint(&mut self, _stack_map: StackMap) {} + /// Update the emission state to indicate instructions are associated with a + /// particular SourceLoc. + fn pre_sourceloc(&mut self, _srcloc: SourceLoc) {} +} + +/// The result of a `MachBackend::compile_function()` call. Contains machine +/// code (as bytes) and a disassembly, if requested. +pub struct MachCompileResult { + /// Machine code. + pub buffer: MachBufferFinalized, + /// Size of stack frame, in bytes. + pub frame_size: u32, + /// Disassembly, if requested. + pub disasm: Option<String>, + /// Unwind info. + pub unwind_info: Option<unwind_input::UnwindInfo<Reg>>, +} + +impl MachCompileResult { + /// Get a `CodeInfo` describing section sizes from this compilation result. + pub fn code_info(&self) -> CodeInfo { + let code_size = self.buffer.total_size(); + CodeInfo { + code_size, + jumptables_size: 0, + rodata_size: 0, + total_size: code_size, + } + } +} + +/// Top-level machine backend trait, which wraps all monomorphized code and +/// allows a virtual call from the machine-independent `Function::compile()`. 
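+///
+/// A hedged sketch of how a caller might drive a backend through this trait
+/// object (`backend` is assumed to be a `&dyn MachBackend` obtained from the
+/// target ISA; not a doctest):
+///
+/// ```ignore
+/// let result = backend.compile_function(&func, /* want_disasm = */ false)?;
+/// let code_info = result.code_info();
+/// println!("code size: {} bytes, frame size: {} bytes",
+///          code_info.total_size, result.frame_size);
+/// ```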
+pub trait MachBackend { + /// Compile the given function. + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult>; + + /// Return flags for this backend. + fn flags(&self) -> &Flags; + + /// Return triple for this backend. + fn triple(&self) -> Triple; + + /// Return name for this backend. + fn name(&self) -> &'static str; + + /// Return the register universe for this backend. + fn reg_universe(&self) -> &RealRegUniverse; + + /// Machine-specific condcode info needed by TargetIsa. + /// Condition that will be true when an IaddIfcout overflows. + fn unsigned_add_overflow_condition(&self) -> IntCC; + + /// Machine-specific condcode info needed by TargetIsa. + /// Condition that will be true when an IsubIfcout overflows. + fn unsigned_sub_overflow_condition(&self) -> IntCC; + + /// Produces unwind info based on backend results. + #[cfg(feature = "unwind")] + fn emit_unwind_info( + &self, + _result: &MachCompileResult, + _kind: UnwindInfoKind, + ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + // By default, an backend cannot produce unwind info. + Ok(None) + } + + /// Machine-specific condcode info needed by TargetIsa. + /// Creates a new System V Common Information Entry for the ISA. + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + // By default, an ISA cannot create a System V CIE + None + } +} + +/// Expected unwind info type. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum UnwindInfoKind { + /// No unwind info. + None, + /// SystemV CIE/FDE unwind info. + #[cfg(feature = "unwind")] + SystemV, + /// Windows X64 Unwind info + #[cfg(feature = "unwind")] + Windows, +} + +/// Input data for UnwindInfoGenerator. +pub struct UnwindInfoContext<'a, Inst: MachInstEmit> { + /// Function instructions. + pub insts: &'a [Inst], + /// Instruction layout: end offsets + pub insts_layout: &'a [CodeOffset], + /// Length of the function. + pub len: CodeOffset, + /// Prologue range. + pub prologue: Range<u32>, + /// Epilogue ranges. + pub epilogues: &'a [Range<u32>], +} + +/// UnwindInfo generator/helper. +pub trait UnwindInfoGenerator<I: MachInstEmit> { + /// Creates unwind info based on function signature and + /// emitted instructions. + fn create_unwind_info( + context: UnwindInfoContext<I>, + ) -> CodegenResult<Option<unwind_input::UnwindInfo<Reg>>>; +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/vcode.rs b/third_party/rust/cranelift-codegen/src/machinst/vcode.rs new file mode 100644 index 0000000000..c57f018e35 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/vcode.rs @@ -0,0 +1,895 @@ +//! This implements the VCode container: a CFG of Insts that have been lowered. +//! +//! VCode is virtual-register code. An instruction in VCode is almost a machine +//! instruction; however, its register slots can refer to virtual registers in +//! addition to real machine registers. +//! +//! VCode is structured with traditional basic blocks, and +//! each block must be terminated by an unconditional branch (one target), a +//! conditional branch (two targets), or a return (no targets). Note that this +//! slightly differs from the machine code of most ISAs: in most ISAs, a +//! conditional branch has one target (and the not-taken case falls through). +//! However, we expect that machine backends will elide branches to the following +//! block (i.e., zero-offset jumps), and will be able to codegen a branch-cond / +//! 
branch-uncond pair if *both* targets are not fallthrough. This allows us to +//! play with layout prior to final binary emission, as well, if we want. +//! +//! See the main module comment in `mod.rs` for more details on the VCode-based +//! backend pipeline. + +use crate::ir::{self, types, Constant, ConstantData, SourceLoc}; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use regalloc::Function as RegallocFunction; +use regalloc::Set as RegallocSet; +use regalloc::{ + BlockIx, InstIx, PrettyPrint, Range, RegAllocResult, RegClass, RegUsageCollector, + RegUsageMapper, SpillSlot, StackmapRequestInfo, +}; + +use alloc::boxed::Box; +use alloc::{borrow::Cow, vec::Vec}; +use cranelift_entity::{entity_impl, Keys, PrimaryMap}; +use std::cell::RefCell; +use std::collections::HashMap; +use std::fmt; +use std::iter; +use std::string::String; + +/// Index referring to an instruction in VCode. +pub type InsnIndex = u32; +/// Index referring to a basic block in VCode. +pub type BlockIndex = u32; +/// Range of an instructions in VCode. +pub type InsnRange = core::ops::Range<InsnIndex>; + +/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be +/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`. +pub trait VCodeInst: MachInst + MachInstEmit {} +impl<I: MachInst + MachInstEmit> VCodeInst for I {} + +/// A function in "VCode" (virtualized-register code) form, after lowering. +/// This is essentially a standard CFG of basic blocks, where each basic block +/// consists of lowered instructions produced by the machine-specific backend. +pub struct VCode<I: VCodeInst> { + /// Function liveins. + liveins: RegallocSet<RealReg>, + + /// Function liveouts. + liveouts: RegallocSet<RealReg>, + + /// VReg IR-level types. + vreg_types: Vec<Type>, + + /// Do we have any ref values among our vregs? + have_ref_values: bool, + + /// Lowered machine instructions in order corresponding to the original IR. + insts: Vec<I>, + + /// Source locations for each instruction. (`SourceLoc` is a `u32`, so it is + /// reasonable to keep one of these per instruction.) + srclocs: Vec<SourceLoc>, + + /// Entry block. + entry: BlockIndex, + + /// Block instruction indices. + block_ranges: Vec<(InsnIndex, InsnIndex)>, + + /// Block successors: index range in the successor-list below. + block_succ_range: Vec<(usize, usize)>, + + /// Block successor lists, concatenated into one Vec. The `block_succ_range` + /// list of tuples above gives (start, end) ranges within this list that + /// correspond to each basic block's successors. + block_succs: Vec<BlockIx>, + + /// Block-order information. + block_order: BlockLoweringOrder, + + /// ABI object. + abi: Box<dyn ABICallee<I = I>>, + + /// Constant information used during code emission. This should be + /// immutable across function compilations within the same module. + emit_info: I::Info, + + /// Safepoint instruction indices. Filled in post-regalloc. (Prior to + /// regalloc, the safepoint instructions are listed in the separate + /// `StackmapRequestInfo` held separate from the `VCode`.) + safepoint_insns: Vec<InsnIndex>, + + /// For each safepoint entry in `safepoint_insns`, a list of `SpillSlot`s. + /// These are used to generate actual stack maps at emission. Filled in + /// post-regalloc. + safepoint_slots: Vec<Vec<SpillSlot>>, + + /// Ranges for prologue and epilogue instructions. 
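+    /// Filled in by `replace_insns_from_regalloc`: one range covering the
+    /// prologue at the start of the entry block, plus one range per inserted
+    /// epilogue (one for each return site). Consumed by `unwind_info()`.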
+ prologue_epilogue_ranges: Option<(InsnRange, Box<[InsnRange]>)>, + + /// Instruction end offsets + insts_layout: RefCell<(Vec<u32>, u32)>, + + /// Constants. + constants: VCodeConstants, +} + +/// A builder for a VCode function body. This builder is designed for the +/// lowering approach that we take: we traverse basic blocks in forward +/// (original IR) order, but within each basic block, we generate code from +/// bottom to top; and within each IR instruction that we visit in this reverse +/// order, we emit machine instructions in *forward* order again. +/// +/// Hence, to produce the final instructions in proper order, we perform two +/// swaps. First, the machine instructions (`I` instances) are produced in +/// forward order for an individual IR instruction. Then these are *reversed* +/// and concatenated to `bb_insns` at the end of the IR instruction lowering. +/// The `bb_insns` vec will thus contain all machine instructions for a basic +/// block, in reverse order. Finally, when we're done with a basic block, we +/// reverse the whole block's vec of instructions again, and concatenate onto +/// the VCode's insts. +pub struct VCodeBuilder<I: VCodeInst> { + /// In-progress VCode. + vcode: VCode<I>, + + /// In-progress stack map-request info. + stack_map_info: StackmapRequestInfo, + + /// Index of the last block-start in the vcode. + block_start: InsnIndex, + + /// Start of succs for the current block in the concatenated succs list. + succ_start: usize, + + /// Current source location. + cur_srcloc: SourceLoc, +} + +impl<I: VCodeInst> VCodeBuilder<I> { + /// Create a new VCodeBuilder. + pub fn new( + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + constants: VCodeConstants, + ) -> VCodeBuilder<I> { + let reftype_class = I::ref_type_regclass(abi.flags()); + let vcode = VCode::new(abi, emit_info, block_order, constants); + let stack_map_info = StackmapRequestInfo { + reftype_class, + reftyped_vregs: vec![], + safepoint_insns: vec![], + }; + + VCodeBuilder { + vcode, + stack_map_info, + block_start: 0, + succ_start: 0, + cur_srcloc: SourceLoc::default(), + } + } + + /// Access the ABI object. + pub fn abi(&mut self) -> &mut dyn ABICallee<I = I> { + &mut *self.vcode.abi + } + + /// Access to the BlockLoweringOrder object. + pub fn block_order(&self) -> &BlockLoweringOrder { + &self.vcode.block_order + } + + /// Set the type of a VReg. + pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) { + if self.vcode.vreg_types.len() <= vreg.get_index() { + self.vcode + .vreg_types + .resize(vreg.get_index() + 1, ir::types::I8); + } + self.vcode.vreg_types[vreg.get_index()] = ty; + if is_reftype(ty) { + self.stack_map_info.reftyped_vregs.push(vreg); + self.vcode.have_ref_values = true; + } + } + + /// Are there any reference-typed values at all among the vregs? + pub fn have_ref_values(&self) -> bool { + self.vcode.have_ref_values() + } + + /// Set the current block as the entry block. + pub fn set_entry(&mut self, block: BlockIndex) { + self.vcode.entry = block; + } + + /// End the current basic block. Must be called after emitting vcode insts + /// for IR insts and prior to ending the function (building the VCode). + pub fn end_bb(&mut self) { + let start_idx = self.block_start; + let end_idx = self.vcode.insts.len() as InsnIndex; + self.block_start = end_idx; + // Add the instruction index range to the list of blocks. + self.vcode.block_ranges.push((start_idx, end_idx)); + // End the successors list. 
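+        // (The successor edges themselves were appended to `block_succs` by
+        // `push()` as terminator instructions were emitted; here we only
+        // record which slice of that list belongs to this block.)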
+ let succ_end = self.vcode.block_succs.len(); + self.vcode + .block_succ_range + .push((self.succ_start, succ_end)); + self.succ_start = succ_end; + } + + /// Push an instruction for the current BB and current IR inst within the BB. + pub fn push(&mut self, insn: I, is_safepoint: bool) { + match insn.is_term() { + MachTerminator::None | MachTerminator::Ret => {} + MachTerminator::Uncond(target) => { + self.vcode.block_succs.push(BlockIx::new(target.get())); + } + MachTerminator::Cond(true_branch, false_branch) => { + self.vcode.block_succs.push(BlockIx::new(true_branch.get())); + self.vcode + .block_succs + .push(BlockIx::new(false_branch.get())); + } + MachTerminator::Indirect(targets) => { + for target in targets { + self.vcode.block_succs.push(BlockIx::new(target.get())); + } + } + } + self.vcode.insts.push(insn); + self.vcode.srclocs.push(self.cur_srcloc); + if is_safepoint { + self.stack_map_info + .safepoint_insns + .push(InstIx::new((self.vcode.insts.len() - 1) as u32)); + } + } + + /// Get the current source location. + pub fn get_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } + + /// Set the current source location. + pub fn set_srcloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } + + /// Access the constants. + pub fn constants(&mut self) -> &mut VCodeConstants { + &mut self.vcode.constants + } + + /// Build the final VCode, returning the vcode itself as well as auxiliary + /// information, such as the stack map request information. + pub fn build(self) -> (VCode<I>, StackmapRequestInfo) { + // TODO: come up with an abstraction for "vcode and auxiliary data". The + // auxiliary data needs to be separate from the vcode so that it can be + // referenced as the vcode is mutated (e.g. by the register allocator). + (self.vcode, self.stack_map_info) + } +} + +fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool { + if let Some((to, from)) = insn.is_move() { + to.to_reg() == from + } else { + false + } +} + +/// Is this type a reference type? +fn is_reftype(ty: Type) -> bool { + ty == types::R64 || ty == types::R32 +} + +impl<I: VCodeInst> VCode<I> { + /// New empty VCode. + fn new( + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + constants: VCodeConstants, + ) -> VCode<I> { + VCode { + liveins: abi.liveins(), + liveouts: abi.liveouts(), + vreg_types: vec![], + have_ref_values: false, + insts: vec![], + srclocs: vec![], + entry: 0, + block_ranges: vec![], + block_succ_range: vec![], + block_succs: vec![], + block_order, + abi, + emit_info, + safepoint_insns: vec![], + safepoint_slots: vec![], + prologue_epilogue_ranges: None, + insts_layout: RefCell::new((vec![], 0)), + constants, + } + } + + /// Returns the flags controlling this function's compilation. + pub fn flags(&self) -> &settings::Flags { + self.abi.flags() + } + + /// Get the IR-level type of a VReg. + pub fn vreg_type(&self, vreg: VirtualReg) -> Type { + self.vreg_types[vreg.get_index()] + } + + /// Are there any reference-typed values at all among the vregs? + pub fn have_ref_values(&self) -> bool { + self.have_ref_values + } + + /// Get the entry block. + pub fn entry(&self) -> BlockIndex { + self.entry + } + + /// Get the number of blocks. Block indices will be in the range `0 .. + /// (self.num_blocks() - 1)`. + pub fn num_blocks(&self) -> usize { + self.block_ranges.len() + } + + /// Stack frame size for the full function's body. + pub fn frame_size(&self) -> u32 { + self.abi.frame_size() + } + + /// Inbound stack-args size. 
+ pub fn stack_args_size(&self) -> u32 { + self.abi.stack_args_size() + } + + /// Get the successors for a block. + pub fn succs(&self, block: BlockIndex) -> &[BlockIx] { + let (start, end) = self.block_succ_range[block as usize]; + &self.block_succs[start..end] + } + + /// Take the results of register allocation, with a sequence of + /// instructions including spliced fill/reload/move instructions, and replace + /// the VCode with them. + pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult<Self>) { + // Record the spillslot count and clobbered registers for the ABI/stack + // setup code. + self.abi.set_num_spillslots(result.num_spill_slots as usize); + self.abi + .set_clobbered(result.clobbered_registers.map(|r| Writable::from_reg(*r))); + + let mut final_insns = vec![]; + let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; + let mut final_srclocs = vec![]; + let mut final_safepoint_insns = vec![]; + let mut safept_idx = 0; + + let mut prologue_start = None; + let mut prologue_end = None; + let mut epilogue_islands = vec![]; + + assert!(result.target_map.elems().len() == self.num_blocks()); + for block in 0..self.num_blocks() { + let start = result.target_map.elems()[block].get() as usize; + let end = if block == self.num_blocks() - 1 { + result.insns.len() + } else { + result.target_map.elems()[block + 1].get() as usize + }; + let block = block as BlockIndex; + let final_start = final_insns.len() as InsnIndex; + + if block == self.entry { + prologue_start = Some(final_insns.len() as InsnIndex); + // Start with the prologue. + let prologue = self.abi.gen_prologue(); + let len = prologue.len(); + final_insns.extend(prologue.into_iter()); + final_srclocs.extend(iter::repeat(SourceLoc::default()).take(len)); + prologue_end = Some(final_insns.len() as InsnIndex); + } + + for i in start..end { + let insn = &result.insns[i]; + + // Elide redundant moves at this point (we only know what is + // redundant once registers are allocated). + if is_redundant_move(insn) { + continue; + } + + // Is there a srcloc associated with this insn? Look it up based on original + // instruction index (if new insn corresponds to some original insn, i.e., is not + // an inserted load/spill/move). + let orig_iix = result.orig_insn_map[InstIx::new(i as u32)]; + let srcloc = if orig_iix.is_invalid() { + SourceLoc::default() + } else { + self.srclocs[orig_iix.get() as usize] + }; + + // Whenever encountering a return instruction, replace it + // with the epilogue. + let is_ret = insn.is_term() == MachTerminator::Ret; + if is_ret { + let epilogue_start = final_insns.len() as InsnIndex; + let epilogue = self.abi.gen_epilogue(); + let len = epilogue.len(); + final_insns.extend(epilogue.into_iter()); + final_srclocs.extend(iter::repeat(srcloc).take(len)); + epilogue_islands.push(epilogue_start..final_insns.len() as InsnIndex); + } else { + final_insns.push(insn.clone()); + final_srclocs.push(srcloc); + } + + // Was this instruction a safepoint instruction? Add its final + // index to the safepoint insn-index list if so. 
+ if safept_idx < result.new_safepoint_insns.len() + && (result.new_safepoint_insns[safept_idx].get() as usize) == i + { + let idx = final_insns.len() - 1; + final_safepoint_insns.push(idx as InsnIndex); + safept_idx += 1; + } + } + + let final_end = final_insns.len() as InsnIndex; + final_block_ranges[block as usize] = (final_start, final_end); + } + + debug_assert!(final_insns.len() == final_srclocs.len()); + + self.insts = final_insns; + self.srclocs = final_srclocs; + self.block_ranges = final_block_ranges; + self.safepoint_insns = final_safepoint_insns; + + // Save safepoint slot-lists. These will be passed to the `EmitState` + // for the machine backend during emission so that it can do + // target-specific translations of slot numbers to stack offsets. + self.safepoint_slots = result.stackmaps; + + self.prologue_epilogue_ranges = Some(( + prologue_start.unwrap()..prologue_end.unwrap(), + epilogue_islands.into_boxed_slice(), + )); + } + + /// Emit the instructions to a `MachBuffer`, containing fixed-up code and external + /// reloc/trap/etc. records ready for use. + pub fn emit(&self) -> MachBuffer<I> + where + I: MachInstEmit, + { + let _tt = timing::vcode_emit(); + let mut buffer = MachBuffer::new(); + let mut state = I::State::new(&*self.abi); + + // The first M MachLabels are reserved for block indices, the next N MachLabels for + // constants. + buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); + buffer.reserve_labels_for_constants(&self.constants); + + let mut insts_layout = vec![0; self.insts.len()]; + + let mut safepoint_idx = 0; + let mut cur_srcloc = None; + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let new_offset = I::align_basic_block(buffer.cur_offset()); + while new_offset > buffer.cur_offset() { + // Pad with NOPs up to the aligned block offset. + let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize); + nop.emit(&mut buffer, &self.emit_info, &mut Default::default()); + } + assert_eq!(buffer.cur_offset(), new_offset); + + let (start, end) = self.block_ranges[block as usize]; + buffer.bind_label(MachLabel::from_block(block)); + for iix in start..end { + let srcloc = self.srclocs[iix as usize]; + if cur_srcloc != Some(srcloc) { + if cur_srcloc.is_some() { + buffer.end_srcloc(); + } + buffer.start_srcloc(srcloc); + cur_srcloc = Some(srcloc); + } + state.pre_sourceloc(cur_srcloc.unwrap_or(SourceLoc::default())); + + if safepoint_idx < self.safepoint_insns.len() + && self.safepoint_insns[safepoint_idx] == iix + { + if self.safepoint_slots[safepoint_idx].len() > 0 { + let stack_map = self.abi.spillslots_to_stack_map( + &self.safepoint_slots[safepoint_idx][..], + &state, + ); + state.pre_safepoint(stack_map); + } + safepoint_idx += 1; + } + + self.insts[iix as usize].emit(&mut buffer, &self.emit_info, &mut state); + + insts_layout[iix as usize] = buffer.cur_offset(); + } + + if cur_srcloc.is_some() { + buffer.end_srcloc(); + cur_srcloc = None; + } + + // Do we need an island? Get the worst-case size of the next BB and see if, having + // emitted that many bytes, we will be beyond the deadline. + if block < (self.num_blocks() - 1) as BlockIndex { + let next_block = block + 1; + let next_block_range = self.block_ranges[next_block as usize]; + let next_block_size = next_block_range.1 - next_block_range.0; + let worst_case_next_bb = I::worst_case_size() * next_block_size; + if buffer.island_needed(worst_case_next_bb) { + buffer.emit_island(); + } + } + } + + // Emit the constants used by the function. 
+ for (constant, data) in self.constants.iter() { + let label = buffer.get_label_for_constant(constant); + buffer.defer_constant(label, data.alignment(), data.as_slice(), u32::max_value()); + } + + *self.insts_layout.borrow_mut() = (insts_layout, buffer.cur_offset()); + + buffer + } + + /// Generates unwind info. + pub fn unwind_info( + &self, + ) -> crate::result::CodegenResult<Option<crate::isa::unwind::input::UnwindInfo<Reg>>> { + let layout = &self.insts_layout.borrow(); + let (prologue, epilogues) = self.prologue_epilogue_ranges.as_ref().unwrap(); + let context = UnwindInfoContext { + insts: &self.insts, + insts_layout: &layout.0, + len: layout.1, + prologue: prologue.clone(), + epilogues, + }; + I::UnwindInfo::create_unwind_info(context) + } + + /// Get the IR block for a BlockIndex, if one exists. + pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> { + self.block_order.lowered_order()[block as usize].orig_block() + } +} + +impl<I: VCodeInst> RegallocFunction for VCode<I> { + type Inst = I; + + fn insns(&self) -> &[I] { + &self.insts[..] + } + + fn insns_mut(&mut self) -> &mut [I] { + &mut self.insts[..] + } + + fn get_insn(&self, insn: InstIx) -> &I { + &self.insts[insn.get() as usize] + } + + fn get_insn_mut(&mut self, insn: InstIx) -> &mut I { + &mut self.insts[insn.get() as usize] + } + + fn blocks(&self) -> Range<BlockIx> { + Range::new(BlockIx::new(0), self.block_ranges.len()) + } + + fn entry_block(&self) -> BlockIx { + BlockIx::new(self.entry) + } + + fn block_insns(&self, block: BlockIx) -> Range<InstIx> { + let (start, end) = self.block_ranges[block.get() as usize]; + Range::new(InstIx::new(start), (end - start) as usize) + } + + fn block_succs(&self, block: BlockIx) -> Cow<[BlockIx]> { + let (start, end) = self.block_succ_range[block.get() as usize]; + Cow::Borrowed(&self.block_succs[start..end]) + } + + fn is_ret(&self, insn: InstIx) -> bool { + match self.insts[insn.get() as usize].is_term() { + MachTerminator::Ret => true, + _ => false, + } + } + + fn is_included_in_clobbers(&self, insn: &I) -> bool { + insn.is_included_in_clobbers() + } + + fn get_regs(insn: &I, collector: &mut RegUsageCollector) { + insn.get_regs(collector) + } + + fn map_regs<RUM: RegUsageMapper>(insn: &mut I, mapper: &RUM) { + insn.map_regs(mapper); + } + + fn is_move(&self, insn: &I) -> Option<(Writable<Reg>, Reg)> { + insn.is_move() + } + + fn get_num_vregs(&self) -> usize { + self.vreg_types.len() + } + + fn get_spillslot_size(&self, regclass: RegClass, vreg: VirtualReg) -> u32 { + let ty = self.vreg_type(vreg); + self.abi.get_spillslot_size(regclass, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, vreg: Option<VirtualReg>) -> I { + let ty = vreg.map(|v| self.vreg_type(v)); + self.abi.gen_spill(to_slot, from_reg, ty) + } + + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + vreg: Option<VirtualReg>, + ) -> I { + let ty = vreg.map(|v| self.vreg_type(v)); + self.abi.gen_reload(to_reg, from_slot, ty) + } + + fn gen_move(&self, to_reg: Writable<RealReg>, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + I::gen_move(to_reg.map(|r| r.to_reg()), from_reg.to_reg(), ty) + } + + fn gen_zero_len_nop(&self) -> I { + I::gen_zero_len_nop() + } + + fn maybe_direct_reload(&self, insn: &I, reg: VirtualReg, slot: SpillSlot) -> Option<I> { + insn.maybe_direct_reload(reg, slot) + } + + fn func_liveins(&self) -> RegallocSet<RealReg> { + self.liveins.clone() + } + + fn func_liveouts(&self) -> RegallocSet<RealReg> { + 
self.liveouts.clone() + } +} + +impl<I: VCodeInst> fmt::Debug for VCode<I> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "VCode_Debug {{")?; + writeln!(f, " Entry block: {}", self.entry)?; + + for block in 0..self.num_blocks() { + writeln!(f, "Block {}:", block,)?; + for succ in self.succs(block as BlockIndex) { + writeln!(f, " (successor: Block {})", succ.get())?; + } + let (start, end) = self.block_ranges[block]; + writeln!(f, " (instruction range: {} .. {})", start, end)?; + for inst in start..end { + writeln!(f, " Inst {}: {:?}", inst, self.insts[inst as usize])?; + } + } + + writeln!(f, "}}")?; + Ok(()) + } +} + +/// Pretty-printing with `RealRegUniverse` context. +impl<I: VCodeInst> PrettyPrint for VCode<I> { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + use std::fmt::Write; + + let mut s = String::new(); + write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); + write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); + + let mut state = Default::default(); + let mut safepoint_idx = 0; + for i in 0..self.num_blocks() { + let block = i as BlockIndex; + + write!(&mut s, "Block {}:\n", block).unwrap(); + if let Some(bb) = self.bindex_to_bb(block) { + write!(&mut s, " (original IR block: {})\n", bb).unwrap(); + } + for succ in self.succs(block) { + write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap(); + } + let (start, end) = self.block_ranges[block as usize]; + write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); + for inst in start..end { + if safepoint_idx < self.safepoint_insns.len() + && self.safepoint_insns[safepoint_idx] == inst + { + write!( + &mut s, + " (safepoint: slots {:?} with EmitState {:?})\n", + self.safepoint_slots[safepoint_idx], state, + ) + .unwrap(); + safepoint_idx += 1; + } + write!( + &mut s, + " Inst {}: {}\n", + inst, + self.insts[inst as usize].pretty_print(mb_rru, &mut state) + ) + .unwrap(); + } + } + + write!(&mut s, "}}}}\n").unwrap(); + + s + } +} + +/// This structure tracks the large constants used in VCode that will be emitted separately by the +/// [MachBuffer]. +/// +/// First, during the lowering phase, constants are inserted using +/// [VCodeConstants.insert]; an intermediate handle, [VCodeConstant], tracks what constants are +/// used in this phase. Some deduplication is performed, when possible, as constant +/// values are inserted. +/// +/// Secondly, during the emission phase, the [MachBuffer] assigns [MachLabel]s for each of the +/// constants so that instructions can refer to the value's memory location. The [MachBuffer] +/// then writes the constant values to the buffer. +#[derive(Default)] +pub struct VCodeConstants { + constants: PrimaryMap<VCodeConstant, VCodeConstantData>, + pool_uses: HashMap<Constant, VCodeConstant>, + well_known_uses: HashMap<*const [u8], VCodeConstant>, +} +impl VCodeConstants { + /// Initialize the structure with the expected number of constants. + pub fn with_capacity(expected_num_constants: usize) -> Self { + Self { + constants: PrimaryMap::with_capacity(expected_num_constants), + pool_uses: HashMap::with_capacity(expected_num_constants), + well_known_uses: HashMap::new(), + } + } + + /// Insert a constant; using this method indicates that a constant value will be used and thus + /// will be emitted to the `MachBuffer`. The current implementation can deduplicate constants + /// that are [VCodeConstantData::Pool] or [VCodeConstantData::WellKnown] but not + /// [VCodeConstantData::Generated]. 
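+    ///
+    /// A hedged illustration of that deduplication (`BYTES` is a hypothetical
+    /// well-known constant and `constants` an existing `VCodeConstants`;
+    /// marked `ignore` since this is not a standalone doctest):
+    ///
+    /// ```ignore
+    /// static BYTES: &[u8] = &[0; 16];
+    /// let a = constants.insert(VCodeConstantData::WellKnown(BYTES));
+    /// let b = constants.insert(VCodeConstantData::WellKnown(BYTES));
+    /// assert_eq!(a, b); // identical well-known data maps to one handle
+    /// ```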
+ pub fn insert(&mut self, data: VCodeConstantData) -> VCodeConstant { + match data { + VCodeConstantData::Generated(_) => self.constants.push(data), + VCodeConstantData::Pool(constant, _) => match self.pool_uses.get(&constant) { + None => { + let vcode_constant = self.constants.push(data); + self.pool_uses.insert(constant, vcode_constant); + vcode_constant + } + Some(&vcode_constant) => vcode_constant, + }, + VCodeConstantData::WellKnown(data_ref) => { + match self.well_known_uses.get(&(data_ref as *const [u8])) { + None => { + let vcode_constant = self.constants.push(data); + self.well_known_uses + .insert(data_ref as *const [u8], vcode_constant); + vcode_constant + } + Some(&vcode_constant) => vcode_constant, + } + } + } + } + + /// Retrieve a byte slice for the given [VCodeConstant], if available. + pub fn get(&self, constant: VCodeConstant) -> Option<&[u8]> { + self.constants.get(constant).map(|d| d.as_slice()) + } + + /// Return the number of constants inserted. + pub fn len(&self) -> usize { + self.constants.len() + } + + /// Iterate over the [VCodeConstant] keys inserted in this structure. + pub fn keys(&self) -> Keys<VCodeConstant> { + self.constants.keys() + } + + /// Iterate over the [VCodeConstant] keys and the data (as a byte slice) inserted in this + /// structure. + pub fn iter(&self) -> impl Iterator<Item = (VCodeConstant, &VCodeConstantData)> { + self.constants.iter() + } +} + +/// A use of a constant by one or more VCode instructions; see [VCodeConstants]. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct VCodeConstant(u32); +entity_impl!(VCodeConstant); + +/// Identify the different types of constant that can be inserted into [VCodeConstants]. Tracking +/// these separately instead of as raw byte buffers allows us to avoid some duplication. +pub enum VCodeConstantData { + /// A constant already present in the Cranelift IR + /// [ConstantPool](crate::ir::constant::ConstantPool). + Pool(Constant, ConstantData), + /// A reference to a well-known constant value that is statically encoded within the compiler. + WellKnown(&'static [u8]), + /// A constant value generated during lowering; the value may depend on the instruction context + /// which makes it difficult to de-duplicate--if possible, use other variants. + Generated(ConstantData), +} +impl VCodeConstantData { + /// Retrieve the constant data as a byte slice. + pub fn as_slice(&self) -> &[u8] { + match self { + VCodeConstantData::Pool(_, d) | VCodeConstantData::Generated(d) => d.as_slice(), + VCodeConstantData::WellKnown(d) => d, + } + } + + /// Calculate the alignment of the constant data. + pub fn alignment(&self) -> u32 { + if self.as_slice().len() <= 8 { + 8 + } else { + 16 + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::mem::size_of; + + #[test] + fn size_of_constant_structs() { + assert_eq!(size_of::<Constant>(), 4); + assert_eq!(size_of::<VCodeConstant>(), 4); + assert_eq!(size_of::<ConstantData>(), 24); + assert_eq!(size_of::<VCodeConstantData>(), 32); + assert_eq!( + size_of::<PrimaryMap<VCodeConstant, VCodeConstantData>>(), + 24 + ); + // TODO The VCodeConstants structure's memory size could be further optimized. + // With certain versions of Rust, each `HashMap` in `VCodeConstants` occupied at + // least 48 bytes, making an empty `VCodeConstants` cost 120 bytes. + } +} |