Diffstat (limited to 'third_party/rust/cranelift-codegen/src/isa')
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs  850
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs  728
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs  2359
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs  5143
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs  1025
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs  4057
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs  351
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs  201
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs  158
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs  1196
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs  3409
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs  274
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs  471
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs  335
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs  829
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs  1959
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs  1358
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs  128
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs  14
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs  240
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs  608
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs  123
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/call_conv.rs  106
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/constraints.rs  207
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/enc_tables.rs  292
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/encoding.rs  163
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/mod.rs  447
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/registers.rs  360
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs  149
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs  182
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs  18
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs  295
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs  50
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs  56
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/stack.rs  95
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/test_utils.rs  86
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/unwind.rs  88
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs  313
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs  294
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/abi.rs  794
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs  1215
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs  2819
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs  3593
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs  2733
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs  289
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs  125
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs  204
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/lower.rs  3771
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/mod.rs  149
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/settings.rs  9
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/abi.rs  1093
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs  576
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs  1922
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/mod.rs  190
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/registers.rs  86
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/settings.rs  52
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs  535
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs  234
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs  268
59 files changed, 49674 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs
new file mode 100644
index 0000000000..dfb7db4dbf
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs
@@ -0,0 +1,850 @@
+//! Implementation of a standard AArch64 ABI.
+
+use crate::ir;
+use crate::ir::types;
+use crate::ir::types::*;
+use crate::ir::MemFlags;
+use crate::isa;
+use crate::isa::aarch64::{inst::EmitState, inst::*};
+use crate::machinst::*;
+use crate::settings;
+use crate::{CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use regalloc::{RealReg, Reg, RegClass, Set, Writable};
+use smallvec::SmallVec;
+
+// We use a generic implementation that factors out AArch64 and x64 ABI commonalities, because
+// these ABIs are very similar.
+
+/// Support for the AArch64 ABI from the callee side (within a function body).
+pub(crate) type AArch64ABICallee = ABICalleeImpl<AArch64MachineDeps>;
+
+/// Support for the AArch64 ABI from the caller side (at a callsite).
+pub(crate) type AArch64ABICaller = ABICallerImpl<AArch64MachineDeps>;
+
+// SpiderMonkey-specific ABI conventions.
+
+/// This is SpiderMonkey's `WasmTableCallSigReg`.
+static BALDRDASH_SIG_REG: u8 = 10;
+
+/// This is SpiderMonkey's `WasmTlsReg`.
+static BALDRDASH_TLS_REG: u8 = 23;
+
+/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0;
+/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8;
+
+// These two lists represent the registers the JIT may *not* use at any point in generated code.
+//
+// So these are callee-preserved from the JIT's point of view, and every register not in this list
+// has to be caller-preserved by definition.
+//
+// Keep these lists in sync with the NonAllocatableMask set in Spidermonkey's
+// Architecture-arm64.cpp.
+
+// Indexed by physical register number.
+#[rustfmt::skip]
+static BALDRDASH_JIT_CALLEE_SAVED_GPR: &[bool] = &[
+ /* 0 = */ false, false, false, false, false, false, false, false,
+ /* 8 = */ false, false, false, false, false, false, false, false,
+ /* 16 = */ true /* x16 / ip1 */, true /* x17 / ip2 */, true /* x18 / TLS */, false,
+ /* 20 = */ false, false, false, false,
+ /* 24 = */ false, false, false, false,
+ // x28, the pseudo stack pointer, should also be in this list; however, the wasm stubs
+ // currently trash it.
+ /* 28 = */ false, false, true /* x30 = FP */, false /* x31 = SP */
+];
+
+#[rustfmt::skip]
+static BALDRDASH_JIT_CALLEE_SAVED_FPU: &[bool] = &[
+ /* 0 = */ false, false, false, false, false, false, false, false,
+ /* 8 = */ false, false, false, false, false, false, false, false,
+ /* 16 = */ false, false, false, false, false, false, false, false,
+ /* 24 = */ false, false, false, false, false, false, false, true /* v31 / d31 */
+];
+
+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
+/// Try to fill a Baldrdash register, returning it if it was found.
+fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Option<ABIArg> {
+ if call_conv.extends_baldrdash() {
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext => {
+ // This is SpiderMonkey's `WasmTlsReg`.
+ Some(ABIArg::Reg(
+ xreg(BALDRDASH_TLS_REG).to_real_reg(),
+ ir::types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::SignatureId => {
+ // This is SpiderMonkey's `WasmTableCallSigReg`.
+ Some(ABIArg::Reg(
+ xreg(BALDRDASH_SIG_REG).to_real_reg(),
+ ir::types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CalleeTLS => {
+ // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLEE_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CallerTLS => {
+ // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLER_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ _ => None,
+ }
+ } else {
+ None
+ }
+}
+
+impl Into<AMode> for StackAMode {
+ fn into(self) -> AMode {
+ match self {
+ StackAMode::FPOffset(off, ty) => AMode::FPOffset(off, ty),
+ StackAMode::NominalSPOffset(off, ty) => AMode::NominalSPOffset(off, ty),
+ StackAMode::SPOffset(off, ty) => AMode::SPOffset(off, ty),
+ }
+ }
+}
+
+// Returns the sizes of the stack space needed to save the clobbered
+// integer (`int_reg`) and vector (`vec_reg`) registers, respectively.
+fn saved_reg_stack_size(
+ int_reg: &[Writable<RealReg>],
+ vec_reg: &[Writable<RealReg>],
+) -> (usize, usize) {
+ // Round up to multiple of 2, to keep 16-byte stack alignment.
+ let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8;
+ let vec_save_bytes = vec_reg.len() * 16;
+ (int_save_bytes, vec_save_bytes)
+}
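
As a quick, self-contained check of the rounding above (hypothetical helper name, not part of the patch): an odd number of 8-byte integer saves is padded up to an even count so the 16-byte stack alignment is preserved.

fn int_save_bytes(n_int_regs: usize) -> usize {
    // Pad an odd register count to an even one: each register takes 8 bytes,
    // so an even count keeps the 16-byte alignment.
    (n_int_regs + (n_int_regs & 1)) * 8
}

fn main() {
    assert_eq!(int_save_bytes(3), 32); // 3 regs padded to 4 slots of 8 bytes
    assert_eq!(int_save_bytes(4), 32);
    assert_eq!(int_save_bytes(5), 48);
}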
+
+/// AArch64-specific ABI behavior. This struct just serves as an implementation
+/// point for the trait; it is never actually instantiated.
+pub(crate) struct AArch64MachineDeps;
+
+impl ABIMachineSpec for AArch64MachineDeps {
+ type I = Inst;
+
+ fn word_bits() -> u32 {
+ 64
+ }
+
+ /// Return required stack alignment in bytes.
+ fn stack_align(_call_conv: isa::CallConv) -> u32 {
+ 16
+ }
+
+ fn compute_arg_locs(
+ call_conv: isa::CallConv,
+ params: &[ir::AbiParam],
+ args_or_rets: ArgsOrRets,
+ add_ret_area_ptr: bool,
+ ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+ let is_baldrdash = call_conv.extends_baldrdash();
+ let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
+
+ // See the AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), section 5.4.
+ let mut next_xreg = 0;
+ let mut next_vreg = 0;
+ let mut next_stack: u64 = 0;
+ let mut ret = vec![];
+
+ if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls {
+ // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and
+ // caller TLS-register values, respectively.
+ next_stack = 16;
+ }
+
+ // Note on return values: in the regular (non-baldrdash) ABI, we may return values in up
+ // to 8 registers of each class (I64 and V128), independently of how many values are
+ // returned in the other class. That is, we can return values in up to 8 integer and 8
+ // vector registers at once.
+ // In Baldrdash, only a single register may be used for return values across all register
+ // classes: we cannot return values in both an integer and a vector register; at most one
+ // return value may be in a register.
+
+ let (max_per_class_reg_vals, mut remaining_reg_vals) = match (args_or_rets, is_baldrdash) {
+ (ArgsOrRets::Args, _) => (8, 16), // x0-x7 and v0-v7
+ (ArgsOrRets::Rets, false) => (8, 16), // x0-x7 and v0-v7
+ (ArgsOrRets::Rets, true) => (1, 1), // x0 or v0, but not both
+ };
+
+ for i in 0..params.len() {
+ // Process returns backward, according to the SpiderMonkey ABI (which we
+ // adopt internally if `is_baldrdash` is set).
+ let param = match (args_or_rets, is_baldrdash) {
+ (ArgsOrRets::Args, _) => &params[i],
+ (ArgsOrRets::Rets, false) => &params[i],
+ (ArgsOrRets::Rets, true) => &params[params.len() - 1 - i],
+ };
+
+ // Validate "purpose".
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext
+ | &ir::ArgumentPurpose::Normal
+ | &ir::ArgumentPurpose::StackLimit
+ | &ir::ArgumentPurpose::SignatureId
+ | &ir::ArgumentPurpose::CallerTLS
+ | &ir::ArgumentPurpose::CalleeTLS => {}
+ _ => panic!(
+ "Unsupported argument purpose {:?} in signature: {:?}",
+ param.purpose, params
+ ),
+ }
+
+ assert!(
+ legal_type_for_machine(param.value_type),
+ "Invalid type for AArch64: {:?}",
+ param.value_type
+ );
+ let rc = Inst::rc_for_type(param.value_type).unwrap();
+
+ let next_reg = match rc {
+ RegClass::I64 => &mut next_xreg,
+ RegClass::V128 => &mut next_vreg,
+ _ => panic!("Invalid register class: {:?}", rc),
+ };
+
+ if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
+ assert!(rc == RegClass::I64);
+ ret.push(param);
+ } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
+ let reg = match rc {
+ RegClass::I64 => xreg(*next_reg),
+ RegClass::V128 => vreg(*next_reg),
+ _ => unreachable!(),
+ };
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ *next_reg += 1;
+ remaining_reg_vals -= 1;
+ } else {
+ // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
+ // stack alignment happens separately after all args.)
+ let size = (ty_bits(param.value_type) / 8) as u64;
+ let size = std::cmp::max(size, 8);
+ // Align.
+ debug_assert!(size.is_power_of_two());
+ next_stack = (next_stack + size - 1) & !(size - 1);
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ next_stack += size;
+ }
+ }
+
+ if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
+ ret.reverse();
+ }
+
+ let extra_arg = if add_ret_area_ptr {
+ debug_assert!(args_or_rets == ArgsOrRets::Args);
+ if next_xreg < max_per_class_reg_vals && remaining_reg_vals > 0 {
+ ret.push(ABIArg::Reg(
+ xreg(next_xreg).to_real_reg(),
+ I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ } else {
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ next_stack += 8;
+ }
+ Some(ret.len() - 1)
+ } else {
+ None
+ };
+
+ next_stack = (next_stack + 15) & !15;
+
+ // To avoid overflow issues, limit the arg/return size to something
+ // reasonable -- here, 128 MB.
+ if next_stack > STACK_ARG_RET_SIZE_LIMIT {
+ return Err(CodegenError::ImplLimitExceeded);
+ }
+
+ Ok((ret, next_stack as i64, extra_arg))
+ }
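
As a rough illustration of the assignment loop above, here is a self-contained sketch (hypothetical `Loc` type and `assign` helper; it ignores the Baldrdash cases, the shared register-count cap, and extension/purpose tracking): integer values take x0-x7, vector values take v0-v7, and anything left over gets a stack slot of at least 8 bytes, aligned to its size.

#[derive(Debug, PartialEq)]
enum Loc { X(u8), V(u8), Stack(u64) }

fn assign(params: &[(bool /* is_vector */, u64 /* size in bytes */)]) -> Vec<Loc> {
    let (mut next_x, mut next_v, mut next_stack) = (0u8, 0u8, 0u64);
    params
        .iter()
        .map(|&(is_vec, size)| {
            if !is_vec && next_x < 8 {
                next_x += 1;
                Loc::X(next_x - 1)
            } else if is_vec && next_v < 8 {
                next_v += 1;
                Loc::V(next_v - 1)
            } else {
                // Stack args occupy at least 8 bytes and are aligned to their size.
                let size = size.max(8);
                next_stack = (next_stack + size - 1) & !(size - 1);
                let off = next_stack;
                next_stack += size;
                Loc::Stack(off)
            }
        })
        .collect()
}

fn main() {
    // Nine I64 params: the first eight land in x0..x7, the ninth at stack offset 0.
    let locs = assign(&[(false, 8); 9]);
    assert_eq!(locs[7], Loc::X(7));
    assert_eq!(locs[8], Loc::Stack(0));
}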
+
+ fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 {
+ if call_conv.extends_baldrdash() {
+ let num_words = flags.baldrdash_prologue_words() as i64;
+ debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words");
+ debug_assert_eq!(num_words % 2, 0, "stack must be 16-aligned");
+ num_words * 8
+ } else {
+ 16 // frame pointer + return address.
+ }
+ }
+
+ fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Inst {
+ Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted())
+ }
+
+ fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst {
+ Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted())
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
+ Inst::gen_move(to_reg, from_reg, ty)
+ }
+
+ fn gen_extend(
+ to_reg: Writable<Reg>,
+ from_reg: Reg,
+ signed: bool,
+ from_bits: u8,
+ to_bits: u8,
+ ) -> Inst {
+ assert!(from_bits < to_bits);
+ Inst::Extend {
+ rd: to_reg,
+ rn: from_reg,
+ signed,
+ from_bits,
+ to_bits,
+ }
+ }
+
+ fn gen_ret() -> Inst {
+ Inst::Ret
+ }
+
+ fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> {
+ let imm = imm as u64;
+ let mut insts = SmallVec::new();
+ if let Some(imm12) = Imm12::maybe_from_u64(imm) {
+ insts.push(Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd: into_reg,
+ rn: from_reg,
+ imm12,
+ });
+ } else {
+ let scratch2 = writable_tmp2_reg();
+ assert_ne!(scratch2.to_reg(), from_reg);
+ insts.extend(Inst::load_constant(scratch2, imm.into()));
+ insts.push(Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd: into_reg,
+ rn: from_reg,
+ rm: scratch2.to_reg(),
+ extendop: ExtendOp::UXTX,
+ });
+ }
+ insts
+ }
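
The branch above hinges on whether the constant fits AArch64's 12-bit (optionally LSL #12) arithmetic immediate. A small sketch of that test, assuming `Imm12::maybe_from_u64` accepts exactly the values below (hypothetical helper, not the real `Imm12` API):

// Accept a plain 12-bit value, or a 12-bit value shifted left by 12.
fn fits_add_imm(imm: u64) -> bool {
    imm < 0x1000 || (imm & 0xfff == 0 && imm <= 0xfff_000)
}

fn main() {
    assert!(fits_add_imm(4095));     // fits directly
    assert!(fits_add_imm(0x5000));   // 5 << 12, uses the shifted form
    assert!(!fits_add_imm(0x12345)); // falls back to load_constant + AluRRRExtend
}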
+
+ fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> {
+ let mut insts = SmallVec::new();
+ insts.push(Inst::AluRRRExtend {
+ alu_op: ALUOp::SubS64,
+ rd: writable_zero_reg(),
+ rn: stack_reg(),
+ rm: limit_reg,
+ extendop: ExtendOp::UXTX,
+ });
+ insts.push(Inst::TrapIf {
+ trap_code: ir::TrapCode::StackOverflow,
+ // Here `Lo` == "less than" when interpreting the two
+ // operands as unsigned integers.
+ kind: CondBrKind::Cond(Cond::Lo),
+ });
+ insts
+ }
+
+ fn gen_epilogue_placeholder() -> Inst {
+ Inst::EpiloguePlaceholder
+ }
+
+ fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst {
+ let mem = mem.into();
+ Inst::LoadAddr { rd: into_reg, mem }
+ }
+
+ fn get_stacklimit_reg() -> Reg {
+ spilltmp_reg()
+ }
+
+ fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Inst {
+ let mem = AMode::RegOffset(base, offset as i64, ty);
+ Inst::gen_load(into_reg, mem, ty, MemFlags::trusted())
+ }
+
+ fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst {
+ let mem = AMode::RegOffset(base, offset as i64, ty);
+ Inst::gen_store(mem, from_reg, ty, MemFlags::trusted())
+ }
+
+ fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> {
+ if amount == 0 {
+ return SmallVec::new();
+ }
+
+ let (amount, is_sub) = if amount > 0 {
+ (amount as u64, false)
+ } else {
+ (-amount as u64, true)
+ };
+
+ let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 };
+
+ let mut ret = SmallVec::new();
+ if let Some(imm12) = Imm12::maybe_from_u64(amount) {
+ let adj_inst = Inst::AluRRImm12 {
+ alu_op,
+ rd: writable_stack_reg(),
+ rn: stack_reg(),
+ imm12,
+ };
+ ret.push(adj_inst);
+ } else {
+ let tmp = writable_spilltmp_reg();
+ let const_inst = Inst::load_constant(tmp, amount);
+ let adj_inst = Inst::AluRRRExtend {
+ alu_op,
+ rd: writable_stack_reg(),
+ rn: stack_reg(),
+ rm: tmp.to_reg(),
+ extendop: ExtendOp::UXTX,
+ };
+ ret.extend(const_inst);
+ ret.push(adj_inst);
+ }
+ ret
+ }
+
+ fn gen_nominal_sp_adj(offset: i32) -> Inst {
+ Inst::VirtualSPOffsetAdj {
+ offset: offset as i64,
+ }
+ }
+
+ fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> {
+ let mut insts = SmallVec::new();
+ // stp fp (x29), lr (x30), [sp, #-16]!
+ insts.push(Inst::StoreP64 {
+ rt: fp_reg(),
+ rt2: link_reg(),
+ mem: PairAMode::PreIndexed(
+ writable_stack_reg(),
+ SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+ // mov fp (x29), sp. This uses the `ADD rd, rn, #0` form of `MOV` because
+ // the usual encoding (an alias of `ORR`) does not work with SP.
+ insts.push(Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd: writable_fp_reg(),
+ rn: stack_reg(),
+ imm12: Imm12 {
+ bits: 0,
+ shift12: false,
+ },
+ });
+ insts
+ }
+
+ fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> {
+ let mut insts = SmallVec::new();
+
+ // MOV (alias of ORR) interprets x31 as XZR, so use an ADD here.
+ // MOV to SP is an alias of ADD.
+ insts.push(Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd: writable_stack_reg(),
+ rn: fp_reg(),
+ imm12: Imm12 {
+ bits: 0,
+ shift12: false,
+ },
+ });
+ insts.push(Inst::LoadP64 {
+ rt: writable_fp_reg(),
+ rt2: writable_link_reg(),
+ mem: PairAMode::PostIndexed(
+ writable_stack_reg(),
+ SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+
+ insts
+ }
+
+ // Returns stack bytes used as well as instructions. Does not adjust
+ // nominal SP offset; abi_impl generic code will do that.
+ fn gen_clobber_save(
+ call_conv: isa::CallConv,
+ _: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> (u64, SmallVec<[Inst; 16]>) {
+ let mut insts = SmallVec::new();
+ let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers);
+
+ let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
+ let total_save_bytes = (vec_save_bytes + int_save_bytes) as i32;
+ insts.extend(Self::gen_sp_reg_adjust(
+ -(total_save_bytes + fixed_frame_storage_size as i32),
+ ));
+
+ for (i, reg_pair) in clobbered_int.chunks(2).enumerate() {
+ let (r1, r2) = if reg_pair.len() == 2 {
+ // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
+ (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg())
+ } else {
+ (reg_pair[0].to_reg().to_reg(), zero_reg())
+ };
+
+ debug_assert!(r1.get_class() == RegClass::I64);
+ debug_assert!(r2.get_class() == RegClass::I64);
+
+ // stp r1, r2, [sp, #(i * #16)]
+ insts.push(Inst::StoreP64 {
+ rt: r1,
+ rt2: r2,
+ mem: PairAMode::SignedOffset(
+ stack_reg(),
+ SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+ }
+
+ let vec_offset = int_save_bytes;
+ for (i, reg) in clobbered_vec.iter().enumerate() {
+ insts.push(Inst::FpuStore128 {
+ rd: reg.to_reg().to_reg(),
+ mem: AMode::Unscaled(
+ stack_reg(),
+ SImm9::maybe_from_i64((vec_offset + (i * 16)) as i64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+ }
+
+ (total_save_bytes as u64, insts)
+ }
+
+ fn gen_clobber_restore(
+ call_conv: isa::CallConv,
+ flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ _fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> SmallVec<[Inst; 16]> {
+ let mut insts = SmallVec::new();
+ let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers);
+
+ let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
+ for (i, reg_pair) in clobbered_int.chunks(2).enumerate() {
+ let (r1, r2) = if reg_pair.len() == 2 {
+ (
+ reg_pair[0].map(|r| r.to_reg()),
+ reg_pair[1].map(|r| r.to_reg()),
+ )
+ } else {
+ (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg())
+ };
+
+ debug_assert!(r1.to_reg().get_class() == RegClass::I64);
+ debug_assert!(r2.to_reg().get_class() == RegClass::I64);
+
+ // ldp r1, r2, [sp, #(i * 16)]
+ insts.push(Inst::LoadP64 {
+ rt: r1,
+ rt2: r2,
+ mem: PairAMode::SignedOffset(
+ stack_reg(),
+ SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+ }
+
+ for (i, reg) in clobbered_vec.iter().enumerate() {
+ insts.push(Inst::FpuLoad128 {
+ rd: Writable::from_reg(reg.to_reg().to_reg()),
+ mem: AMode::Unscaled(
+ stack_reg(),
+ SImm9::maybe_from_i64(((i * 16) + int_save_bytes) as i64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ });
+ }
+
+ // For non-baldrdash calling conventions, the frame pointer
+ // will be moved into the stack pointer in the epilogue, so we
+ // can skip restoring the stack pointer value with this `add`.
+ if call_conv.extends_baldrdash() {
+ let total_save_bytes = (int_save_bytes + vec_save_bytes) as i32;
+ insts.extend(Self::gen_sp_reg_adjust(total_save_bytes));
+ }
+
+ // If this is Baldrdash-2020, restore the callee (i.e., our) TLS
+ // register. We may have allocated it for something else and clobbered
+ // it, but the ABI expects us to leave the TLS register unchanged.
+ if call_conv == isa::CallConv::Baldrdash2020 {
+ let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags);
+ insts.push(Inst::gen_load(
+ writable_xreg(BALDRDASH_TLS_REG),
+ AMode::UnsignedOffset(fp_reg(), UImm12Scaled::maybe_from_i64(off, I64).unwrap()),
+ I64,
+ MemFlags::trusted(),
+ ));
+ }
+
+ insts
+ }
+
+ fn gen_call(
+ dest: &CallDest,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: ir::Opcode,
+ tmp: Writable<Reg>,
+ callee_conv: isa::CallConv,
+ caller_conv: isa::CallConv,
+ ) -> SmallVec<[(InstIsSafepoint, Inst); 2]> {
+ let mut insts = SmallVec::new();
+ match &dest {
+ &CallDest::ExtName(ref name, RelocDistance::Near) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::Call {
+ info: Box::new(CallInfo {
+ dest: name.clone(),
+ uses,
+ defs,
+ opcode,
+ caller_callconv: caller_conv,
+ callee_callconv: callee_conv,
+ }),
+ },
+ )),
+ &CallDest::ExtName(ref name, RelocDistance::Far) => {
+ insts.push((
+ InstIsSafepoint::No,
+ Inst::LoadExtName {
+ rd: tmp,
+ name: Box::new(name.clone()),
+ offset: 0,
+ },
+ ));
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rn: tmp.to_reg(),
+ uses,
+ defs,
+ opcode,
+ caller_callconv: caller_conv,
+ callee_callconv: callee_conv,
+ }),
+ },
+ ));
+ }
+ &CallDest::Reg(reg) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rn: *reg,
+ uses,
+ defs,
+ opcode,
+ caller_callconv: caller_conv,
+ callee_callconv: callee_conv,
+ }),
+ },
+ )),
+ }
+
+ insts
+ }
+
+ fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 {
+ // We allocate in terms of 8-byte slots.
+ match (rc, ty) {
+ (RegClass::I64, _) => 1,
+ (RegClass::V128, F32) | (RegClass::V128, F64) => 1,
+ (RegClass::V128, _) => 2,
+ _ => panic!("Unexpected register class!"),
+ }
+ }
+
+ /// Get the current virtual-SP offset from an instruction-emission state.
+ fn get_virtual_sp_offset_from_state(s: &EmitState) -> i64 {
+ s.virtual_sp_offset
+ }
+
+ /// Get the nominal-SP-to-FP offset from an instruction-emission state.
+ fn get_nominal_sp_to_fp(s: &EmitState) -> i64 {
+ s.nominal_sp_to_fp
+ }
+
+ fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> {
+ let mut caller_saved = Vec::new();
+ for i in 0..29 {
+ let x = writable_xreg(i);
+ if is_reg_clobbered_by_call(call_conv_of_callee, x.to_reg().to_real_reg()) {
+ caller_saved.push(x);
+ }
+ }
+ for i in 0..32 {
+ let v = writable_vreg(i);
+ if is_reg_clobbered_by_call(call_conv_of_callee, v.to_reg().to_real_reg()) {
+ caller_saved.push(v);
+ }
+ }
+ caller_saved
+ }
+}
+
+/// Is this type legal for this machine? E.g., references of the
+/// wrong width are invalid.
+fn legal_type_for_machine(ty: Type) -> bool {
+ match ty {
+ R32 => false,
+ _ => true,
+ }
+}
+
+/// Is the given register saved in the prologue if clobbered, i.e., is it a
+/// callee-save?
+fn is_reg_saved_in_prologue(call_conv: isa::CallConv, r: RealReg) -> bool {
+ if call_conv.extends_baldrdash() {
+ match r.get_class() {
+ RegClass::I64 => {
+ let enc = r.get_hw_encoding();
+ return BALDRDASH_JIT_CALLEE_SAVED_GPR[enc];
+ }
+ RegClass::V128 => {
+ let enc = r.get_hw_encoding();
+ return BALDRDASH_JIT_CALLEE_SAVED_FPU[enc];
+ }
+ _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"),
+ };
+ }
+
+ match r.get_class() {
+ RegClass::I64 => {
+ // x19 - x28 inclusive are callee-saves.
+ r.get_hw_encoding() >= 19 && r.get_hw_encoding() <= 28
+ }
+ RegClass::V128 => {
+ // v8 - v15 inclusive are callee-saves.
+ r.get_hw_encoding() >= 8 && r.get_hw_encoding() <= 15
+ }
+ _ => panic!("Unexpected RegClass"),
+ }
+}
+
+/// Return the set of all integer and vector registers that must be saved in the
+/// prologue and restored in the epilogue, given the set of all registers
+/// written by the function's body.
+fn get_regs_saved_in_prologue(
+ call_conv: isa::CallConv,
+ regs: &Set<Writable<RealReg>>,
+) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) {
+ let mut int_saves = vec![];
+ let mut vec_saves = vec![];
+ for &reg in regs.iter() {
+ if is_reg_saved_in_prologue(call_conv, reg.to_reg()) {
+ match reg.to_reg().get_class() {
+ RegClass::I64 => int_saves.push(reg),
+ RegClass::V128 => vec_saves.push(reg),
+ _ => panic!("Unexpected RegClass"),
+ }
+ }
+ }
+ // Sort registers for deterministic code output. We can do an unstable sort because the
+ // registers will be unique (there are no dups).
+ int_saves.sort_unstable_by_key(|r| r.to_reg().get_index());
+ vec_saves.sort_unstable_by_key(|r| r.to_reg().get_index());
+ (int_saves, vec_saves)
+}
+
+fn is_reg_clobbered_by_call(call_conv_of_callee: isa::CallConv, r: RealReg) -> bool {
+ if call_conv_of_callee.extends_baldrdash() {
+ match r.get_class() {
+ RegClass::I64 => {
+ let enc = r.get_hw_encoding();
+ if !BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] {
+ return true;
+ }
+ // Otherwise, fall through to the native ABI's caller-saved rules below.
+ }
+ RegClass::V128 => {
+ let enc = r.get_hw_encoding();
+ if !BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] {
+ return true;
+ }
+ // Otherwise, fall through to the native ABI's caller-saved rules below.
+ }
+ _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"),
+ };
+ }
+
+ match r.get_class() {
+ RegClass::I64 => {
+ // x0 - x17 inclusive are caller-saves.
+ r.get_hw_encoding() <= 17
+ }
+ RegClass::V128 => {
+ // v0 - v7 inclusive and v16 - v31 inclusive are caller-saves. The
+ // upper 64 bits of v8 - v15 inclusive are also caller-saves.
+ // However, because we cannot currently represent partial registers
+ // to regalloc.rs, we indicate here that every vector register is
+ // caller-save. Because this function is used at *callsites*,
+ // approximating in this direction (save more than necessary) is
+ // conservative and thus safe.
+ //
+ // Note that we set the 'not included in clobber set' flag in the
+ // regalloc.rs API when a call instruction's callee has the same ABI
+ // as the caller (the current function body); this is safe (anything
+ // clobbered by callee can be clobbered by caller as well) and
+ // avoids unnecessary saves of v8-v15 in the prologue even though we
+ // include them as defs here.
+ true
+ }
+ _ => panic!("Unexpected RegClass"),
+ }
+}
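
To summarize the two predicates above for the default (non-Baldrdash) convention: x19-x28 are saved in the prologue if clobbered, x0-x17 are treated as clobbered at call sites, and every vector register is conservatively treated as call-clobbered. A self-contained restatement for quick reference (hypothetical helper names, not part of the patch):

fn default_callee_saved_gpr(n: u8) -> bool { (19..=28).contains(&n) }
fn default_call_clobbered_gpr(n: u8) -> bool { n <= 17 }

fn main() {
    assert!(default_callee_saved_gpr(19) && default_callee_saved_gpr(28));
    assert!(default_call_clobbered_gpr(0) && default_call_clobbered_gpr(17));
    // x18 (commonly reserved as a platform register) falls in neither set here,
    // as do x29 (FP) and x30 (LR), which the prologue/epilogue handle separately.
    assert!(!default_callee_saved_gpr(18) && !default_call_clobbered_gpr(18));
}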
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs
new file mode 100644
index 0000000000..7bd181c86b
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs
@@ -0,0 +1,728 @@
+//! AArch64 ISA definitions: instruction arguments.
+
+// Some variants are never constructed, but we still want them as options in the future.
+#![allow(dead_code)]
+
+use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8};
+use crate::ir::Type;
+use crate::isa::aarch64::inst::*;
+use crate::machinst::{ty_bits, MachLabel};
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg, Writable};
+
+use core::convert::Into;
+use std::string::String;
+
+//=============================================================================
+// Instruction sub-components: shift and extend descriptors
+
+/// A shift operator for a register or immediate.
+#[derive(Clone, Copy, Debug)]
+#[repr(u8)]
+pub enum ShiftOp {
+ LSL = 0b00,
+ LSR = 0b01,
+ ASR = 0b10,
+ ROR = 0b11,
+}
+
+impl ShiftOp {
+ /// Get the encoding of this shift op.
+ pub fn bits(self) -> u8 {
+ self as u8
+ }
+}
+
+/// A shift operator amount.
+#[derive(Clone, Copy, Debug)]
+pub struct ShiftOpShiftImm(u8);
+
+impl ShiftOpShiftImm {
+ /// Maximum shift for shifted-register operands.
+ pub const MAX_SHIFT: u64 = 63;
+
+ /// Create a new shiftop shift amount, if possible.
+ pub fn maybe_from_shift(shift: u64) -> Option<ShiftOpShiftImm> {
+ if shift <= Self::MAX_SHIFT {
+ Some(ShiftOpShiftImm(shift as u8))
+ } else {
+ None
+ }
+ }
+
+ /// Return the shift amount.
+ pub fn value(self) -> u8 {
+ self.0
+ }
+
+ /// Mask down to a given number of bits.
+ pub fn mask(self, bits: u8) -> ShiftOpShiftImm {
+ ShiftOpShiftImm(self.0 & (bits - 1))
+ }
+}
+
+/// A shift operator with an amount, guaranteed to be within range.
+#[derive(Clone, Debug)]
+pub struct ShiftOpAndAmt {
+ op: ShiftOp,
+ shift: ShiftOpShiftImm,
+}
+
+impl ShiftOpAndAmt {
+ pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt {
+ ShiftOpAndAmt { op, shift }
+ }
+
+ /// Get the shift op.
+ pub fn op(&self) -> ShiftOp {
+ self.op
+ }
+
+ /// Get the shift amount.
+ pub fn amt(&self) -> ShiftOpShiftImm {
+ self.shift
+ }
+}
+
+/// An extend operator for a register.
+#[derive(Clone, Copy, Debug)]
+#[repr(u8)]
+pub enum ExtendOp {
+ UXTB = 0b000,
+ UXTH = 0b001,
+ UXTW = 0b010,
+ UXTX = 0b011,
+ SXTB = 0b100,
+ SXTH = 0b101,
+ SXTW = 0b110,
+ SXTX = 0b111,
+}
+
+impl ExtendOp {
+ /// Encoding of this op.
+ pub fn bits(self) -> u8 {
+ self as u8
+ }
+}
+
+//=============================================================================
+// Instruction sub-components (memory addresses): definitions
+
+/// A reference to some memory address.
+#[derive(Clone, Debug)]
+pub enum MemLabel {
+ /// An address in the code, a constant pool or jumptable, with relative
+ /// offset from this instruction. This form must be used at emission time;
+ /// see `memlabel_finalize()` for how other forms are lowered to this one.
+ PCRel(i32),
+}
+
+/// An addressing mode specified for a load/store operation.
+#[derive(Clone, Debug)]
+pub enum AMode {
+ //
+ // Real ARM64 addressing modes:
+ //
+ /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation.
+ PostIndexed(Writable<Reg>, SImm9),
+ /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation.
+ PreIndexed(Writable<Reg>, SImm9),
+
+ // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to
+ // what the ISA calls the "register offset" addressing mode. We split out
+ // several options here for more ergonomic codegen.
+ /// Register plus register offset.
+ RegReg(Reg, Reg),
+
+ /// Register plus register offset, scaled by type's size.
+ RegScaled(Reg, Reg, Type),
+
+ /// Register plus register offset, scaled by type's size, with index sign- or zero-extended
+ /// first.
+ RegScaledExtended(Reg, Reg, Type, ExtendOp),
+
+ /// Register plus register offset, with index sign- or zero-extended first.
+ RegExtended(Reg, Reg, ExtendOp),
+
+ /// Unscaled signed 9-bit immediate offset from reg.
+ Unscaled(Reg, SImm9),
+
+ /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg.
+ UnsignedOffset(Reg, UImm12Scaled),
+
+ //
+ // virtual addressing modes that are lowered at emission time:
+ //
+ /// Reference to a "label": e.g., a symbol.
+ Label(MemLabel),
+
+ /// Arbitrary offset from a register. Converted to generation of large
+ /// offsets with multiple instructions as necessary during code emission.
+ RegOffset(Reg, i64, Type),
+
+ /// Offset from the stack pointer.
+ SPOffset(i64, Type),
+
+ /// Offset from the frame pointer.
+ FPOffset(i64, Type),
+
+ /// Offset from the "nominal stack pointer", which is where the real SP is
+ /// just after stack and spill slots are allocated in the function prologue.
+ /// At emission time, this is converted to `SPOffset` with a fixup added to
+ /// the offset constant. The fixup is a running value that is tracked as
+ /// emission iterates through instructions in linear order, and can be
+ /// adjusted up and down with [Inst::VirtualSPOffsetAdj].
+ ///
+ /// The standard ABI is in charge of handling this (by emitting the
+ /// adjustment meta-instructions). It maintains the invariant that "nominal
+ /// SP" is where the actual SP is after the function prologue and before
+ /// clobber pushes. See the diagram in the documentation for
+ /// [the ABI module](crate::isa::aarch64::abi) for more details.
+ NominalSPOffset(i64, Type),
+}
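
A numeric illustration of `NominalSPOffset` as described above (assumed frame activity; the real adjustment comes from `Inst::VirtualSPOffsetAdj` instructions emitted by the ABI code): if the function has pushed 16 bytes onto the stack since the prologue, the running adjustment is +16, and a nominal-SP reference at offset 8 lowers to a real-SP reference at offset 24.

// Sketch of the fixup applied at emission time (see mem_finalize in emit.rs).
fn lower_nominal_sp_offset(nominal_off: i64, virtual_sp_offset: i64) -> i64 {
    nominal_off + virtual_sp_offset
}

fn main() {
    assert_eq!(lower_nominal_sp_offset(8, 16), 24);
    assert_eq!(lower_nominal_sp_offset(8, 0), 8); // no adjustment right after the prologue
}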
+
+impl AMode {
+ /// Memory reference using an address in a register.
+ pub fn reg(reg: Reg) -> AMode {
+ // Use UnsignedOffset rather than Unscaled to use ldr rather than ldur.
+ // This also does not use PostIndexed / PreIndexed as they update the register.
+ AMode::UnsignedOffset(reg, UImm12Scaled::zero(I64))
+ }
+
+ /// Memory reference using the sum of two registers as an address.
+ pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> AMode {
+ AMode::RegReg(reg1, reg2)
+ }
+
+ /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address.
+ pub fn reg_plus_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> AMode {
+ AMode::RegScaled(reg1, reg2, ty)
+ }
+
+ /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address, with `reg2` sign- or
+ /// zero-extended as per `op`.
+ pub fn reg_plus_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> AMode {
+ AMode::RegScaledExtended(reg1, reg2, ty, op)
+ }
+
+ /// Memory reference to a label: a global function or value, or data in the constant pool.
+ pub fn label(label: MemLabel) -> AMode {
+ AMode::Label(label)
+ }
+}
+
+/// A memory argument to a load/store-pair.
+#[derive(Clone, Debug)]
+pub enum PairAMode {
+ SignedOffset(Reg, SImm7Scaled),
+ PreIndexed(Writable<Reg>, SImm7Scaled),
+ PostIndexed(Writable<Reg>, SImm7Scaled),
+}
+
+//=============================================================================
+// Instruction sub-components (conditions, branches and branch targets):
+// definitions
+
+/// Condition for conditional branches.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(u8)]
+pub enum Cond {
+ Eq = 0,
+ Ne = 1,
+ Hs = 2,
+ Lo = 3,
+ Mi = 4,
+ Pl = 5,
+ Vs = 6,
+ Vc = 7,
+ Hi = 8,
+ Ls = 9,
+ Ge = 10,
+ Lt = 11,
+ Gt = 12,
+ Le = 13,
+ Al = 14,
+ Nv = 15,
+}
+
+impl Cond {
+ /// Return the inverted condition.
+ pub fn invert(self) -> Cond {
+ match self {
+ Cond::Eq => Cond::Ne,
+ Cond::Ne => Cond::Eq,
+
+ Cond::Hs => Cond::Lo,
+ Cond::Lo => Cond::Hs,
+
+ Cond::Mi => Cond::Pl,
+ Cond::Pl => Cond::Mi,
+
+ Cond::Vs => Cond::Vc,
+ Cond::Vc => Cond::Vs,
+
+ Cond::Hi => Cond::Ls,
+ Cond::Ls => Cond::Hi,
+
+ Cond::Ge => Cond::Lt,
+ Cond::Lt => Cond::Ge,
+
+ Cond::Gt => Cond::Le,
+ Cond::Le => Cond::Gt,
+
+ Cond::Al => Cond::Nv,
+ Cond::Nv => Cond::Al,
+ }
+ }
+
+ /// Return the machine encoding of this condition.
+ pub fn bits(self) -> u32 {
+ self as u32
+ }
+}
+
+/// The kind of conditional branch: the common-case-optimized "reg-is-zero" /
+/// "reg-is-nonzero" variants, or the generic one that tests the machine
+/// condition codes.
+#[derive(Clone, Copy, Debug)]
+pub enum CondBrKind {
+ /// Condition: given register is zero.
+ Zero(Reg),
+ /// Condition: given register is nonzero.
+ NotZero(Reg),
+ /// Condition: the given condition-code test is true.
+ Cond(Cond),
+}
+
+impl CondBrKind {
+ /// Return the inverted branch condition.
+ pub fn invert(self) -> CondBrKind {
+ match self {
+ CondBrKind::Zero(reg) => CondBrKind::NotZero(reg),
+ CondBrKind::NotZero(reg) => CondBrKind::Zero(reg),
+ CondBrKind::Cond(c) => CondBrKind::Cond(c.invert()),
+ }
+ }
+}
+
+/// A branch target. Either unresolved (basic-block index) or resolved (offset
+/// from end of current instruction).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum BranchTarget {
+ /// An unresolved reference to a Label, as passed into
+ /// `lower_branch_group()`.
+ Label(MachLabel),
+ /// A fixed PC offset.
+ ResolvedOffset(i32),
+}
+
+impl BranchTarget {
+ /// Return the target's label, if it is a label-based target.
+ pub fn as_label(self) -> Option<MachLabel> {
+ match self {
+ BranchTarget::Label(l) => Some(l),
+ _ => None,
+ }
+ }
+
+ /// Return the target's offset, if specified, or zero if label-based.
+ pub fn as_offset19_or_zero(self) -> u32 {
+ let off = match self {
+ BranchTarget::ResolvedOffset(off) => off >> 2,
+ _ => 0,
+ };
+ assert!(off <= 0x3ffff);
+ assert!(off >= -0x40000);
+ (off as u32) & 0x7ffff
+ }
+
+ /// Return the target's offset, if specified, or zero if label-based.
+ pub fn as_offset26_or_zero(self) -> u32 {
+ let off = match self {
+ BranchTarget::ResolvedOffset(off) => off >> 2,
+ _ => 0,
+ };
+ assert!(off <= 0x1ffffff);
+ assert!(off >= -0x2000000);
+ (off as u32) & 0x3ffffff
+ }
+}
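
A worked example of the 19-bit offset packing above: the resolved byte offset is divided by 4 (all AArch64 instructions are 4 bytes) and masked to 19 bits, giving a conditional-branch range of roughly +/-1 MiB. Self-contained sketch (hypothetical helper, not part of the patch):

fn offset19(byte_off: i32) -> u32 {
    let insns = byte_off >> 2;
    assert!(insns <= 0x3ffff && insns >= -0x40000);
    (insns as u32) & 0x7ffff
}

fn main() {
    assert_eq!(offset19(8), 2);        // two instructions forward
    assert_eq!(offset19(-4), 0x7ffff); // one instruction back, two's complement in 19 bits
}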
+
+impl PrettyPrint for ShiftOpAndAmt {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("{:?} {}", self.op(), self.amt().value())
+ }
+}
+
+impl PrettyPrint for ExtendOp {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("{:?}", self)
+ }
+}
+
+impl PrettyPrint for MemLabel {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &MemLabel::PCRel(off) => format!("pc+{}", off),
+ }
+ }
+}
+
+fn shift_for_type(ty: Type) -> usize {
+ match ty.bytes() {
+ 1 => 0,
+ 2 => 1,
+ 4 => 2,
+ 8 => 3,
+ 16 => 4,
+ _ => panic!("unknown type: {}", ty),
+ }
+}
+
+impl PrettyPrint for AMode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &AMode::Unscaled(reg, simm9) => {
+ if simm9.value != 0 {
+ format!("[{}, {}]", reg.show_rru(mb_rru), simm9.show_rru(mb_rru))
+ } else {
+ format!("[{}]", reg.show_rru(mb_rru))
+ }
+ }
+ &AMode::UnsignedOffset(reg, uimm12) => {
+ if uimm12.value != 0 {
+ format!("[{}, {}]", reg.show_rru(mb_rru), uimm12.show_rru(mb_rru))
+ } else {
+ format!("[{}]", reg.show_rru(mb_rru))
+ }
+ }
+ &AMode::RegReg(r1, r2) => {
+ format!("[{}, {}]", r1.show_rru(mb_rru), r2.show_rru(mb_rru),)
+ }
+ &AMode::RegScaled(r1, r2, ty) => {
+ let shift = shift_for_type(ty);
+ format!(
+ "[{}, {}, LSL #{}]",
+ r1.show_rru(mb_rru),
+ r2.show_rru(mb_rru),
+ shift,
+ )
+ }
+ &AMode::RegScaledExtended(r1, r2, ty, op) => {
+ let shift = shift_for_type(ty);
+ let size = match op {
+ ExtendOp::SXTW | ExtendOp::UXTW => OperandSize::Size32,
+ _ => OperandSize::Size64,
+ };
+ let op = op.show_rru(mb_rru);
+ format!(
+ "[{}, {}, {} #{}]",
+ r1.show_rru(mb_rru),
+ show_ireg_sized(r2, mb_rru, size),
+ op,
+ shift
+ )
+ }
+ &AMode::RegExtended(r1, r2, op) => {
+ let size = match op {
+ ExtendOp::SXTW | ExtendOp::UXTW => OperandSize::Size32,
+ _ => OperandSize::Size64,
+ };
+ let op = op.show_rru(mb_rru);
+ format!(
+ "[{}, {}, {}]",
+ r1.show_rru(mb_rru),
+ show_ireg_sized(r2, mb_rru, size),
+ op,
+ )
+ }
+ &AMode::Label(ref label) => label.show_rru(mb_rru),
+ &AMode::PreIndexed(r, simm9) => format!(
+ "[{}, {}]!",
+ r.to_reg().show_rru(mb_rru),
+ simm9.show_rru(mb_rru)
+ ),
+ &AMode::PostIndexed(r, simm9) => format!(
+ "[{}], {}",
+ r.to_reg().show_rru(mb_rru),
+ simm9.show_rru(mb_rru)
+ ),
+ // Eliminated by `mem_finalize()`.
+ &AMode::SPOffset(..)
+ | &AMode::FPOffset(..)
+ | &AMode::NominalSPOffset(..)
+ | &AMode::RegOffset(..) => {
+ panic!("Unexpected pseudo mem-arg mode (stack-offset or generic reg-offset)!")
+ }
+ }
+ }
+}
+
+impl PrettyPrint for PairAMode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &PairAMode::SignedOffset(reg, simm7) => {
+ if simm7.value != 0 {
+ format!("[{}, {}]", reg.show_rru(mb_rru), simm7.show_rru(mb_rru))
+ } else {
+ format!("[{}]", reg.show_rru(mb_rru))
+ }
+ }
+ &PairAMode::PreIndexed(reg, simm7) => format!(
+ "[{}, {}]!",
+ reg.to_reg().show_rru(mb_rru),
+ simm7.show_rru(mb_rru)
+ ),
+ &PairAMode::PostIndexed(reg, simm7) => format!(
+ "[{}], {}",
+ reg.to_reg().show_rru(mb_rru),
+ simm7.show_rru(mb_rru)
+ ),
+ }
+ }
+}
+
+impl PrettyPrint for Cond {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ let mut s = format!("{:?}", self);
+ s.make_ascii_lowercase();
+ s
+ }
+}
+
+impl PrettyPrint for BranchTarget {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &BranchTarget::Label(label) => format!("label{:?}", label.get()),
+ &BranchTarget::ResolvedOffset(off) => format!("{}", off),
+ }
+ }
+}
+
+/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and
+/// 64-bit variants of many instructions (and integer registers).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum OperandSize {
+ Size32,
+ Size64,
+}
+
+impl OperandSize {
+ /// 32-bit case?
+ pub fn is32(self) -> bool {
+ self == OperandSize::Size32
+ }
+ /// 64-bit case?
+ pub fn is64(self) -> bool {
+ self == OperandSize::Size64
+ }
+ /// Convert from an `is32` boolean flag to an `OperandSize`.
+ pub fn from_is32(is32: bool) -> OperandSize {
+ if is32 {
+ OperandSize::Size32
+ } else {
+ OperandSize::Size64
+ }
+ }
+ /// Convert from a needed width to the smallest size that fits.
+ pub fn from_bits<I: Into<usize>>(bits: I) -> OperandSize {
+ let bits: usize = bits.into();
+ assert!(bits <= 64);
+ if bits <= 32 {
+ OperandSize::Size32
+ } else {
+ OperandSize::Size64
+ }
+ }
+
+ /// Convert from an integer type into the smallest size that fits.
+ pub fn from_ty(ty: Type) -> OperandSize {
+ Self::from_bits(ty_bits(ty))
+ }
+
+ /// Convert to I32, I64, or I128.
+ pub fn to_ty(self) -> Type {
+ match self {
+ OperandSize::Size32 => I32,
+ OperandSize::Size64 => I64,
+ }
+ }
+
+ pub fn sf_bit(&self) -> u32 {
+ match self {
+ OperandSize::Size32 => 0,
+ OperandSize::Size64 => 1,
+ }
+ }
+}
+
+/// Type used to communicate the size of a scalar SIMD & FP operand.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum ScalarSize {
+ Size8,
+ Size16,
+ Size32,
+ Size64,
+ Size128,
+}
+
+impl ScalarSize {
+ /// Convert from a needed width to the smallest size that fits.
+ pub fn from_bits<I: Into<usize>>(bits: I) -> ScalarSize {
+ match bits.into().next_power_of_two() {
+ 8 => ScalarSize::Size8,
+ 16 => ScalarSize::Size16,
+ 32 => ScalarSize::Size32,
+ 64 => ScalarSize::Size64,
+ 128 => ScalarSize::Size128,
+ w => panic!("Unexpected type width: {}", w),
+ }
+ }
+
+ /// Convert to an integer operand size.
+ pub fn operand_size(&self) -> OperandSize {
+ match self {
+ ScalarSize::Size32 => OperandSize::Size32,
+ ScalarSize::Size64 => OperandSize::Size64,
+ _ => panic!("Unexpected operand_size request for: {:?}", self),
+ }
+ }
+
+ /// Convert from a type into the smallest size that fits.
+ pub fn from_ty(ty: Type) -> ScalarSize {
+ Self::from_bits(ty_bits(ty))
+ }
+
+ /// Return the encoding bits that are used by some scalar FP instructions
+ /// for a particular operand size.
+ pub fn ftype(&self) -> u32 {
+ match self {
+ ScalarSize::Size16 => 0b11,
+ ScalarSize::Size32 => 0b00,
+ ScalarSize::Size64 => 0b01,
+ _ => panic!("Unexpected scalar FP operand size: {:?}", self),
+ }
+ }
+}
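
Note that `from_bits` rounds the requested width up to the next power of two before matching, so an in-between width such as 12 maps to `Size16`. A tiny runnable check of that rounding step:

fn main() {
    assert_eq!(12usize.next_power_of_two(), 16); // -> ScalarSize::Size16
    assert_eq!(32usize.next_power_of_two(), 32); // -> ScalarSize::Size32
    assert_eq!(33usize.next_power_of_two(), 64); // -> ScalarSize::Size64
}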
+
+/// Type used to communicate the size of a vector operand.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum VectorSize {
+ Size8x8,
+ Size8x16,
+ Size16x4,
+ Size16x8,
+ Size32x2,
+ Size32x4,
+ Size64x2,
+}
+
+impl VectorSize {
+ /// Get the vector operand size with the given scalar size as lane size.
+ pub fn from_lane_size(size: ScalarSize, is_128bit: bool) -> VectorSize {
+ match (size, is_128bit) {
+ (ScalarSize::Size8, false) => VectorSize::Size8x8,
+ (ScalarSize::Size8, true) => VectorSize::Size8x16,
+ (ScalarSize::Size16, false) => VectorSize::Size16x4,
+ (ScalarSize::Size16, true) => VectorSize::Size16x8,
+ (ScalarSize::Size32, false) => VectorSize::Size32x2,
+ (ScalarSize::Size32, true) => VectorSize::Size32x4,
+ (ScalarSize::Size64, true) => VectorSize::Size64x2,
+ _ => panic!("Unexpected scalar FP operand size: {:?}", size),
+ }
+ }
+
+ /// Convert from a type into a vector operand size.
+ pub fn from_ty(ty: Type) -> VectorSize {
+ match ty {
+ B8X16 => VectorSize::Size8x16,
+ B16X8 => VectorSize::Size16x8,
+ B32X4 => VectorSize::Size32x4,
+ B64X2 => VectorSize::Size64x2,
+ F32X2 => VectorSize::Size32x2,
+ F32X4 => VectorSize::Size32x4,
+ F64X2 => VectorSize::Size64x2,
+ I8X8 => VectorSize::Size8x8,
+ I8X16 => VectorSize::Size8x16,
+ I16X4 => VectorSize::Size16x4,
+ I16X8 => VectorSize::Size16x8,
+ I32X2 => VectorSize::Size32x2,
+ I32X4 => VectorSize::Size32x4,
+ I64X2 => VectorSize::Size64x2,
+ _ => unimplemented!("Unsupported type: {}", ty),
+ }
+ }
+
+ /// Get the integer operand size that corresponds to a lane of a vector with a certain size.
+ pub fn operand_size(&self) -> OperandSize {
+ match self {
+ VectorSize::Size64x2 => OperandSize::Size64,
+ _ => OperandSize::Size32,
+ }
+ }
+
+ /// Get the scalar operand size that corresponds to a lane of a vector with a certain size.
+ pub fn lane_size(&self) -> ScalarSize {
+ match self {
+ VectorSize::Size8x8 => ScalarSize::Size8,
+ VectorSize::Size8x16 => ScalarSize::Size8,
+ VectorSize::Size16x4 => ScalarSize::Size16,
+ VectorSize::Size16x8 => ScalarSize::Size16,
+ VectorSize::Size32x2 => ScalarSize::Size32,
+ VectorSize::Size32x4 => ScalarSize::Size32,
+ VectorSize::Size64x2 => ScalarSize::Size64,
+ }
+ }
+
+ pub fn is_128bits(&self) -> bool {
+ match self {
+ VectorSize::Size8x8 => false,
+ VectorSize::Size8x16 => true,
+ VectorSize::Size16x4 => false,
+ VectorSize::Size16x8 => true,
+ VectorSize::Size32x2 => false,
+ VectorSize::Size32x4 => true,
+ VectorSize::Size64x2 => true,
+ }
+ }
+
+ /// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting
+ /// size would exceed 128 bits, then the number of lanes is also halved, so as to
+ /// ensure that the result size is at most 128 bits.
+ pub fn widen(&self) -> VectorSize {
+ match self {
+ VectorSize::Size8x8 => VectorSize::Size16x8,
+ VectorSize::Size8x16 => VectorSize::Size16x8,
+ VectorSize::Size16x4 => VectorSize::Size32x4,
+ VectorSize::Size16x8 => VectorSize::Size32x4,
+ VectorSize::Size32x2 => VectorSize::Size64x2,
+ VectorSize::Size32x4 => VectorSize::Size64x2,
+ VectorSize::Size64x2 => unreachable!(),
+ }
+ }
+
+ /// Produces a `VectorSize` that has the same lane width, but half as many lanes.
+ pub fn halve(&self) -> VectorSize {
+ match self {
+ VectorSize::Size8x16 => VectorSize::Size8x8,
+ VectorSize::Size16x8 => VectorSize::Size16x4,
+ VectorSize::Size32x4 => VectorSize::Size32x2,
+ _ => *self,
+ }
+ }
+
+ /// Return the encoding bits that are used by some SIMD instructions
+ /// for a particular operand size.
+ pub fn enc_size(&self) -> (u32, u32) {
+ let q = self.is_128bits() as u32;
+ let size = match self.lane_size() {
+ ScalarSize::Size8 => 0b00,
+ ScalarSize::Size16 => 0b01,
+ ScalarSize::Size32 => 0b10,
+ ScalarSize::Size64 => 0b11,
+ _ => unreachable!(),
+ };
+
+ (q, size)
+ }
+}
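
A quick check of the (q, size) pair produced by `enc_size` above, assuming the same mapping it uses (lane size 8/16/32/64 maps to 0b00/0b01/0b10/0b11, and q = 1 for 128-bit vectors). Hypothetical standalone helper, not the real method:

fn enc_size(is_128: bool, lane_bits: u32) -> (u32, u32) {
    let q = is_128 as u32;
    let size = match lane_bits {
        8 => 0b00,
        16 => 0b01,
        32 => 0b10,
        64 => 0b11,
        _ => unreachable!(),
    };
    (q, size)
}

fn main() {
    assert_eq!(enc_size(true, 32), (1, 0b10));  // VectorSize::Size32x4
    assert_eq!(enc_size(false, 16), (0, 0b01)); // VectorSize::Size16x4
}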
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs
new file mode 100644
index 0000000000..5d0270dade
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs
@@ -0,0 +1,2359 @@
+//! AArch64 ISA: binary code emission.
+
+use crate::binemit::{CodeOffset, Reloc, StackMap};
+use crate::ir::constant::ConstantData;
+use crate::ir::types::*;
+use crate::ir::{MemFlags, TrapCode};
+use crate::isa::aarch64::inst::*;
+use crate::machinst::ty_bits;
+
+use regalloc::{Reg, RegClass, Writable};
+
+use core::convert::TryFrom;
+use log::debug;
+
+/// Memory label/reference finalization: convert a MemLabel to a PC-relative
+/// offset, possibly emitting relocation(s) as necessary.
+pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
+ match label {
+ &MemLabel::PCRel(rel) => rel,
+ }
+}
+
+/// Memory addressing mode finalization: convert "special" modes (e.g.,
+/// generic arbitrary stack offset) into real addressing modes, possibly by
+/// emitting some helper instructions that come immediately before the use
+/// of this amode.
+pub fn mem_finalize(
+ insn_off: CodeOffset,
+ mem: &AMode,
+ state: &EmitState,
+) -> (SmallVec<[Inst; 4]>, AMode) {
+ match mem {
+ &AMode::RegOffset(_, off, ty)
+ | &AMode::SPOffset(off, ty)
+ | &AMode::FPOffset(off, ty)
+ | &AMode::NominalSPOffset(off, ty) => {
+ let basereg = match mem {
+ &AMode::RegOffset(reg, _, _) => reg,
+ &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => stack_reg(),
+ &AMode::FPOffset(..) => fp_reg(),
+ _ => unreachable!(),
+ };
+ let adj = match mem {
+ &AMode::NominalSPOffset(..) => {
+ debug!(
+ "mem_finalize: nominal SP offset {} + adj {} -> {}",
+ off,
+ state.virtual_sp_offset,
+ off + state.virtual_sp_offset
+ );
+ state.virtual_sp_offset
+ }
+ _ => 0,
+ };
+ let off = off + adj;
+
+ if let Some(simm9) = SImm9::maybe_from_i64(off) {
+ let mem = AMode::Unscaled(basereg, simm9);
+ (smallvec![], mem)
+ } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(off, ty) {
+ let mem = AMode::UnsignedOffset(basereg, uimm12s);
+ (smallvec![], mem)
+ } else {
+ let tmp = writable_spilltmp_reg();
+ let mut const_insts = Inst::load_constant(tmp, off as u64);
+ // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
+ // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
+ // is a valid base (for SPOffset) which we must handle here.
+ // Also, SP needs to be the first arg, not second.
+ let add_inst = Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: basereg,
+ rm: tmp.to_reg(),
+ extendop: ExtendOp::UXTX,
+ };
+ const_insts.push(add_inst);
+ (const_insts, AMode::reg(tmp.to_reg()))
+ }
+ }
+
+ &AMode::Label(ref label) => {
+ let off = memlabel_finalize(insn_off, label);
+ (smallvec![], AMode::Label(MemLabel::PCRel(off)))
+ }
+
+ _ => (smallvec![], mem.clone()),
+ }
+}
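
The decision in the first arm above boils down to a range check on the final offset: a signed 9-bit byte offset uses the unscaled form, an unsigned 12-bit offset scaled by the access size uses the scaled form, and anything else materializes the offset in the spill temporary and adds it to the base register. A self-contained sketch of that classification (assumed immediate ranges; the real checks live in `SImm9` and `UImm12Scaled`):

fn classify(off: i64, access_bytes: i64) -> &'static str {
    if (-256..=255).contains(&off) {
        "AMode::Unscaled (ldur/stur, SImm9)"
    } else if off >= 0 && off % access_bytes == 0 && off / access_bytes <= 4095 {
        "AMode::UnsignedOffset (ldr/str, UImm12Scaled)"
    } else {
        "load offset into spilltmp, AluRRRExtend add, then AMode::reg"
    }
}

fn main() {
    assert_eq!(classify(-8, 8), "AMode::Unscaled (ldur/stur, SImm9)");
    assert_eq!(classify(32760, 8), "AMode::UnsignedOffset (ldr/str, UImm12Scaled)");
    assert_eq!(classify(1 << 20, 8), "load offset into spilltmp, AluRRRExtend add, then AMode::reg");
}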
+
+/// Helper: get a ConstantData from a u64.
+pub fn u64_constant(bits: u64) -> ConstantData {
+ let data = bits.to_le_bytes();
+ ConstantData::from(&data[..])
+}
+
+//=============================================================================
+// Instructions and subcomponents: emission
+
+fn machreg_to_gpr(m: Reg) -> u32 {
+ assert_eq!(m.get_class(), RegClass::I64);
+ u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
+}
+
+fn machreg_to_vec(m: Reg) -> u32 {
+ assert_eq!(m.get_class(), RegClass::V128);
+ u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
+}
+
+fn machreg_to_gpr_or_vec(m: Reg) -> u32 {
+ u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
+}
+
+fn enc_arith_rrr(bits_31_21: u32, bits_15_10: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 {
+ (bits_31_21 << 21)
+ | (bits_15_10 << 10)
+ | machreg_to_gpr(rd.to_reg())
+ | (machreg_to_gpr(rn) << 5)
+ | (machreg_to_gpr(rm) << 16)
+}
+
+fn enc_arith_rr_imm12(
+ bits_31_24: u32,
+ immshift: u32,
+ imm12: u32,
+ rn: Reg,
+ rd: Writable<Reg>,
+) -> u32 {
+ (bits_31_24 << 24)
+ | (immshift << 22)
+ | (imm12 << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_arith_rr_imml(bits_31_23: u32, imm_bits: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
+ (bits_31_23 << 23) | (imm_bits << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_arith_rrrr(top11: u32, rm: Reg, bit15: u32, ra: Reg, rn: Reg, rd: Writable<Reg>) -> u32 {
+ (top11 << 21)
+ | (machreg_to_gpr(rm) << 16)
+ | (bit15 << 15)
+ | (machreg_to_gpr(ra) << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_jump26(op_31_26: u32, off_26_0: u32) -> u32 {
+ assert!(off_26_0 < (1 << 26));
+ (op_31_26 << 26) | off_26_0
+}
+
+fn enc_cmpbr(op_31_24: u32, off_18_0: u32, reg: Reg) -> u32 {
+ assert!(off_18_0 < (1 << 19));
+ (op_31_24 << 24) | (off_18_0 << 5) | machreg_to_gpr(reg)
+}
+
+fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 {
+ assert!(off_18_0 < (1 << 19));
+ assert!(cond < (1 << 4));
+ (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond
+}
+
+fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 {
+ match kind {
+ CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg),
+ CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg),
+ CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()),
+ }
+}
+
+const MOVE_WIDE_FIXED: u32 = 0x12800000;
+
+#[repr(u32)]
+enum MoveWideOpcode {
+ MOVN = 0b00,
+ MOVZ = 0b10,
+ MOVK = 0b11,
+}
+
+fn enc_move_wide(
+ op: MoveWideOpcode,
+ rd: Writable<Reg>,
+ imm: MoveWideConst,
+ size: OperandSize,
+) -> u32 {
+ assert!(imm.shift <= 0b11);
+ MOVE_WIDE_FIXED
+ | size.sf_bit() << 31
+ | (op as u32) << 29
+ | u32::from(imm.shift) << 21
+ | u32::from(imm.bits) << 5
+ | machreg_to_gpr(rd.to_reg())
+}
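
As a concrete encoding check for the helper above (assumed operands, computed with the same bit layout): `MOVZ x0, #1` at the 64-bit operand size should come out as 0xD2800020.

fn main() {
    const MOVE_WIDE_FIXED: u32 = 0x12800000;
    let (sf, opc) = (1u32, 0b10u32);             // 64-bit, MOVZ
    let (shift, imm16, rd) = (0u32, 1u32, 0u32); // #1, no shift, destination x0
    let insn = MOVE_WIDE_FIXED | sf << 31 | opc << 29 | shift << 21 | imm16 << 5 | rd;
    assert_eq!(insn, 0xD280_0020); // movz x0, #1
}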
+
+fn enc_ldst_pair(op_31_22: u32, simm7: SImm7Scaled, rn: Reg, rt: Reg, rt2: Reg) -> u32 {
+ (op_31_22 << 22)
+ | (simm7.bits() << 15)
+ | (machreg_to_gpr(rt2) << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rt)
+}
+
+fn enc_ldst_simm9(op_31_22: u32, simm9: SImm9, op_11_10: u32, rn: Reg, rd: Reg) -> u32 {
+ (op_31_22 << 22)
+ | (simm9.bits() << 12)
+ | (op_11_10 << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_ldst_uimm12(op_31_22: u32, uimm12: UImm12Scaled, rn: Reg, rd: Reg) -> u32 {
+ (op_31_22 << 22)
+ | (0b1 << 24)
+ | (uimm12.bits() << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_ldst_reg(
+ op_31_22: u32,
+ rn: Reg,
+ rm: Reg,
+ s_bit: bool,
+ extendop: Option<ExtendOp>,
+ rd: Reg,
+) -> u32 {
+ let s_bit = if s_bit { 1 } else { 0 };
+ let extend_bits = match extendop {
+ Some(ExtendOp::UXTW) => 0b010,
+ Some(ExtendOp::SXTW) => 0b110,
+ Some(ExtendOp::SXTX) => 0b111,
+ None => 0b011, // LSL
+ _ => panic!("bad extend mode for ld/st AMode"),
+ };
+ (op_31_22 << 22)
+ | (1 << 21)
+ | (machreg_to_gpr(rm) << 16)
+ | (extend_bits << 13)
+ | (s_bit << 12)
+ | (0b10 << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
+ (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+ debug_assert_eq!(q & 0b1, q);
+ debug_assert_eq!(size & 0b11, size);
+ 0b0_0_0011010_10_00000_110_0_00_00000_00000
+ | q << 30
+ | size << 10
+ | machreg_to_gpr(rn) << 5
+ | machreg_to_vec(rt.to_reg())
+}
+
+fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
+ (top11 << 21)
+ | (machreg_to_vec(rm) << 16)
+ | (bit15_10 << 10)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
+ (0b01011010110 << 21)
+ | size << 31
+ | opcode2 << 16
+ | opcode1 << 10
+ | machreg_to_gpr(rn) << 5
+ | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_br(rn: Reg) -> u32 {
+ 0b1101011_0000_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5)
+}
+
+fn enc_adr(off: i32, rd: Writable<Reg>) -> u32 {
+ let off = u32::try_from(off).unwrap();
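+ // ADR encodes a 21-bit PC-relative offset split into `immlo` (bits 30:29)
+ // and `immhi` (bits 23:5).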
+ let immlo = off & 3;
+ let immhi = (off >> 2) & ((1 << 19) - 1);
+ (0b00010000 << 24) | (immlo << 29) | (immhi << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_csel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond) -> u32 {
+ 0b100_11010100_00000_0000_00_00000_00000
+ | (machreg_to_gpr(rm) << 16)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rd.to_reg())
+ | (cond.bits() << 12)
+}
+
+fn enc_fcsel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond, size: ScalarSize) -> u32 {
+ 0b000_11110_00_1_00000_0000_11_00000_00000
+ | (size.ftype() << 22)
+ | (machreg_to_vec(rm) << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg())
+ | (cond.bits() << 12)
+}
+
+fn enc_cset(rd: Writable<Reg>, cond: Cond) -> u32 {
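+ // CSET is an alias of CSINC rd, xzr, xzr, <inverted condition>, hence the
+ // inverted condition bits below.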
+ 0b100_11010100_11111_0000_01_11111_00000
+ | machreg_to_gpr(rd.to_reg())
+ | (cond.invert().bits() << 12)
+}
+
+fn enc_ccmp_imm(size: OperandSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) -> u32 {
+ 0b0_1_1_11010010_00000_0000_10_00000_0_0000
+ | size.sf_bit() << 31
+ | imm.bits() << 16
+ | cond.bits() << 12
+ | machreg_to_gpr(rn) << 5
+ | nzcv.bits()
+}
+
+fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
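+ // MOV (vector) is an alias of ORR Vd.<T>, Vn.<T>, Vn.<T>, so `rn` is encoded
+ // in both the Rn and Rm fields.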
+ 0b00001110_101_00000_00011_1_00000_00000
+ | ((is_16b as u32) << 30)
+ | machreg_to_vec(rd.to_reg())
+ | (machreg_to_vec(rn) << 16)
+ | (machreg_to_vec(rn) << 5)
+}
+
+fn enc_fpurr(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fpurrr(top22: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 {
+ (top22 << 10)
+ | (machreg_to_vec(rm) << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fpurrrr(top17: u32, rd: Writable<Reg>, rn: Reg, rm: Reg, ra: Reg) -> u32 {
+ (top17 << 15)
+ | (machreg_to_vec(rm) << 16)
+ | (machreg_to_vec(ra) << 10)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fcmp(size: ScalarSize, rn: Reg, rm: Reg) -> u32 {
+ 0b000_11110_00_1_00000_00_1000_00000_00000
+ | (size.ftype() << 22)
+ | (machreg_to_vec(rm) << 16)
+ | (machreg_to_vec(rn) << 5)
+}
+
+fn enc_fputoint(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ (top16 << 16) | (machreg_to_vec(rn) << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_inttofpu(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ (top16 << 16) | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ debug_assert_eq!(qu & 0b11, qu);
+ debug_assert_eq!(size & 0b11, size);
+ debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
+ let bits = 0b0_00_01110_00_10000_00000_10_00000_00000;
+ bits | qu << 29
+ | size << 22
+ | bits_12_16 << 12
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+ debug_assert_eq!(q & 0b1, q);
+ debug_assert_eq!(u & 0b1, u);
+ debug_assert_eq!(size & 0b11, size);
+ debug_assert_eq!(opcode & 0b11111, opcode);
+ 0b0_0_0_01110_00_11000_0_0000_10_00000_00000
+ | q << 30
+ | u << 29
+ | size << 22
+ | opcode << 12
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_tbl(is_extension: bool, len: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 {
+ debug_assert_eq!(len & 0b11, len);
+ 0b0_1_001110_000_00000_0_00_0_00_00000_00000
+ | (machreg_to_vec(rm) << 16)
+ | len << 13
+ | (is_extension as u32) << 12
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_dmb_ish() -> u32 {
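+ // `dmb ish`: data memory barrier, inner-shareable domain.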
+ 0xD5033BBF
+}
+
+fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
+ let sz = match ty {
+ I64 => 0b11,
+ I32 => 0b10,
+ I16 => 0b01,
+ I8 => 0b00,
+ _ => unreachable!(),
+ };
+ 0b00001000_01011111_01111100_00000000
+ | (sz << 30)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rt.to_reg())
+}
+
+fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
+ let sz = match ty {
+ I64 => 0b11,
+ I32 => 0b10,
+ I16 => 0b01,
+ I8 => 0b00,
+ _ => unreachable!(),
+ };
+ 0b00001000_00000000_01111100_00000000
+ | (sz << 30)
+ | (machreg_to_gpr(rs.to_reg()) << 16)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rt)
+}
+
+fn enc_asimd_mod_imm(rd: Writable<Reg>, q_op: u32, cmode: u32, imm: u8) -> u32 {
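+ // The 8-bit immediate is split into `abc` (top 3 bits, at bits 18:16) and
+ // `defgh` (low 5 bits, at bits 9:5), per the AdvSIMD modified-immediate format.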
+ let abc = (imm >> 5) as u32;
+ let defgh = (imm & 0b11111) as u32;
+
+ debug_assert_eq!(cmode & 0b1111, cmode);
+ debug_assert_eq!(q_op & 0b11, q_op);
+
+ 0b0_0_0_0111100000_000_0000_01_00000_00000
+ | (q_op << 29)
+ | (abc << 16)
+ | (cmode << 12)
+ | (defgh << 5)
+ | machreg_to_vec(rd.to_reg())
+}
+
+/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)]
+pub struct EmitState {
+ /// Addend to convert nominal-SP offsets to real-SP offsets at the current
+ /// program point.
+ pub(crate) virtual_sp_offset: i64,
+ /// Offset of FP from nominal-SP.
+ pub(crate) nominal_sp_to_fp: i64,
+ /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
+ stack_map: Option<StackMap>,
+ /// Current source-code location corresponding to instruction to be emitted.
+ cur_srcloc: SourceLoc,
+}
+
+impl MachInstEmitState<Inst> for EmitState {
+ fn new(abi: &dyn ABICallee<I = Inst>) -> Self {
+ EmitState {
+ virtual_sp_offset: 0,
+ nominal_sp_to_fp: abi.frame_size() as i64,
+ stack_map: None,
+ cur_srcloc: SourceLoc::default(),
+ }
+ }
+
+ fn pre_safepoint(&mut self, stack_map: StackMap) {
+ self.stack_map = Some(stack_map);
+ }
+
+ fn pre_sourceloc(&mut self, srcloc: SourceLoc) {
+ self.cur_srcloc = srcloc;
+ }
+}
+
+impl EmitState {
+ fn take_stack_map(&mut self) -> Option<StackMap> {
+ self.stack_map.take()
+ }
+
+ fn clear_post_insn(&mut self) {
+ self.stack_map = None;
+ }
+
+ fn cur_srcloc(&self) -> SourceLoc {
+ self.cur_srcloc
+ }
+}
+
+/// Constant state used during function compilation.
+pub struct EmitInfo(settings::Flags);
+
+impl EmitInfo {
+ pub(crate) fn new(flags: settings::Flags) -> Self {
+ Self(flags)
+ }
+}
+
+impl MachInstEmitInfo for EmitInfo {
+ fn flags(&self) -> &settings::Flags {
+ &self.0
+ }
+}
+
+impl MachInstEmit for Inst {
+ type State = EmitState;
+ type Info = EmitInfo;
+ type UnwindInfo = super::unwind::AArch64UnwindInfo;
+
+ fn emit(&self, sink: &mut MachBuffer<Inst>, emit_info: &Self::Info, state: &mut EmitState) {
+ // N.B.: we *must* not exceed the "worst-case size" used to compute
+ // where to insert islands, except when islands are explicitly triggered
+ // (with an `EmitIsland`). We check this in debug builds. This is `mut`
+ // to allow disabling the check for `JTSequence`, which is always
+ // emitted following an `EmitIsland`.
+ let mut start_off = sink.cur_offset();
+
+ match self {
+ &Inst::AluRRR { alu_op, rd, rn, rm } => {
+ let top11 = match alu_op {
+ ALUOp::Add32 => 0b00001011_000,
+ ALUOp::Add64 => 0b10001011_000,
+ ALUOp::Sub32 => 0b01001011_000,
+ ALUOp::Sub64 => 0b11001011_000,
+ ALUOp::Orr32 => 0b00101010_000,
+ ALUOp::Orr64 => 0b10101010_000,
+ ALUOp::And32 => 0b00001010_000,
+ ALUOp::And64 => 0b10001010_000,
+ ALUOp::Eor32 => 0b01001010_000,
+ ALUOp::Eor64 => 0b11001010_000,
+ ALUOp::OrrNot32 => 0b00101010_001,
+ ALUOp::OrrNot64 => 0b10101010_001,
+ ALUOp::AndNot32 => 0b00001010_001,
+ ALUOp::AndNot64 => 0b10001010_001,
+ ALUOp::EorNot32 => 0b01001010_001,
+ ALUOp::EorNot64 => 0b11001010_001,
+ ALUOp::AddS32 => 0b00101011_000,
+ ALUOp::AddS64 => 0b10101011_000,
+ ALUOp::SubS32 => 0b01101011_000,
+ ALUOp::SubS64 => 0b11101011_000,
+ ALUOp::SDiv64 => 0b10011010_110,
+ ALUOp::UDiv64 => 0b10011010_110,
+ ALUOp::RotR32 | ALUOp::Lsr32 | ALUOp::Asr32 | ALUOp::Lsl32 => 0b00011010_110,
+ ALUOp::RotR64 | ALUOp::Lsr64 | ALUOp::Asr64 | ALUOp::Lsl64 => 0b10011010_110,
+ ALUOp::SMulH => 0b10011011_010,
+ ALUOp::UMulH => 0b10011011_110,
+ };
+ let bit15_10 = match alu_op {
+ ALUOp::SDiv64 => 0b000011,
+ ALUOp::UDiv64 => 0b000010,
+ ALUOp::RotR32 | ALUOp::RotR64 => 0b001011,
+ ALUOp::Lsr32 | ALUOp::Lsr64 => 0b001001,
+ ALUOp::Asr32 | ALUOp::Asr64 => 0b001010,
+ ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000,
+ ALUOp::SMulH | ALUOp::UMulH => 0b011111,
+ _ => 0b000000,
+ };
+ debug_assert_ne!(writable_stack_reg(), rd);
+ // In these operand positions, register 31 encodes the zero register rather
+ // than SP, so seeing the stack pointer here likely indicates a bug.
+ debug_assert_ne!(stack_reg(), rn);
+ debug_assert_ne!(stack_reg(), rm);
+ sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm));
+ }
+ &Inst::AluRRRR {
+ alu_op,
+ rd,
+ rm,
+ rn,
+ ra,
+ } => {
+ let (top11, bit15) = match alu_op {
+ ALUOp3::MAdd32 => (0b0_00_11011_000, 0),
+ ALUOp3::MSub32 => (0b0_00_11011_000, 1),
+ ALUOp3::MAdd64 => (0b1_00_11011_000, 0),
+ ALUOp3::MSub64 => (0b1_00_11011_000, 1),
+ };
+ sink.put4(enc_arith_rrrr(top11, rm, bit15, ra, rn, rd));
+ }
+ &Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn,
+ ref imm12,
+ } => {
+ let top8 = match alu_op {
+ ALUOp::Add32 => 0b000_10001,
+ ALUOp::Add64 => 0b100_10001,
+ ALUOp::Sub32 => 0b010_10001,
+ ALUOp::Sub64 => 0b110_10001,
+ ALUOp::AddS32 => 0b001_10001,
+ ALUOp::AddS64 => 0b101_10001,
+ ALUOp::SubS32 => 0b011_10001,
+ ALUOp::SubS64 => 0b111_10001,
+ _ => unimplemented!("{:?}", alu_op),
+ };
+ sink.put4(enc_arith_rr_imm12(
+ top8,
+ imm12.shift_bits(),
+ imm12.imm_bits(),
+ rn,
+ rd,
+ ));
+ }
+ &Inst::AluRRImmLogic {
+ alu_op,
+ rd,
+ rn,
+ ref imml,
+ } => {
+ let (top9, inv) = match alu_op {
+ ALUOp::Orr32 => (0b001_100100, false),
+ ALUOp::Orr64 => (0b101_100100, false),
+ ALUOp::And32 => (0b000_100100, false),
+ ALUOp::And64 => (0b100_100100, false),
+ ALUOp::Eor32 => (0b010_100100, false),
+ ALUOp::Eor64 => (0b110_100100, false),
+ ALUOp::OrrNot32 => (0b001_100100, true),
+ ALUOp::OrrNot64 => (0b101_100100, true),
+ ALUOp::AndNot32 => (0b000_100100, true),
+ ALUOp::AndNot64 => (0b100_100100, true),
+ ALUOp::EorNot32 => (0b010_100100, true),
+ ALUOp::EorNot64 => (0b110_100100, true),
+ _ => unimplemented!("{:?}", alu_op),
+ };
+ let imml = if inv { imml.invert() } else { imml.clone() };
+ sink.put4(enc_arith_rr_imml(top9, imml.enc_bits(), rn, rd));
+ }
+
+ &Inst::AluRRImmShift {
+ alu_op,
+ rd,
+ rn,
+ ref immshift,
+ } => {
+ let amt = immshift.value();
+ let (top10, immr, imms) = match alu_op {
+ ALUOp::RotR32 => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)),
+ ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), u32::from(amt)),
+ ALUOp::Lsr32 => (0b0101001100, u32::from(amt), 0b011111),
+ ALUOp::Lsr64 => (0b1101001101, u32::from(amt), 0b111111),
+ ALUOp::Asr32 => (0b0001001100, u32::from(amt), 0b011111),
+ ALUOp::Asr64 => (0b1001001101, u32::from(amt), 0b111111),
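+ // LSL is an alias of UBFM with immr = (size - amt) % size and imms = size - 1 - amt.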
+ ALUOp::Lsl32 => (
+ 0b0101001100,
+ u32::from((32 - amt) % 32),
+ u32::from(31 - amt),
+ ),
+ ALUOp::Lsl64 => (
+ 0b1101001101,
+ u32::from((64 - amt) % 64),
+ u32::from(63 - amt),
+ ),
+ _ => unimplemented!("{:?}", alu_op),
+ };
+ sink.put4(
+ (top10 << 22)
+ | (immr << 16)
+ | (imms << 10)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_gpr(rd.to_reg()),
+ );
+ }
+
+ &Inst::AluRRRShift {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ref shiftop,
+ } => {
+ let top11: u32 = match alu_op {
+ ALUOp::Add32 => 0b000_01011000,
+ ALUOp::Add64 => 0b100_01011000,
+ ALUOp::AddS32 => 0b001_01011000,
+ ALUOp::AddS64 => 0b101_01011000,
+ ALUOp::Sub32 => 0b010_01011000,
+ ALUOp::Sub64 => 0b110_01011000,
+ ALUOp::SubS32 => 0b011_01011000,
+ ALUOp::SubS64 => 0b111_01011000,
+ ALUOp::Orr32 => 0b001_01010000,
+ ALUOp::Orr64 => 0b101_01010000,
+ ALUOp::And32 => 0b000_01010000,
+ ALUOp::And64 => 0b100_01010000,
+ ALUOp::Eor32 => 0b010_01010000,
+ ALUOp::Eor64 => 0b110_01010000,
+ ALUOp::OrrNot32 => 0b001_01010001,
+ ALUOp::OrrNot64 => 0b101_01010001,
+ ALUOp::EorNot32 => 0b010_01010001,
+ ALUOp::EorNot64 => 0b110_01010001,
+ ALUOp::AndNot32 => 0b000_01010001,
+ ALUOp::AndNot64 => 0b100_01010001,
+ _ => unimplemented!("{:?}", alu_op),
+ };
+ let top11 = top11 | (u32::from(shiftop.op().bits()) << 1);
+ let bits_15_10 = u32::from(shiftop.amt().value());
+ sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm));
+ }
+
+ &Inst::AluRRRExtend {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ extendop,
+ } => {
+ let top11: u32 = match alu_op {
+ ALUOp::Add32 => 0b00001011001,
+ ALUOp::Add64 => 0b10001011001,
+ ALUOp::Sub32 => 0b01001011001,
+ ALUOp::Sub64 => 0b11001011001,
+ ALUOp::AddS32 => 0b00101011001,
+ ALUOp::AddS64 => 0b10101011001,
+ ALUOp::SubS32 => 0b01101011001,
+ ALUOp::SubS64 => 0b11101011001,
+ _ => unimplemented!("{:?}", alu_op),
+ };
+ let bits_15_10 = u32::from(extendop.bits()) << 3;
+ sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm));
+ }
+
+ &Inst::BitRR { op, rd, rn, .. } => {
+ let size = if op.operand_size().is32() { 0b0 } else { 0b1 };
+ let (op1, op2) = match op {
+ BitOp::RBit32 | BitOp::RBit64 => (0b00000, 0b000000),
+ BitOp::Clz32 | BitOp::Clz64 => (0b00000, 0b000100),
+ BitOp::Cls32 | BitOp::Cls64 => (0b00000, 0b000101),
+ };
+ sink.put4(enc_bit_rr(size, op1, op2, rn, rd))
+ }
+
+ &Inst::ULoad8 { rd, ref mem, flags }
+ | &Inst::SLoad8 { rd, ref mem, flags }
+ | &Inst::ULoad16 { rd, ref mem, flags }
+ | &Inst::SLoad16 { rd, ref mem, flags }
+ | &Inst::ULoad32 { rd, ref mem, flags }
+ | &Inst::SLoad32 { rd, ref mem, flags }
+ | &Inst::ULoad64 {
+ rd, ref mem, flags, ..
+ }
+ | &Inst::FpuLoad32 { rd, ref mem, flags }
+ | &Inst::FpuLoad64 { rd, ref mem, flags }
+ | &Inst::FpuLoad128 { rd, ref mem, flags } => {
+ let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
+
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+
+ // ldst encoding helpers take Reg, not Writable<Reg>.
+ let rd = rd.to_reg();
+
+ // This is the base opcode (top 10 bits) for the "unscaled
+ // immediate" form (Unscaled). Other addressing modes OR in
+ // further bits (e.g. bit 24 for the scaled unsigned-immediate form).
+ let (op, bits) = match self {
+ &Inst::ULoad8 { .. } => (0b0011100001, 8),
+ &Inst::SLoad8 { .. } => (0b0011100010, 8),
+ &Inst::ULoad16 { .. } => (0b0111100001, 16),
+ &Inst::SLoad16 { .. } => (0b0111100010, 16),
+ &Inst::ULoad32 { .. } => (0b1011100001, 32),
+ &Inst::SLoad32 { .. } => (0b1011100010, 32),
+ &Inst::ULoad64 { .. } => (0b1111100001, 64),
+ &Inst::FpuLoad32 { .. } => (0b1011110001, 32),
+ &Inst::FpuLoad64 { .. } => (0b1111110001, 64),
+ &Inst::FpuLoad128 { .. } => (0b0011110011, 128),
+ _ => unreachable!(),
+ };
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && !flags.notrap() {
+ // Register the offset at which the actual load instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ match &mem {
+ &AMode::Unscaled(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
+ }
+ &AMode::UnsignedOffset(reg, uimm12scaled) => {
+ if uimm12scaled.value() != 0 {
+ assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
+ }
+ sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
+ }
+ &AMode::RegReg(r1, r2) => {
+ sink.put4(enc_ldst_reg(
+ op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd,
+ ));
+ }
+ &AMode::RegScaled(r1, r2, ty) | &AMode::RegScaledExtended(r1, r2, ty, _) => {
+ assert_eq!(bits, ty_bits(ty));
+ let extendop = match &mem {
+ &AMode::RegScaled(..) => None,
+ &AMode::RegScaledExtended(_, _, _, op) => Some(op),
+ _ => unreachable!(),
+ };
+ sink.put4(enc_ldst_reg(
+ op, r1, r2, /* scaled = */ true, extendop, rd,
+ ));
+ }
+ &AMode::RegExtended(r1, r2, extendop) => {
+ sink.put4(enc_ldst_reg(
+ op,
+ r1,
+ r2,
+ /* scaled = */ false,
+ Some(extendop),
+ rd,
+ ));
+ }
+ &AMode::Label(ref label) => {
+ let offset = match label {
+ // cast i32 to u32 (two's-complement)
+ &MemLabel::PCRel(off) => off as u32,
+ } / 4;
+ assert!(offset < (1 << 19));
+ match self {
+ &Inst::ULoad32 { .. } => {
+ sink.put4(enc_ldst_imm19(0b00011000, offset, rd));
+ }
+ &Inst::SLoad32 { .. } => {
+ sink.put4(enc_ldst_imm19(0b10011000, offset, rd));
+ }
+ &Inst::FpuLoad32 { .. } => {
+ sink.put4(enc_ldst_imm19(0b00011100, offset, rd));
+ }
+ &Inst::ULoad64 { .. } => {
+ sink.put4(enc_ldst_imm19(0b01011000, offset, rd));
+ }
+ &Inst::FpuLoad64 { .. } => {
+ sink.put4(enc_ldst_imm19(0b01011100, offset, rd));
+ }
+ &Inst::FpuLoad128 { .. } => {
+ sink.put4(enc_ldst_imm19(0b10011100, offset, rd));
+ }
+ _ => panic!("Unspported size for LDR from constant pool!"),
+ }
+ }
+ &AMode::PreIndexed(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd));
+ }
+ &AMode::PostIndexed(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
+ }
+ // Eliminated by `mem_finalize()` above.
+ &AMode::SPOffset(..) | &AMode::FPOffset(..) | &AMode::NominalSPOffset(..) => {
+ panic!("Should not see stack-offset here!")
+ }
+ &AMode::RegOffset(..) => panic!("SHould not see generic reg-offset here!"),
+ }
+ }
+
+ &Inst::Store8 { rd, ref mem, flags }
+ | &Inst::Store16 { rd, ref mem, flags }
+ | &Inst::Store32 { rd, ref mem, flags }
+ | &Inst::Store64 { rd, ref mem, flags }
+ | &Inst::FpuStore32 { rd, ref mem, flags }
+ | &Inst::FpuStore64 { rd, ref mem, flags }
+ | &Inst::FpuStore128 { rd, ref mem, flags } => {
+ let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
+
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+
+ let (op, bits) = match self {
+ &Inst::Store8 { .. } => (0b0011100000, 8),
+ &Inst::Store16 { .. } => (0b0111100000, 16),
+ &Inst::Store32 { .. } => (0b1011100000, 32),
+ &Inst::Store64 { .. } => (0b1111100000, 64),
+ &Inst::FpuStore32 { .. } => (0b1011110000, 32),
+ &Inst::FpuStore64 { .. } => (0b1111110000, 64),
+ &Inst::FpuStore128 { .. } => (0b0011110010, 128),
+ _ => unreachable!(),
+ };
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && !flags.notrap() {
+ // Register the offset at which the actual store instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ match &mem {
+ &AMode::Unscaled(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd));
+ }
+ &AMode::UnsignedOffset(reg, uimm12scaled) => {
+ if uimm12scaled.value() != 0 {
+ assert_eq!(bits, ty_bits(uimm12scaled.scale_ty()));
+ }
+ sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd));
+ }
+ &AMode::RegReg(r1, r2) => {
+ sink.put4(enc_ldst_reg(
+ op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd,
+ ));
+ }
+ &AMode::RegScaled(r1, r2, _ty) | &AMode::RegScaledExtended(r1, r2, _ty, _) => {
+ let extendop = match &mem {
+ &AMode::RegScaled(..) => None,
+ &AMode::RegScaledExtended(_, _, _, op) => Some(op),
+ _ => unreachable!(),
+ };
+ sink.put4(enc_ldst_reg(
+ op, r1, r2, /* scaled = */ true, extendop, rd,
+ ));
+ }
+ &AMode::RegExtended(r1, r2, extendop) => {
+ sink.put4(enc_ldst_reg(
+ op,
+ r1,
+ r2,
+ /* scaled = */ false,
+ Some(extendop),
+ rd,
+ ));
+ }
+ &AMode::Label(..) => {
+ panic!("Store to a MemLabel not implemented!");
+ }
+ &AMode::PreIndexed(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd));
+ }
+ &AMode::PostIndexed(reg, simm9) => {
+ sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd));
+ }
+ // Eliminated by `mem_finalize()` above.
+ &AMode::SPOffset(..) | &AMode::FPOffset(..) | &AMode::NominalSPOffset(..) => {
+ panic!("Should not see stack-offset here!")
+ }
+ &AMode::RegOffset(..) => panic!("SHould not see generic reg-offset here!"),
+ }
+ }
+
+ &Inst::StoreP64 {
+ rt,
+ rt2,
+ ref mem,
+ flags,
+ } => {
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && !flags.notrap() {
+ // Register the offset at which the actual store instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ match mem {
+ &PairAMode::SignedOffset(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100100, simm7, reg, rt, rt2));
+ }
+ &PairAMode::PreIndexed(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100110, simm7, reg.to_reg(), rt, rt2));
+ }
+ &PairAMode::PostIndexed(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100010, simm7, reg.to_reg(), rt, rt2));
+ }
+ }
+ }
+ &Inst::LoadP64 {
+ rt,
+ rt2,
+ ref mem,
+ flags,
+ } => {
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && !flags.notrap() {
+ // Register the offset at which the actual load instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ let rt = rt.to_reg();
+ let rt2 = rt2.to_reg();
+ match mem {
+ &PairAMode::SignedOffset(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100101, simm7, reg, rt, rt2));
+ }
+ &PairAMode::PreIndexed(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100111, simm7, reg.to_reg(), rt, rt2));
+ }
+ &PairAMode::PostIndexed(reg, simm7) => {
+ assert_eq!(simm7.scale_ty, I64);
+ sink.put4(enc_ldst_pair(0b1010100011, simm7, reg.to_reg(), rt, rt2));
+ }
+ }
+ }
+ &Inst::Mov64 { rd, rm } => {
+ assert!(rd.to_reg().get_class() == rm.get_class());
+ assert!(rm.get_class() == RegClass::I64);
+
+ // MOV to SP is interpreted as MOV to XZR instead. And our codegen
+ // should never MOV to XZR.
+ assert!(rd.to_reg() != stack_reg());
+
+ if rm == stack_reg() {
+ // We can't use ORR here, so use an `add rd, sp, #0` instead.
+ let imm12 = Imm12::maybe_from_u64(0).unwrap();
+ sink.put4(enc_arith_rr_imm12(
+ 0b100_10001,
+ imm12.shift_bits(),
+ imm12.imm_bits(),
+ rm,
+ rd,
+ ));
+ } else {
+ // Encoded as ORR rd, rm, zero.
+ sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm));
+ }
+ }
+ &Inst::Mov32 { rd, rm } => {
+ // MOV to SP is interpreted as MOV to XZR instead. And our codegen
+ // should never MOV to XZR.
+ assert!(machreg_to_gpr(rd.to_reg()) != 31);
+ // Encoded as ORR rd, rm, zero.
+ sink.put4(enc_arith_rrr(0b00101010_000, 0b000_000, rd, zero_reg(), rm));
+ }
+ &Inst::MovZ { rd, imm, size } => {
+ sink.put4(enc_move_wide(MoveWideOpcode::MOVZ, rd, imm, size))
+ }
+ &Inst::MovN { rd, imm, size } => {
+ sink.put4(enc_move_wide(MoveWideOpcode::MOVN, rd, imm, size))
+ }
+ &Inst::MovK { rd, imm, size } => {
+ sink.put4(enc_move_wide(MoveWideOpcode::MOVK, rd, imm, size))
+ }
+ &Inst::CSel { rd, rn, rm, cond } => {
+ sink.put4(enc_csel(rd, rn, rm, cond));
+ }
+ &Inst::CSet { rd, cond } => {
+ sink.put4(enc_cset(rd, cond));
+ }
+ &Inst::CCmpImm {
+ size,
+ rn,
+ imm,
+ nzcv,
+ cond,
+ } => {
+ sink.put4(enc_ccmp_imm(size, rn, imm, nzcv, cond));
+ }
+ &Inst::AtomicRMW { ty, op } => {
+ /* Emit this:
+ dmb ish
+ again:
+ ldxr{,b,h} x/w27, [x25]
+ op x28, x27, x26 // op is add,sub,and,orr,eor
+ stxr{,b,h} w24, x/w28, [x25]
+ cbnz x24, again
+ dmb ish
+
+ Operand conventions:
+ IN: x25 (addr), x26 (2nd arg for op)
+ OUT: x27 (old value), x24 (trashed), x28 (trashed)
+
+ It is unfortunate that, per the ARM documentation, x28 cannot be used for
+ both the store-data and success-flag operands of stxr. This causes the
+ instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24
+ instead for the success-flag.
+
+ In the case where the operation is 'xchg', the second insn is instead
+ mov x28, x26
+ so that we simply write in the destination, the "2nd arg for op".
+ */
+ let xzr = zero_reg();
+ let x24 = xreg(24);
+ let x25 = xreg(25);
+ let x26 = xreg(26);
+ let x27 = xreg(27);
+ let x28 = xreg(28);
+ let x24wr = writable_xreg(24);
+ let x27wr = writable_xreg(27);
+ let x28wr = writable_xreg(28);
+ let again_label = sink.get_label();
+
+ sink.put4(enc_dmb_ish()); // dmb ish
+
+ // again:
+ sink.bind_label(again_label);
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
+
+ if op == inst_common::AtomicRmwOp::Xchg {
+ // mov x28, x26
+ sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26))
+ } else {
+ // add/sub/and/orr/eor x28, x27, x26
+ let bits_31_21 = match op {
+ inst_common::AtomicRmwOp::Add => 0b100_01011_00_0,
+ inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0,
+ inst_common::AtomicRmwOp::And => 0b100_01010_00_0,
+ inst_common::AtomicRmwOp::Or => 0b101_01010_00_0,
+ inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0,
+ inst_common::AtomicRmwOp::Xchg => unreachable!(),
+ };
+ sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26));
+ }
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
+
+ // cbnz w24, again
+ // Note, we're actually testing x24, and relying on the default zero-high-half
+ // rule in the assignment that `stxr` does.
+ let br_offset = sink.cur_offset();
+ sink.put4(enc_conditional_br(
+ BranchTarget::Label(again_label),
+ CondBrKind::NotZero(x24),
+ ));
+ sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19);
+
+ sink.put4(enc_dmb_ish()); // dmb ish
+ }
+ &Inst::AtomicCAS { ty } => {
+ /* Emit this:
+ dmb ish
+ again:
+ ldxr{,b,h} x/w27, [x25]
+ and x24, x26, MASK (= 2^size_bits - 1)
+ cmp x27, x24
+ b.ne out
+ stxr{,b,h} w24, x/w28, [x25]
+ cbnz x24, again
+ out:
+ dmb ish
+
+ Operand conventions:
+ IN: x25 (addr), x26 (expected value), x28 (replacement value)
+ OUT: x27 (old value), x24 (trashed)
+ */
+ let xzr = zero_reg();
+ let x24 = xreg(24);
+ let x25 = xreg(25);
+ let x26 = xreg(26);
+ let x27 = xreg(27);
+ let x28 = xreg(28);
+ let xzrwr = writable_zero_reg();
+ let x24wr = writable_xreg(24);
+ let x27wr = writable_xreg(27);
+ let again_label = sink.get_label();
+ let out_label = sink.get_label();
+
+ sink.put4(enc_dmb_ish()); // dmb ish
+
+ // again:
+ sink.bind_label(again_label);
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
+
+ if ty == I64 {
+ // mov x24, x26
+ sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26))
+ } else {
+ // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF
+ let (mask, s) = match ty {
+ I8 => (0xFF, 7),
+ I16 => (0xFFFF, 15),
+ I32 => (0xFFFFFFFF, 31),
+ _ => unreachable!(),
+ };
+ sink.put4(enc_arith_rr_imml(
+ 0b100_100100,
+ ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(),
+ x26,
+ x24wr,
+ ))
+ }
+
+ // cmp x27, x24 (== subs xzr, x27, x24)
+ sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24));
+
+ // b.ne out
+ let br_out_offset = sink.cur_offset();
+ sink.put4(enc_conditional_br(
+ BranchTarget::Label(out_label),
+ CondBrKind::Cond(Cond::Ne),
+ ));
+ sink.use_label_at_offset(br_out_offset, out_label, LabelUse::Branch19);
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
+
+ // cbnz w24, again.
+ // Note, we're actually testing x24, and relying on the default zero-high-half
+ // rule in the assignment that `stxr` does.
+ let br_again_offset = sink.cur_offset();
+ sink.put4(enc_conditional_br(
+ BranchTarget::Label(again_label),
+ CondBrKind::NotZero(x24),
+ ));
+ sink.use_label_at_offset(br_again_offset, again_label, LabelUse::Branch19);
+
+ // out:
+ sink.bind_label(out_label);
+ sink.put4(enc_dmb_ish()); // dmb ish
+ }
+ &Inst::AtomicLoad { ty, r_data, r_addr } => {
+ let op = match ty {
+ I8 => 0b0011100001,
+ I16 => 0b0111100001,
+ I32 => 0b1011100001,
+ I64 => 0b1111100001,
+ _ => unreachable!(),
+ };
+ sink.put4(enc_dmb_ish()); // dmb ish
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
+ sink.put4(enc_ldst_uimm12(
+ op,
+ uimm12scaled_zero,
+ r_addr,
+ r_data.to_reg(),
+ ));
+ }
+ &Inst::AtomicStore { ty, r_data, r_addr } => {
+ let op = match ty {
+ I8 => 0b0011100000,
+ I16 => 0b0111100000,
+ I32 => 0b1011100000,
+ I64 => 0b1111100000,
+ _ => unreachable!(),
+ };
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
+ sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data));
+ sink.put4(enc_dmb_ish()); // dmb ish
+ }
+ &Inst::Fence {} => {
+ sink.put4(enc_dmb_ish()); // dmb ish
+ }
+ &Inst::FpuMove64 { rd, rn } => {
+ sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
+ }
+ &Inst::FpuMove128 { rd, rn } => {
+ sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
+ }
+ &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
+ let (imm5, shift, mask) = match size.lane_size() {
+ ScalarSize::Size32 => (0b00100, 3, 0b011),
+ ScalarSize::Size64 => (0b01000, 4, 0b001),
+ _ => unimplemented!(),
+ };
+ debug_assert_eq!(idx & mask, idx);
+ let imm5 = imm5 | ((idx as u32) << shift);
+ sink.put4(
+ 0b010_11110000_00000_000001_00000_00000
+ | (imm5 << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::FpuRR { fpu_op, rd, rn } => {
+ let top22 = match fpu_op {
+ FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000,
+ FPUOp1::Abs64 => 0b000_11110_01_1_000001_10000,
+ FPUOp1::Neg32 => 0b000_11110_00_1_000010_10000,
+ FPUOp1::Neg64 => 0b000_11110_01_1_000010_10000,
+ FPUOp1::Sqrt32 => 0b000_11110_00_1_000011_10000,
+ FPUOp1::Sqrt64 => 0b000_11110_01_1_000011_10000,
+ FPUOp1::Cvt32To64 => 0b000_11110_00_1_000101_10000,
+ FPUOp1::Cvt64To32 => 0b000_11110_01_1_000100_10000,
+ };
+ sink.put4(enc_fpurr(top22, rd, rn));
+ }
+ &Inst::FpuRRR { fpu_op, rd, rn, rm } => {
+ let top22 = match fpu_op {
+ FPUOp2::Add32 => 0b000_11110_00_1_00000_001010,
+ FPUOp2::Add64 => 0b000_11110_01_1_00000_001010,
+ FPUOp2::Sub32 => 0b000_11110_00_1_00000_001110,
+ FPUOp2::Sub64 => 0b000_11110_01_1_00000_001110,
+ FPUOp2::Mul32 => 0b000_11110_00_1_00000_000010,
+ FPUOp2::Mul64 => 0b000_11110_01_1_00000_000010,
+ FPUOp2::Div32 => 0b000_11110_00_1_00000_000110,
+ FPUOp2::Div64 => 0b000_11110_01_1_00000_000110,
+ FPUOp2::Max32 => 0b000_11110_00_1_00000_010010,
+ FPUOp2::Max64 => 0b000_11110_01_1_00000_010010,
+ FPUOp2::Min32 => 0b000_11110_00_1_00000_010110,
+ FPUOp2::Min64 => 0b000_11110_01_1_00000_010110,
+ FPUOp2::Sqadd64 => 0b010_11110_11_1_00000_000011,
+ FPUOp2::Uqadd64 => 0b011_11110_11_1_00000_000011,
+ FPUOp2::Sqsub64 => 0b010_11110_11_1_00000_001011,
+ FPUOp2::Uqsub64 => 0b011_11110_11_1_00000_001011,
+ };
+ sink.put4(enc_fpurrr(top22, rd, rn, rm));
+ }
+ &Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op {
+ FPUOpRI::UShr32(imm) => {
+ debug_assert_eq!(32, imm.lane_size_in_bits);
+ sink.put4(
+ 0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000
+ | imm.enc() << 16
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg()),
+ )
+ }
+ FPUOpRI::UShr64(imm) => {
+ debug_assert_eq!(64, imm.lane_size_in_bits);
+ sink.put4(
+ 0b01_1_111110_0000000_00_0_0_0_1_00000_00000
+ | imm.enc() << 16
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg()),
+ )
+ }
+ FPUOpRI::Sli64(imm) => {
+ debug_assert_eq!(64, imm.lane_size_in_bits);
+ sink.put4(
+ 0b01_1_111110_0000000_010101_00000_00000
+ | imm.enc() << 16
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg()),
+ )
+ }
+ FPUOpRI::Sli32(imm) => {
+ debug_assert_eq!(32, imm.lane_size_in_bits);
+ sink.put4(
+ 0b0_0_1_011110_0000000_010101_00000_00000
+ | imm.enc() << 16
+ | machreg_to_vec(rn) << 5
+ | machreg_to_vec(rd.to_reg()),
+ )
+ }
+ },
+ &Inst::FpuRRRR {
+ fpu_op,
+ rd,
+ rn,
+ rm,
+ ra,
+ } => {
+ let top17 = match fpu_op {
+ FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0,
+ FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0,
+ };
+ sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
+ }
+ &Inst::VecMisc { op, rd, rn, size } => {
+ let (q, enc_size) = size.enc_size();
+ let (u, bits_12_16, size) = match op {
+ VecMisc2::Not => (0b1, 0b00101, 0b00),
+ VecMisc2::Neg => (0b1, 0b01011, enc_size),
+ VecMisc2::Abs => (0b0, 0b01011, enc_size),
+ VecMisc2::Fabs => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b01111, enc_size)
+ }
+ VecMisc2::Fneg => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b1, 0b01111, enc_size)
+ }
+ VecMisc2::Fsqrt => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b1, 0b11111, enc_size)
+ }
+ VecMisc2::Rev64 => {
+ debug_assert_ne!(VectorSize::Size64x2, size);
+ (0b0, 0b00000, enc_size)
+ }
+ VecMisc2::Shll => {
+ debug_assert_ne!(VectorSize::Size64x2, size);
+ debug_assert!(!size.is_128bits());
+ (0b1, 0b10011, enc_size)
+ }
+ VecMisc2::Fcvtzs => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11011, enc_size)
+ }
+ VecMisc2::Fcvtzu => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b1, 0b11011, enc_size)
+ }
+ VecMisc2::Scvtf => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11101, enc_size & 0b1)
+ }
+ VecMisc2::Ucvtf => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b1, 0b11101, enc_size & 0b1)
+ }
+ VecMisc2::Frintn => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11000, enc_size & 0b01)
+ }
+ VecMisc2::Frintz => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11001, enc_size | 0b10)
+ }
+ VecMisc2::Frintm => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11001, enc_size & 0b01)
+ }
+ VecMisc2::Frintp => {
+ debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
+ (0b0, 0b11000, enc_size | 0b10)
+ }
+ };
+ sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
+ }
+ &Inst::VecLanes { op, rd, rn, size } => {
+ let (q, size) = match size {
+ VectorSize::Size8x16 => (0b1, 0b00),
+ VectorSize::Size16x8 => (0b1, 0b01),
+ VectorSize::Size32x4 => (0b1, 0b10),
+ _ => unreachable!(),
+ };
+ let (u, opcode) = match op {
+ VecLanesOp::Uminv => (0b1, 0b11010),
+ VecLanesOp::Addv => (0b0, 0b11011),
+ };
+ sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
+ }
+ &Inst::VecShiftImm {
+ op,
+ rd,
+ rn,
+ size,
+ imm,
+ } => {
+ let (is_shr, template) = match op {
+ VecShiftImmOp::Ushr => (true, 0b_011_011110_0000_000_000001_00000_00000_u32),
+ VecShiftImmOp::Sshr => (true, 0b_010_011110_0000_000_000001_00000_00000_u32),
+ VecShiftImmOp::Shl => (false, 0b_010_011110_0000_000_010101_00000_00000_u32),
+ };
+ let imm = imm as u32;
+ // Deal with the somewhat strange encoding scheme for, and limits on,
+ // the shift amount.
+ let immh_immb = match (size, is_shr) {
+ (VectorSize::Size64x2, true) if imm >= 1 && imm <= 64 => {
+ 0b_1000_000_u32 | (64 - imm)
+ }
+ (VectorSize::Size32x4, true) if imm >= 1 && imm <= 32 => {
+ 0b_0100_000_u32 | (32 - imm)
+ }
+ (VectorSize::Size16x8, true) if imm >= 1 && imm <= 16 => {
+ 0b_0010_000_u32 | (16 - imm)
+ }
+ (VectorSize::Size8x16, true) if imm >= 1 && imm <= 8 => {
+ 0b_0001_000_u32 | (8 - imm)
+ }
+ (VectorSize::Size64x2, false) if imm <= 63 => 0b_1000_000_u32 | imm,
+ (VectorSize::Size32x4, false) if imm <= 31 => 0b_0100_000_u32 | imm,
+ (VectorSize::Size16x8, false) if imm <= 15 => 0b_0010_000_u32 | imm,
+ (VectorSize::Size8x16, false) if imm <= 7 => 0b_0001_000_u32 | imm,
+ _ => panic!(
+ "aarch64: Inst::VecShiftImm: emit: invalid op/size/imm {:?}, {:?}, {:?}",
+ op, size, imm
+ ),
+ };
+ let rn_enc = machreg_to_vec(rn);
+ let rd_enc = machreg_to_vec(rd.to_reg());
+ sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
+ }
+ &Inst::VecExtract { rd, rn, rm, imm4 } => {
+ if imm4 < 16 {
+ let template = 0b_01_101110_000_00000_0_0000_0_00000_00000_u32;
+ let rm_enc = machreg_to_vec(rm);
+ let rn_enc = machreg_to_vec(rn);
+ let rd_enc = machreg_to_vec(rd.to_reg());
+ sink.put4(
+ template | (rm_enc << 16) | ((imm4 as u32) << 11) | (rn_enc << 5) | rd_enc,
+ );
+ } else {
+ panic!(
+ "aarch64: Inst::VecExtract: emit: invalid extract index {}",
+ imm4
+ );
+ }
+ }
+ &Inst::VecTbl {
+ rd,
+ rn,
+ rm,
+ is_extension,
+ } => {
+ sink.put4(enc_tbl(is_extension, 0b00, rd, rn, rm));
+ }
+ &Inst::VecTbl2 {
+ rd,
+ rn,
+ rn2,
+ rm,
+ is_extension,
+ } => {
+ assert_eq!(machreg_to_vec(rn2), (machreg_to_vec(rn) + 1) % 32);
+ sink.put4(enc_tbl(is_extension, 0b01, rd, rn, rm));
+ }
+ &Inst::FpuCmp32 { rn, rm } => {
+ sink.put4(enc_fcmp(ScalarSize::Size32, rn, rm));
+ }
+ &Inst::FpuCmp64 { rn, rm } => {
+ sink.put4(enc_fcmp(ScalarSize::Size64, rn, rm));
+ }
+ &Inst::FpuToInt { op, rd, rn } => {
+ let top16 = match op {
+ // FCVTZS (32/32-bit)
+ FpuToIntOp::F32ToI32 => 0b000_11110_00_1_11_000,
+ // FCVTZU (32/32-bit)
+ FpuToIntOp::F32ToU32 => 0b000_11110_00_1_11_001,
+ // FCVTZS (32/64-bit)
+ FpuToIntOp::F32ToI64 => 0b100_11110_00_1_11_000,
+ // FCVTZU (32/64-bit)
+ FpuToIntOp::F32ToU64 => 0b100_11110_00_1_11_001,
+ // FCVTZS (64/32-bit)
+ FpuToIntOp::F64ToI32 => 0b000_11110_01_1_11_000,
+ // FCVTZU (64/32-bit)
+ FpuToIntOp::F64ToU32 => 0b000_11110_01_1_11_001,
+ // FCVTZS (64/64-bit)
+ FpuToIntOp::F64ToI64 => 0b100_11110_01_1_11_000,
+ // FCVTZU (64/64-bit)
+ FpuToIntOp::F64ToU64 => 0b100_11110_01_1_11_001,
+ };
+ sink.put4(enc_fputoint(top16, rd, rn));
+ }
+ &Inst::IntToFpu { op, rd, rn } => {
+ let top16 = match op {
+ // SCVTF (32/32-bit)
+ IntToFpuOp::I32ToF32 => 0b000_11110_00_1_00_010,
+ // UCVTF (32/32-bit)
+ IntToFpuOp::U32ToF32 => 0b000_11110_00_1_00_011,
+ // SCVTF (64/32-bit)
+ IntToFpuOp::I64ToF32 => 0b100_11110_00_1_00_010,
+ // UCVTF (64/32-bit)
+ IntToFpuOp::U64ToF32 => 0b100_11110_00_1_00_011,
+ // SCVTF (32/64-bit)
+ IntToFpuOp::I32ToF64 => 0b000_11110_01_1_00_010,
+ // UCVTF (32/64-bit)
+ IntToFpuOp::U32ToF64 => 0b000_11110_01_1_00_011,
+ // SCVTF (64/64-bit)
+ IntToFpuOp::I64ToF64 => 0b100_11110_01_1_00_010,
+ // UCVTF (64/64-bit)
+ IntToFpuOp::U64ToF64 => 0b100_11110_01_1_00_011,
+ };
+ sink.put4(enc_inttofpu(top16, rd, rn));
+ }
+ &Inst::LoadFpuConst64 { rd, const_data } => {
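+ // Layout: an LDR (literal) that loads from PC+8, a 4-byte branch that skips
+ // the 8-byte constant (branch offset 12 = 4-byte branch + 8 bytes of data),
+ // then the constant itself.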
+ let inst = Inst::FpuLoad64 {
+ rd,
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ };
+ inst.emit(sink, emit_info, state);
+ let inst = Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(12),
+ };
+ inst.emit(sink, emit_info, state);
+ sink.put8(const_data);
+ }
+ &Inst::LoadFpuConst128 { rd, const_data } => {
+ let inst = Inst::FpuLoad128 {
+ rd,
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ };
+ inst.emit(sink, emit_info, state);
+ let inst = Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(20),
+ };
+ inst.emit(sink, emit_info, state);
+
+ for i in const_data.to_le_bytes().iter() {
+ sink.put1(*i);
+ }
+ }
+ &Inst::FpuCSel32 { rd, rn, rm, cond } => {
+ sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size32));
+ }
+ &Inst::FpuCSel64 { rd, rn, rm, cond } => {
+ sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size64));
+ }
+ &Inst::FpuRound { op, rd, rn } => {
+ let top22 = match op {
+ FpuRoundMode::Minus32 => 0b000_11110_00_1_001_010_10000,
+ FpuRoundMode::Minus64 => 0b000_11110_01_1_001_010_10000,
+ FpuRoundMode::Plus32 => 0b000_11110_00_1_001_001_10000,
+ FpuRoundMode::Plus64 => 0b000_11110_01_1_001_001_10000,
+ FpuRoundMode::Zero32 => 0b000_11110_00_1_001_011_10000,
+ FpuRoundMode::Zero64 => 0b000_11110_01_1_001_011_10000,
+ FpuRoundMode::Nearest32 => 0b000_11110_00_1_001_000_10000,
+ FpuRoundMode::Nearest64 => 0b000_11110_01_1_001_000_10000,
+ };
+ sink.put4(enc_fround(top22, rd, rn));
+ }
+ &Inst::MovToFpu { rd, rn, size } => {
+ let template = match size {
+ ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
+ ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
+ _ => unreachable!(),
+ };
+ sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
+ }
+ &Inst::MovToVec { rd, rn, idx, size } => {
+ let (imm5, shift) = match size.lane_size() {
+ ScalarSize::Size8 => (0b00001, 1),
+ ScalarSize::Size16 => (0b00010, 2),
+ ScalarSize::Size32 => (0b00100, 3),
+ ScalarSize::Size64 => (0b01000, 4),
+ _ => unreachable!(),
+ };
+ debug_assert_eq!(idx & (0b11111 >> shift), idx);
+ let imm5 = imm5 | ((idx as u32) << shift);
+ sink.put4(
+ 0b010_01110000_00000_0_0011_1_00000_00000
+ | (imm5 << 16)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::MovFromVec { rd, rn, idx, size } => {
+ let (q, imm5, shift, mask) = match size {
+ VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111),
+ VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111),
+ VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011),
+ VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001),
+ _ => unreachable!(),
+ };
+ debug_assert_eq!(idx & mask, idx);
+ let imm5 = imm5 | ((idx as u32) << shift);
+ sink.put4(
+ 0b000_01110000_00000_0_0111_1_00000_00000
+ | (q << 30)
+ | (imm5 << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_gpr(rd.to_reg()),
+ );
+ }
+ &Inst::MovFromVecSigned {
+ rd,
+ rn,
+ idx,
+ size,
+ scalar_size,
+ } => {
+ let (imm5, shift, half) = match size {
+ VectorSize::Size8x8 => (0b00001, 1, true),
+ VectorSize::Size8x16 => (0b00001, 1, false),
+ VectorSize::Size16x4 => (0b00010, 2, true),
+ VectorSize::Size16x8 => (0b00010, 2, false),
+ VectorSize::Size32x2 => {
+ debug_assert_ne!(scalar_size, OperandSize::Size32);
+ (0b00100, 3, true)
+ }
+ VectorSize::Size32x4 => {
+ debug_assert_ne!(scalar_size, OperandSize::Size32);
+ (0b00100, 3, false)
+ }
+ _ => panic!("Unexpected vector operand size"),
+ };
+ debug_assert_eq!(idx & (0b11111 >> (half as u32 + shift)), idx);
+ let imm5 = imm5 | ((idx as u32) << shift);
+ sink.put4(
+ 0b000_01110000_00000_0_0101_1_00000_00000
+ | (scalar_size.is64() as u32) << 30
+ | (imm5 << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_gpr(rd.to_reg()),
+ );
+ }
+ &Inst::VecDup { rd, rn, size } => {
+ let imm5 = match size {
+ VectorSize::Size8x16 => 0b00001,
+ VectorSize::Size16x8 => 0b00010,
+ VectorSize::Size32x4 => 0b00100,
+ VectorSize::Size64x2 => 0b01000,
+ _ => unimplemented!(),
+ };
+ sink.put4(
+ 0b010_01110000_00000_000011_00000_00000
+ | (imm5 << 16)
+ | (machreg_to_gpr(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::VecDupFromFpu { rd, rn, size } => {
+ let imm5 = match size {
+ VectorSize::Size32x4 => 0b00100,
+ VectorSize::Size64x2 => 0b01000,
+ _ => unimplemented!(),
+ };
+ sink.put4(
+ 0b010_01110000_00000_000001_00000_00000
+ | (imm5 << 16)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::VecDupImm {
+ rd,
+ imm,
+ invert,
+ size,
+ } => {
+ let (imm, shift, shift_ones) = imm.value();
+ let (op, cmode) = match size.lane_size() {
+ ScalarSize::Size8 => {
+ assert!(!invert);
+ assert_eq!(shift, 0);
+
+ (0, 0b1110)
+ }
+ ScalarSize::Size16 => {
+ let s = shift & 8;
+
+ assert!(!shift_ones);
+ assert_eq!(s, shift);
+
+ (invert as u32, 0b1000 | (s >> 2))
+ }
+ ScalarSize::Size32 => {
+ if shift_ones {
+ assert!(shift == 8 || shift == 16);
+
+ (invert as u32, 0b1100 | (shift >> 4))
+ } else {
+ let s = shift & 24;
+
+ assert_eq!(s, shift);
+
+ (invert as u32, 0b0000 | (s >> 2))
+ }
+ }
+ ScalarSize::Size64 => {
+ assert!(!invert);
+ assert_eq!(shift, 0);
+
+ (1, 0b1110)
+ }
+ _ => unreachable!(),
+ };
+ let q_op = op | ((size.is_128bits() as u32) << 1);
+
+ sink.put4(enc_asimd_mod_imm(rd, q_op, cmode, imm));
+ }
+ &Inst::VecExtend {
+ t,
+ rd,
+ rn,
+ high_half,
+ } => {
+ let (u, immh) = match t {
+ VecExtendOp::Sxtl8 => (0b0, 0b001),
+ VecExtendOp::Sxtl16 => (0b0, 0b010),
+ VecExtendOp::Sxtl32 => (0b0, 0b100),
+ VecExtendOp::Uxtl8 => (0b1, 0b001),
+ VecExtendOp::Uxtl16 => (0b1, 0b010),
+ VecExtendOp::Uxtl32 => (0b1, 0b100),
+ };
+ sink.put4(
+ 0b000_011110_0000_000_101001_00000_00000
+ | ((high_half as u32) << 30)
+ | (u << 29)
+ | (immh << 19)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::VecMiscNarrow {
+ op,
+ rd,
+ rn,
+ size,
+ high_half,
+ } => {
+ let size = match size.lane_size() {
+ ScalarSize::Size8 => 0b00,
+ ScalarSize::Size16 => 0b01,
+ ScalarSize::Size32 => 0b10,
+ _ => panic!("Unexpected vector operand lane size!"),
+ };
+ let (u, bits_12_16) = match op {
+ VecMiscNarrowOp::Xtn => (0b0, 0b10010),
+ VecMiscNarrowOp::Sqxtn => (0b0, 0b10100),
+ VecMiscNarrowOp::Sqxtun => (0b1, 0b10010),
+ };
+ sink.put4(enc_vec_rr_misc(
+ ((high_half as u32) << 1) | u,
+ size,
+ bits_12_16,
+ rd,
+ rn,
+ ));
+ }
+ &Inst::VecMovElement {
+ rd,
+ rn,
+ dest_idx,
+ src_idx,
+ size,
+ } => {
+ let (imm5, shift) = match size.lane_size() {
+ ScalarSize::Size8 => (0b00001, 1),
+ ScalarSize::Size16 => (0b00010, 2),
+ ScalarSize::Size32 => (0b00100, 3),
+ ScalarSize::Size64 => (0b01000, 4),
+ _ => unreachable!(),
+ };
+ let mask = 0b11111 >> shift;
+ debug_assert_eq!(dest_idx & mask, dest_idx);
+ debug_assert_eq!(src_idx & mask, src_idx);
+ let imm4 = (src_idx as u32) << (shift - 1);
+ let imm5 = imm5 | ((dest_idx as u32) << shift);
+ sink.put4(
+ 0b011_01110000_00000_0_0000_1_00000_00000
+ | (imm5 << 16)
+ | (imm4 << 11)
+ | (machreg_to_vec(rn) << 5)
+ | machreg_to_vec(rd.to_reg()),
+ );
+ }
+ &Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op,
+ size,
+ } => {
+ let (q, enc_size) = size.enc_size();
+ let is_float = match alu_op {
+ VecALUOp::Fcmeq
+ | VecALUOp::Fcmgt
+ | VecALUOp::Fcmge
+ | VecALUOp::Fadd
+ | VecALUOp::Fsub
+ | VecALUOp::Fdiv
+ | VecALUOp::Fmax
+ | VecALUOp::Fmin
+ | VecALUOp::Fmul => true,
+ _ => false,
+ };
+ let enc_float_size = match (is_float, size) {
+ (true, VectorSize::Size32x2) => 0b0,
+ (true, VectorSize::Size32x4) => 0b0,
+ (true, VectorSize::Size64x2) => 0b1,
+ (true, _) => unimplemented!(),
+ _ => 0,
+ };
+
+ let (top11, bit15_10) = match alu_op {
+ VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+ VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+ VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+ VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+ VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+ VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+ VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+ VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+ VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+ VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+ VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+ VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
+ // The following logical instructions operate on bytes, so are not encoded differently
+ // for the different vector types.
+ VecALUOp::And => (0b000_01110_00_1, 0b000111),
+ VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+ VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+ VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+ VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+ VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+ VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+ VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
+ VecALUOp::Mul => {
+ debug_assert_ne!(size, VectorSize::Size64x2);
+ (0b000_01110_00_1 | enc_size << 1, 0b100111)
+ }
+ VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+ VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+ VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+ VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+ VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+ VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+ VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+ VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+ VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+ VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+ VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+ VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+ VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+ VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
+ VecALUOp::Umlal => {
+ debug_assert!(!size.is_128bits());
+ (0b001_01110_00_1 | enc_size << 1, 0b100000)
+ }
+ VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
+ VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
+ VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
+ };
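+ // `top11` occupies bits 31:21, so `q << 9` below places the Q (128-bit)
+ // bit at instruction bit 30.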
+ let top11 = match alu_op {
+ VecALUOp::Smull | VecALUOp::Smull2 => top11,
+ _ if is_float => top11 | (q << 9) | enc_float_size << 1,
+ _ => top11 | (q << 9),
+ };
+ sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+ }
+ &Inst::VecLoadReplicate { rd, rn, size } => {
+ let (q, size) = size.enc_size();
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ // Register the offset at which the actual load instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ sink.put4(enc_ldst_vec(q, size, rn, rd));
+ }
+ &Inst::VecCSel { rd, rn, rm, cond } => {
+ /* Emit this:
+ b.cond else
+ mov rd, rm
+ b out
+ else:
+ mov rd, rn
+ out:
+
+ Note, we could do better in the cases where rd == rn or rd == rm.
+ */
+ let else_label = sink.get_label();
+ let out_label = sink.get_label();
+
+ // b.cond else
+ let br_else_offset = sink.cur_offset();
+ sink.put4(enc_conditional_br(
+ BranchTarget::Label(else_label),
+ CondBrKind::Cond(cond),
+ ));
+ sink.use_label_at_offset(br_else_offset, else_label, LabelUse::Branch19);
+
+ // mov rd, rm
+ sink.put4(enc_vecmov(/* 16b = */ true, rd, rm));
+
+ // b out
+ let b_out_offset = sink.cur_offset();
+ sink.use_label_at_offset(b_out_offset, out_label, LabelUse::Branch26);
+ sink.add_uncond_branch(b_out_offset, b_out_offset + 4, out_label);
+ sink.put4(enc_jump26(0b000101, 0 /* will be fixed up later */));
+
+ // else:
+ sink.bind_label(else_label);
+
+ // mov rd, rn
+ sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
+
+ // out:
+ sink.bind_label(out_label);
+ }
+ &Inst::MovToNZCV { rn } => {
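+ // MSR NZCV, Xn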
+ sink.put4(0xd51b4200 | machreg_to_gpr(rn));
+ }
+ &Inst::MovFromNZCV { rd } => {
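+ // MRS Xn, NZCV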
+ sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg()));
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ } if from_bits >= 8 => {
+ let top22 = match (signed, from_bits, to_bits) {
+ (false, 8, 32) => 0b010_100110_0_000000_000111, // UXTB (32)
+ (false, 16, 32) => 0b010_100110_0_000000_001111, // UXTH (32)
+ (true, 8, 32) => 0b000_100110_0_000000_000111, // SXTB (32)
+ (true, 16, 32) => 0b000_100110_0_000000_001111, // SXTH (32)
+ // The 64-bit unsigned variants are the same as the 32-bit ones,
+ // because writes to Wn zero out the top 32 bits of Xn
+ (false, 8, 64) => 0b010_100110_0_000000_000111, // UXTB (64)
+ (false, 16, 64) => 0b010_100110_0_000000_001111, // UXTH (64)
+ (true, 8, 64) => 0b100_100110_1_000000_000111, // SXTB (64)
+ (true, 16, 64) => 0b100_100110_1_000000_001111, // SXTH (64)
+ // 32-to-64: the unsigned case is a 'mov' (special-cased below).
+ (false, 32, 64) => 0, // MOV
+ (true, 32, 64) => 0b100_100110_1_000000_011111, // SXTW (64)
+ _ => panic!(
+ "Unsupported extend combination: signed = {}, from_bits = {}, to_bits = {}",
+ signed, from_bits, to_bits
+ ),
+ };
+ if top22 != 0 {
+ sink.put4(enc_extend(top22, rd, rn));
+ } else {
+ Inst::mov32(rd, rn).emit(sink, emit_info, state);
+ }
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ } if from_bits == 1 && signed => {
+ assert!(to_bits <= 64);
+ // Reduce sign-extend-from-1-bit to:
+ // - and rd, rn, #1
+ // - sub rd, zr, rd
+
+ // We don't have ImmLogic yet, so we just hardcode this. FIXME.
+ sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()));
+ let sub_inst = Inst::AluRRR {
+ alu_op: ALUOp::Sub64,
+ rd,
+ rn: zero_reg(),
+ rm: rd.to_reg(),
+ };
+ sub_inst.emit(sink, emit_info, state);
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ } if from_bits == 1 && !signed => {
+ assert!(to_bits <= 64);
+ // Reduce zero-extend-from-1-bit to:
+ // - and rd, rn, #1
+
+ // We don't have ImmLogic yet, so we just hardcode this. FIXME.
+ sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()));
+ }
+ &Inst::Extend { .. } => {
+ panic!("Unsupported extend variant");
+ }
+ &Inst::Jump { ref dest } => {
+ let off = sink.cur_offset();
+ // If the jump target is a label, record the use so that a fixup can occur later.
+ if let Some(l) = dest.as_label() {
+ sink.use_label_at_offset(off, l, LabelUse::Branch26);
+ sink.add_uncond_branch(off, off + 4, l);
+ }
+ // Emit the jump itself.
+ sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero()));
+ }
+ &Inst::Ret => {
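+ // RET, returning to the address in x30 (the link register).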
+ sink.put4(0xd65f03c0);
+ }
+ &Inst::EpiloguePlaceholder => {
+ // Noop; this is just a placeholder for epilogues.
+ }
+ &Inst::Call { ref info } => {
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
+ }
+ let loc = state.cur_srcloc();
+ sink.add_reloc(loc, Reloc::Arm64Call, &info.dest, 0);
+ sink.put4(enc_jump26(0b100101, 0));
+ if info.opcode.is_call() {
+ sink.add_call_site(loc, info.opcode);
+ }
+ }
+ &Inst::CallInd { ref info } => {
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
+ }
+ sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(info.rn) << 5));
+ let loc = state.cur_srcloc();
+ if info.opcode.is_call() {
+ sink.add_call_site(loc, info.opcode);
+ }
+ }
+ &Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ } => {
+ // Conditional part first.
+ let cond_off = sink.cur_offset();
+ if let Some(l) = taken.as_label() {
+ sink.use_label_at_offset(cond_off, l, LabelUse::Branch19);
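+ // Record the inverted form so the MachBuffer can flip this conditional
+ // branch during branch simplification if needed.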
+ let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes();
+ sink.add_cond_branch(cond_off, cond_off + 4, l, &inverted[..]);
+ }
+ sink.put4(enc_conditional_br(taken, kind));
+
+ // Unconditional part next.
+ let uncond_off = sink.cur_offset();
+ if let Some(l) = not_taken.as_label() {
+ sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26);
+ sink.add_uncond_branch(uncond_off, uncond_off + 4, l);
+ }
+ sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero()));
+ }
+ &Inst::TrapIf { kind, trap_code } => {
+ // condbr KIND, LABEL
+ let off = sink.cur_offset();
+ let label = sink.get_label();
+ sink.put4(enc_conditional_br(
+ BranchTarget::Label(label),
+ kind.invert(),
+ ));
+ sink.use_label_at_offset(off, label, LabelUse::Branch19);
+ // udf
+ let trap = Inst::Udf { trap_code };
+ trap.emit(sink, emit_info, state);
+ // LABEL:
+ sink.bind_label(label);
+ }
+ &Inst::IndirectBr { rn, .. } => {
+ sink.put4(enc_br(rn));
+ }
+ &Inst::Nop0 => {}
+ &Inst::Nop4 => {
+ sink.put4(0xd503201f);
+ }
+ &Inst::Brk => {
+ sink.put4(0xd4200000);
+ }
+ &Inst::Udf { trap_code } => {
+ let srcloc = state.cur_srcloc();
+ sink.add_trap(srcloc, trap_code);
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
+ }
+ sink.put4(0xd4a00000);
+ }
+ &Inst::Adr { rd, off } => {
+ assert!(off > -(1 << 20));
+ assert!(off < (1 << 20));
+ sink.put4(enc_adr(off, rd));
+ }
+ &Inst::Word4 { data } => {
+ sink.put4(data);
+ }
+ &Inst::Word8 { data } => {
+ sink.put8(data);
+ }
+ &Inst::JTSequence {
+ ridx,
+ rtmp1,
+ rtmp2,
+ ref info,
+ ..
+ } => {
+ // This sequence is *one* instruction in the vcode, and is expanded only here at
+ // emission time, because we cannot allow the regalloc to insert spills/reloads in
+ // the middle; we depend on hardcoded PC-rel addressing below.
+
+ // Branch to default when condition code from prior comparison indicates.
+ let br = enc_conditional_br(info.default_target, CondBrKind::Cond(Cond::Hs));
+ // No need to inform the sink's branch folding logic about this branch, because it
+ // will not be merged with any other branch, flipped, or elided (it is not preceded
+ // or succeeded by any other branch). Just emit it with the label use.
+ let default_br_offset = sink.cur_offset();
+ if let BranchTarget::Label(l) = info.default_target {
+ sink.use_label_at_offset(default_br_offset, l, LabelUse::Branch19);
+ }
+ sink.put4(br);
+
+ // Save index in a tmp (the live range of ridx only goes to start of this
+ // sequence; rtmp1 or rtmp2 may overwrite it).
+ let inst = Inst::gen_move(rtmp2, ridx, I64);
+ inst.emit(sink, emit_info, state);
+ // Load address of jump table
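+ // (The offset of 16 covers this adr plus the following ldrsw, add and br,
+ // 4 bytes each, so rtmp1 points at the first jump-table entry emitted below.)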
+ let inst = Inst::Adr { rd: rtmp1, off: 16 };
+ inst.emit(sink, emit_info, state);
+ // Load value out of jump table
+ let inst = Inst::SLoad32 {
+ rd: rtmp2,
+ mem: AMode::reg_plus_reg_scaled_extended(
+ rtmp1.to_reg(),
+ rtmp2.to_reg(),
+ I32,
+ ExtendOp::UXTW,
+ ),
+ flags: MemFlags::trusted(),
+ };
+ inst.emit(sink, emit_info, state);
+ // Add base of jump table to jump-table-sourced block offset
+ let inst = Inst::AluRRR {
+ alu_op: ALUOp::Add64,
+ rd: rtmp1,
+ rn: rtmp1.to_reg(),
+ rm: rtmp2.to_reg(),
+ };
+ inst.emit(sink, emit_info, state);
+ // Branch to computed address. (`targets` here is only used for successor queries
+ // and is not needed for emission.)
+ let inst = Inst::IndirectBr {
+ rn: rtmp1.to_reg(),
+ targets: vec![],
+ };
+ inst.emit(sink, emit_info, state);
+ // Emit jump table (table of 32-bit offsets).
+ let jt_off = sink.cur_offset();
+ for &target in info.targets.iter() {
+ let word_off = sink.cur_offset();
+ // off_into_table is an addend here embedded in the label to be later patched
+ // at the end of codegen. The offset is initially relative to this jump table
+ // entry; with the extra addend, it'll be relative to the jump table's start,
+ // after patching.
+ let off_into_table = word_off - jt_off;
+ sink.use_label_at_offset(
+ word_off,
+ target.as_label().unwrap(),
+ LabelUse::PCRel32,
+ );
+ sink.put4(off_into_table);
+ }
+
+ // Lowering produces an EmitIsland before using a JTSequence, so we can safely
+ // disable the worst-case-size check in this case.
+ start_off = sink.cur_offset();
+ }
+ &Inst::LoadExtName {
+ rd,
+ ref name,
+ offset,
+ } => {
+ let inst = Inst::ULoad64 {
+ rd,
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ };
+ inst.emit(sink, emit_info, state);
+ let inst = Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(12),
+ };
+ inst.emit(sink, emit_info, state);
+ let srcloc = state.cur_srcloc();
+ sink.add_reloc(srcloc, Reloc::Abs8, name, offset);
+ if emit_info.flags().emit_all_ones_funcaddrs() {
+ sink.put8(u64::max_value());
+ } else {
+ sink.put8(0);
+ }
+ }
+ &Inst::LoadAddr { rd, ref mem } => {
+ let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state);
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+
+ let (reg, index_reg, offset) = match mem {
+ AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+ AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+ AMode::UnsignedOffset(r, uimm12scaled) => {
+ (r, None, uimm12scaled.value() as i32)
+ }
+ _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
+ };
+ let abs_offset = if offset < 0 {
+ -offset as u64
+ } else {
+ offset as u64
+ };
+ let alu_op = if offset < 0 {
+ ALUOp::Sub64
+ } else {
+ ALUOp::Add64
+ };
+
+ if let Some((idx, extendop)) = index_reg {
+ let add = Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd,
+ rn: reg,
+ rm: idx,
+ extendop,
+ };
+
+ add.emit(sink, emit_info, state);
+ } else if offset == 0 {
+ if reg != rd.to_reg() {
+ let mov = Inst::mov(rd, reg);
+
+ mov.emit(sink, emit_info, state);
+ }
+ } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
+ let add = Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn: reg,
+ imm12,
+ };
+ add.emit(sink, emit_info, state);
+ } else {
+ // Use `tmp2` here: `reg` may be `spilltmp` if the `AMode` on this instruction
+ // was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note
+ // that no other instructions will be inserted here (we're emitting directly),
+ // and a live range of `tmp2` should not span this instruction, so this use
+ // should otherwise be correct.
+ debug_assert!(rd.to_reg() != tmp2_reg());
+ debug_assert!(reg != tmp2_reg());
+ let tmp = writable_tmp2_reg();
+ for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
+ insn.emit(sink, emit_info, state);
+ }
+ let add = Inst::AluRRR {
+ alu_op,
+ rd,
+ rn: reg,
+ rm: tmp.to_reg(),
+ };
+ add.emit(sink, emit_info, state);
+ }
+ }
+ &Inst::VirtualSPOffsetAdj { offset } => {
+ debug!(
+ "virtual sp offset adjusted by {} -> {}",
+ offset,
+ state.virtual_sp_offset + offset,
+ );
+ state.virtual_sp_offset += offset;
+ }
+ &Inst::EmitIsland { needed_space } => {
+ if sink.island_needed(needed_space + 4) {
+ let jump_around_label = sink.get_label();
+ let jmp = Inst::Jump {
+ dest: BranchTarget::Label(jump_around_label),
+ };
+ jmp.emit(sink, emit_info, state);
+ sink.emit_island();
+ sink.bind_label(jump_around_label);
+ }
+ }
+ }
+
+ let end_off = sink.cur_offset();
+ debug_assert!((end_off - start_off) <= Inst::worst_case_size());
+
+ state.clear_post_insn();
+ }
+
+ fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String {
+ self.print_with_state(mb_rru, state)
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs
new file mode 100644
index 0000000000..eb31963b5d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -0,0 +1,5143 @@
+use crate::ir::types::*;
+use crate::isa::aarch64::inst::*;
+use crate::isa::test_utils;
+use crate::isa::CallConv;
+use crate::settings;
+
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+
+#[test]
+fn test_aarch64_binemit() {
+ let mut insns = Vec::<(Inst, &str, &str)>::new();
+
+ // N.B.: the architecture is little-endian, so when transcribing the 32-bit
+ // hex instructions from e.g. objdump disassembly, one must reverse the byte
+ // order to obtain the strings seen below. (E.g., a `ret` is normally written
+ // as the u32 `D65F03C0`, but we write it here as C0035FD6.)
+
+ // Useful helper script to produce the encodings from the text:
+ //
+ // #!/bin/sh
+ // tmp=`mktemp /tmp/XXXXXXXX.o`
+ // aarch64-linux-gnu-as /dev/stdin -o $tmp
+ // aarch64-linux-gnu-objdump -d $tmp
+ // rm -f $tmp
+ //
+ // Then:
+ //
+ // $ echo "mov x1, x2" | aarch64inst.sh
+ insns.push((Inst::Ret, "C0035FD6", "ret"));
+ insns.push((Inst::Nop0, "", "nop-zero-len"));
+ insns.push((Inst::Nop4, "1F2003D5", "nop"));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Add32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100030B",
+ "add w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Add64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400068B",
+ "add x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Sub32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100034B",
+ "sub w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Sub64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40006CB",
+ "sub x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Orr32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100032A",
+ "orr w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Orr64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40006AA",
+ "orr x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::And32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100030A",
+ "and w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::And64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400068A",
+ "and x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::SubS32,
+ rd: writable_zero_reg(),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "5F00036B",
+ // TODO: Display as cmp
+ "subs wzr, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::SubS32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100036B",
+ "subs w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::SubS64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40006EB",
+ "subs x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::AddS32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "4100032B",
+ "adds w1, w2, w3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::AddS64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40006AB",
+ "adds x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::AddS64,
+ rd: writable_zero_reg(),
+ rn: xreg(5),
+ imm12: Imm12::maybe_from_u64(1).unwrap(),
+ },
+ "BF0400B1",
+ // TODO: Display as cmn.
+ "adds xzr, x5, #1",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::SDiv64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40CC69A",
+ "sdiv x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::UDiv64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A408C69A",
+ "udiv x4, x5, x6",
+ ));
+
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Eor32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400064A",
+ "eor w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Eor64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40006CA",
+ "eor x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::AndNot32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400260A",
+ "bic w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::AndNot64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400268A",
+ "bic x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::OrrNot32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400262A",
+ "orn w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::OrrNot64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40026AA",
+ "orn x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::EorNot32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A400264A",
+ "eon w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::EorNot64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A40026CA",
+ "eon x4, x5, x6",
+ ));
+
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::RotR32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A42CC61A",
+ "ror w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::RotR64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A42CC69A",
+ "ror x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsr32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A424C61A",
+ "lsr w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsr64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A424C69A",
+ "lsr x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Asr32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A428C61A",
+ "asr w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Asr64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A428C69A",
+ "asr x4, x5, x6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsl32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A420C61A",
+ "lsl w4, w5, w6",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsl64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ rm: xreg(6),
+ },
+ "A420C69A",
+ "lsl x4, x5, x6",
+ ));
+
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add32,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D0411",
+ "add w7, w8, #291",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add32,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: true,
+ },
+ },
+ "078D4411",
+ "add w7, w8, #1191936",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D0491",
+ "add x7, x8, #291",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub32,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D0451",
+ "sub w7, w8, #291",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub64,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D04D1",
+ "sub x7, x8, #291",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::SubS32,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D0471",
+ "subs w7, w8, #291",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::SubS64,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ imm12: Imm12 {
+ bits: 0x123,
+ shift12: false,
+ },
+ },
+ "078D04F1",
+ "subs x7, x8, #291",
+ ));
+
+ insns.push((
+ Inst::AluRRRExtend {
+ alu_op: ALUOp::Add32,
+ rd: writable_xreg(7),
+ rn: xreg(8),
+ rm: xreg(9),
+ extendop: ExtendOp::SXTB,
+ },
+ "0781290B",
+ "add w7, w8, w9, SXTB",
+ ));
+
+ insns.push((
+ Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd: writable_xreg(15),
+ rn: xreg(16),
+ rm: xreg(17),
+ extendop: ExtendOp::UXTB,
+ },
+ "0F02318B",
+ "add x15, x16, x17, UXTB",
+ ));
+
+ insns.push((
+ Inst::AluRRRExtend {
+ alu_op: ALUOp::Sub32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ extendop: ExtendOp::SXTH,
+ },
+ "41A0234B",
+ "sub w1, w2, w3, SXTH",
+ ));
+
+ insns.push((
+ Inst::AluRRRExtend {
+ alu_op: ALUOp::Sub64,
+ rd: writable_xreg(20),
+ rn: xreg(21),
+ rm: xreg(22),
+ extendop: ExtendOp::UXTW,
+ },
+ "B44236CB",
+ "sub x20, x21, x22, UXTW",
+ ));
+
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(20).unwrap(),
+ ),
+ },
+ "6A510C0B",
+ "add w10, w11, w12, LSL 20",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::ASR,
+ ShiftOpShiftImm::maybe_from_shift(42).unwrap(),
+ ),
+ },
+ "6AA98C8B",
+ "add x10, x11, x12, ASR 42",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sub32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C4B",
+ "sub w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sub64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0CCB",
+ "sub x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orr32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C2A",
+ "orr w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orr64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0CAA",
+ "orr x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::And32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C0A",
+ "and w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::And64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C8A",
+ "and x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Eor32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C4A",
+ "eor w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Eor64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0CCA",
+ "eor x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::OrrNot32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2C2A",
+ "orn w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::OrrNot64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2CAA",
+ "orn x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::AndNot32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2C0A",
+ "bic w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::AndNot64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2C8A",
+ "bic x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::EorNot32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2C4A",
+ "eon w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::EorNot64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D2CCA",
+ "eon x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::AddS32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C2B",
+ "adds w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::AddS64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0CAB",
+ "adds x10, x11, x12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::SubS32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0C6B",
+ "subs w10, w11, w12, LSL 23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::SubS64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ rm: xreg(12),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ ),
+ },
+ "6A5D0CEB",
+ "subs x10, x11, x12, LSL 23",
+ ));
+
+ insns.push((
+ Inst::AluRRRExtend {
+ alu_op: ALUOp::SubS64,
+ rd: writable_zero_reg(),
+ rn: stack_reg(),
+ rm: xreg(12),
+ extendop: ExtendOp::UXTX,
+ },
+ "FF632CEB",
+ "subs xzr, sp, x12, UXTX",
+ ));
+
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp3::MAdd32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ ra: xreg(4),
+ },
+ "4110031B",
+ "madd w1, w2, w3, w4",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp3::MAdd64,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ ra: xreg(4),
+ },
+ "4110039B",
+ "madd x1, x2, x3, x4",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp3::MSub32,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ ra: xreg(4),
+ },
+ "4190031B",
+ "msub w1, w2, w3, w4",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp3::MSub64,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ ra: xreg(4),
+ },
+ "4190039B",
+ "msub x1, x2, x3, x4",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::SMulH,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "417C439B",
+ "smulh x1, x2, x3",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::UMulH,
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ rm: xreg(3),
+ },
+ "417CC39B",
+ "umulh x1, x2, x3",
+ ));
+
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::RotR32,
+ rd: writable_xreg(20),
+ rn: xreg(21),
+ immshift: ImmShift::maybe_from_u64(19).unwrap(),
+ },
+ "B44E9513",
+ "ror w20, w21, #19",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::RotR64,
+ rd: writable_xreg(20),
+ rn: xreg(21),
+ immshift: ImmShift::maybe_from_u64(42).unwrap(),
+ },
+ "B4AAD593",
+ "ror x20, x21, #42",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ immshift: ImmShift::maybe_from_u64(13).unwrap(),
+ },
+ "6A7D0D53",
+ "lsr w10, w11, #13",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ immshift: ImmShift::maybe_from_u64(57).unwrap(),
+ },
+ "6AFD79D3",
+ "lsr x10, x11, #57",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Asr32,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ immshift: ImmShift::maybe_from_u64(7).unwrap(),
+ },
+ "A47C0713",
+ "asr w4, w5, #7",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Asr64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ immshift: ImmShift::maybe_from_u64(35).unwrap(),
+ },
+ "A4FC6393",
+ "asr x4, x5, #35",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl32,
+ rd: writable_xreg(8),
+ rn: xreg(9),
+ immshift: ImmShift::maybe_from_u64(24).unwrap(),
+ },
+ "281D0853",
+ "lsl w8, w9, #24",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl64,
+ rd: writable_xreg(8),
+ rn: xreg(9),
+ immshift: ImmShift::maybe_from_u64(63).unwrap(),
+ },
+ "280141D3",
+ "lsl x8, x9, #63",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl32,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ immshift: ImmShift::maybe_from_u64(0).unwrap(),
+ },
+ "6A7D0053",
+ "lsl w10, w11, #0",
+ ));
+ insns.push((
+ Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl64,
+ rd: writable_xreg(10),
+ rn: xreg(11),
+ immshift: ImmShift::maybe_from_u64(0).unwrap(),
+ },
+ "6AFD40D3",
+ "lsl x10, x11, #0",
+ ));
+
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::And32,
+ rd: writable_xreg(21),
+ rn: xreg(27),
+ imml: ImmLogic::maybe_from_u64(0x80003fff, I32).unwrap(),
+ },
+ "753B0112",
+ "and w21, w27, #2147500031",
+ ));
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd: writable_xreg(7),
+ rn: xreg(6),
+ imml: ImmLogic::maybe_from_u64(0x3fff80003fff800, I64).unwrap(),
+ },
+ "C7381592",
+ "and x7, x6, #288221580125796352",
+ ));
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::Orr32,
+ rd: writable_xreg(1),
+ rn: xreg(5),
+ imml: ImmLogic::maybe_from_u64(0x100000, I32).unwrap(),
+ },
+ "A1000C32",
+ "orr w1, w5, #1048576",
+ ));
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::Orr64,
+ rd: writable_xreg(4),
+ rn: xreg(5),
+ imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(),
+ },
+ "A4C401B2",
+ "orr x4, x5, #9331882296111890817",
+ ));
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::Eor32,
+ rd: writable_xreg(1),
+ rn: xreg(5),
+ imml: ImmLogic::maybe_from_u64(0x00007fff, I32).unwrap(),
+ },
+ "A1380052",
+ "eor w1, w5, #32767",
+ ));
+ insns.push((
+ Inst::AluRRImmLogic {
+ alu_op: ALUOp::Eor64,
+ rd: writable_xreg(10),
+ rn: xreg(8),
+ imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(),
+ },
+ "0AC501D2",
+ "eor x10, x8, #9331882296111890817",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::RBit32,
+ rd: writable_xreg(1),
+ rn: xreg(10),
+ },
+ "4101C05A",
+ "rbit w1, w10",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::RBit64,
+ rd: writable_xreg(1),
+ rn: xreg(10),
+ },
+ "4101C0DA",
+ "rbit x1, x10",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::Clz32,
+ rd: writable_xreg(15),
+ rn: xreg(3),
+ },
+ "6F10C05A",
+ "clz w15, w3",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::Clz64,
+ rd: writable_xreg(15),
+ rn: xreg(3),
+ },
+ "6F10C0DA",
+ "clz x15, x3",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::Cls32,
+ rd: writable_xreg(21),
+ rn: xreg(16),
+ },
+ "1516C05A",
+ "cls w21, w16",
+ ));
+
+ insns.push((
+ Inst::BitRR {
+ op: BitOp::Cls64,
+ rd: writable_xreg(21),
+ rn: xreg(16),
+ },
+ "1516C0DA",
+ "cls x21, x16",
+ ));
+
+ insns.push((
+ Inst::ULoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "41004038",
+ "ldurb w1, [x2]",
+ ));
+ insns.push((
+ Inst::ULoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::zero(I8)),
+ flags: MemFlags::trusted(),
+ },
+ "41004039",
+ "ldrb w1, [x2]",
+ ));
+ insns.push((
+ Inst::ULoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::RegReg(xreg(2), xreg(5)),
+ flags: MemFlags::trusted(),
+ },
+ "41686538",
+ "ldrb w1, [x2, x5]",
+ ));
+ insns.push((
+ Inst::SLoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "41008038",
+ "ldursb x1, [x2]",
+ ));
+ insns.push((
+ Inst::SLoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(63, I8).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC8039",
+ "ldrsb x1, [x2, #63]",
+ ));
+ insns.push((
+ Inst::SLoad8 {
+ rd: writable_xreg(1),
+ mem: AMode::RegReg(xreg(2), xreg(5)),
+ flags: MemFlags::trusted(),
+ },
+ "4168A538",
+ "ldrsb x1, [x2, x5]",
+ ));
+ insns.push((
+ Inst::ULoad16 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(5).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41504078",
+ "ldurh w1, [x2, #5]",
+ ));
+ insns.push((
+ Inst::ULoad16 {
+ rd: writable_xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8, I16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41104079",
+ "ldrh w1, [x2, #8]",
+ ));
+ insns.push((
+ Inst::ULoad16 {
+ rd: writable_xreg(1),
+ mem: AMode::RegScaled(xreg(2), xreg(3), I16),
+ flags: MemFlags::trusted(),
+ },
+ "41786378",
+ "ldrh w1, [x2, x3, LSL #1]",
+ ));
+ insns.push((
+ Inst::SLoad16 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "41008078",
+ "ldursh x1, [x2]",
+ ));
+ insns.push((
+ Inst::SLoad16 {
+ rd: writable_xreg(28),
+ mem: AMode::UnsignedOffset(xreg(20), UImm12Scaled::maybe_from_i64(24, I16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "9C328079",
+ "ldrsh x28, [x20, #24]",
+ ));
+ insns.push((
+ Inst::SLoad16 {
+ rd: writable_xreg(28),
+ mem: AMode::RegScaled(xreg(20), xreg(20), I16),
+ flags: MemFlags::trusted(),
+ },
+ "9C7AB478",
+ "ldrsh x28, [x20, x20, LSL #1]",
+ ));
+ insns.push((
+ Inst::ULoad32 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "410040B8",
+ "ldur w1, [x2]",
+ ));
+ insns.push((
+ Inst::ULoad32 {
+ rd: writable_xreg(12),
+ mem: AMode::UnsignedOffset(xreg(0), UImm12Scaled::maybe_from_i64(204, I32).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "0CCC40B9",
+ "ldr w12, [x0, #204]",
+ ));
+ insns.push((
+ Inst::ULoad32 {
+ rd: writable_xreg(1),
+ mem: AMode::RegScaled(xreg(2), xreg(12), I32),
+ flags: MemFlags::trusted(),
+ },
+ "41786CB8",
+ "ldr w1, [x2, x12, LSL #2]",
+ ));
+ insns.push((
+ Inst::SLoad32 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "410080B8",
+ "ldursw x1, [x2]",
+ ));
+ insns.push((
+ Inst::SLoad32 {
+ rd: writable_xreg(12),
+ mem: AMode::UnsignedOffset(xreg(1), UImm12Scaled::maybe_from_i64(16380, I32).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "2CFCBFB9",
+ "ldrsw x12, [x1, #16380]",
+ ));
+ insns.push((
+ Inst::SLoad32 {
+ rd: writable_xreg(1),
+ mem: AMode::RegScaled(xreg(5), xreg(1), I32),
+ flags: MemFlags::trusted(),
+ },
+ "A178A1B8",
+ "ldrsw x1, [x5, x1, LSL #2]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "410040F8",
+ "ldur x1, [x2]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(-256).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "410050F8",
+ "ldur x1, [x2, #-256]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(255).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41F04FF8",
+ "ldur x1, [x2, #255]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(32760, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC7FF9",
+ "ldr x1, [x2, #32760]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegReg(xreg(2), xreg(3)),
+ flags: MemFlags::trusted(),
+ },
+ "416863F8",
+ "ldr x1, [x2, x3]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegScaled(xreg(2), xreg(3), I64),
+ flags: MemFlags::trusted(),
+ },
+ "417863F8",
+ "ldr x1, [x2, x3, LSL #3]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::SXTW),
+ flags: MemFlags::trusted(),
+ },
+ "41D863F8",
+ "ldr x1, [x2, w3, SXTW #3]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegExtended(xreg(2), xreg(3), ExtendOp::SXTW),
+ flags: MemFlags::trusted(),
+ },
+ "41C863F8",
+ "ldr x1, [x2, w3, SXTW]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::Label(MemLabel::PCRel(64)),
+ flags: MemFlags::trusted(),
+ },
+ "01020058",
+ "ldr x1, pc+64",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "410C41F8",
+ "ldr x1, [x2, #16]!",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "410441F8",
+ "ldr x1, [x2], #16",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::FPOffset(32768, I8),
+ flags: MemFlags::trusted(),
+ },
+ "100090D2B063308B010240F9",
+ "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::FPOffset(-32768, I8),
+ flags: MemFlags::trusted(),
+ },
+ "F0FF8F92B063308B010240F9",
+ "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::FPOffset(1048576, I8), // 2^20
+ flags: MemFlags::trusted(),
+ },
+ "1002A0D2B063308B010240F9",
+ "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+ ));
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::FPOffset(1048576 + 1, I8), // 2^20 + 1
+ flags: MemFlags::trusted(),
+ },
+ "300080521002A072B063308B010240F9",
+ "movz w16, #1 ; movk w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+ ));
+
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegOffset(xreg(7), 8, I64),
+ flags: MemFlags::trusted(),
+ },
+ "E18040F8",
+ "ldur x1, [x7, #8]",
+ ));
+
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegOffset(xreg(7), 1024, I64),
+ flags: MemFlags::trusted(),
+ },
+ "E10042F9",
+ "ldr x1, [x7, #1024]",
+ ));
+
+ insns.push((
+ Inst::ULoad64 {
+ rd: writable_xreg(1),
+ mem: AMode::RegOffset(xreg(7), 1048576, I64),
+ flags: MemFlags::trusted(),
+ },
+ "1002A0D2F060308B010240F9",
+ "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
+ ));
+
+ insns.push((
+ Inst::Store8 {
+ rd: xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "41000038",
+ "sturb w1, [x2]",
+ ));
+ insns.push((
+ Inst::Store8 {
+ rd: xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(4095, I8).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC3F39",
+ "strb w1, [x2, #4095]",
+ ));
+ insns.push((
+ Inst::Store16 {
+ rd: xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "41000078",
+ "sturh w1, [x2]",
+ ));
+ insns.push((
+ Inst::Store16 {
+ rd: xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8190, I16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC3F79",
+ "strh w1, [x2, #8190]",
+ ));
+ insns.push((
+ Inst::Store32 {
+ rd: xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "410000B8",
+ "stur w1, [x2]",
+ ));
+ insns.push((
+ Inst::Store32 {
+ rd: xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(16380, I32).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC3FB9",
+ "str w1, [x2, #16380]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::Unscaled(xreg(2), SImm9::zero()),
+ flags: MemFlags::trusted(),
+ },
+ "410000F8",
+ "stur x1, [x2]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(32760, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "41FC3FF9",
+ "str x1, [x2, #32760]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::RegReg(xreg(2), xreg(3)),
+ flags: MemFlags::trusted(),
+ },
+ "416823F8",
+ "str x1, [x2, x3]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::RegScaled(xreg(2), xreg(3), I64),
+ flags: MemFlags::trusted(),
+ },
+ "417823F8",
+ "str x1, [x2, x3, LSL #3]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::UXTW),
+ flags: MemFlags::trusted(),
+ },
+ "415823F8",
+ "str x1, [x2, w3, UXTW #3]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::RegExtended(xreg(2), xreg(3), ExtendOp::UXTW),
+ flags: MemFlags::trusted(),
+ },
+ "414823F8",
+ "str x1, [x2, w3, UXTW]",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "410C01F8",
+ "str x1, [x2, #16]!",
+ ));
+ insns.push((
+ Inst::Store64 {
+ rd: xreg(1),
+ mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "410401F8",
+ "str x1, [x2], #16",
+ ));
+
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(8),
+ rt2: xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::zero(I64)),
+ flags: MemFlags::trusted(),
+ },
+ "482500A9",
+ "stp x8, x9, [x10]",
+ ));
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(8),
+ rt2: xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(504, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "48A51FA9",
+ "stp x8, x9, [x10, #504]",
+ ));
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(8),
+ rt2: xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(-64, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "48253CA9",
+ "stp x8, x9, [x10, #-64]",
+ ));
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(21),
+ rt2: xreg(28),
+ mem: PairAMode::SignedOffset(xreg(1), SImm7Scaled::maybe_from_i64(-512, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "357020A9",
+ "stp x21, x28, [x1, #-512]",
+ ));
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(8),
+ rt2: xreg(9),
+ mem: PairAMode::PreIndexed(
+ writable_xreg(10),
+ SImm7Scaled::maybe_from_i64(-64, I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ },
+ "4825BCA9",
+ "stp x8, x9, [x10, #-64]!",
+ ));
+ insns.push((
+ Inst::StoreP64 {
+ rt: xreg(15),
+ rt2: xreg(16),
+ mem: PairAMode::PostIndexed(
+ writable_xreg(20),
+ SImm7Scaled::maybe_from_i64(504, I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ },
+ "8FC29FA8",
+ "stp x15, x16, [x20], #504",
+ ));
+
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::zero(I64)),
+ flags: MemFlags::trusted(),
+ },
+ "482540A9",
+ "ldp x8, x9, [x10]",
+ ));
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(504, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "48A55FA9",
+ "ldp x8, x9, [x10, #504]",
+ ));
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(-64, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "48257CA9",
+ "ldp x8, x9, [x10, #-64]",
+ ));
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(9),
+ mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(-512, I64).unwrap()),
+ flags: MemFlags::trusted(),
+ },
+ "482560A9",
+ "ldp x8, x9, [x10, #-512]",
+ ));
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(9),
+ mem: PairAMode::PreIndexed(
+ writable_xreg(10),
+ SImm7Scaled::maybe_from_i64(-64, I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ },
+ "4825FCA9",
+ "ldp x8, x9, [x10, #-64]!",
+ ));
+ insns.push((
+ Inst::LoadP64 {
+ rt: writable_xreg(8),
+ rt2: writable_xreg(25),
+ mem: PairAMode::PostIndexed(
+ writable_xreg(12),
+ SImm7Scaled::maybe_from_i64(504, I64).unwrap(),
+ ),
+ flags: MemFlags::trusted(),
+ },
+ "88E5DFA8",
+ "ldp x8, x25, [x12], #504",
+ ));
+
+ insns.push((
+ Inst::Mov64 {
+ rd: writable_xreg(8),
+ rm: xreg(9),
+ },
+ "E80309AA",
+ "mov x8, x9",
+ ));
+ insns.push((
+ Inst::Mov32 {
+ rd: writable_xreg(8),
+ rm: xreg(9),
+ },
+ "E803092A",
+ "mov w8, w9",
+ ));
+
+ insns.push((
+ Inst::MovZ {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FF9FD2",
+ "movz x8, #65535",
+ ));
+ insns.push((
+ Inst::MovZ {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFBFD2",
+ "movz x8, #65535, LSL #16",
+ ));
+ insns.push((
+ Inst::MovZ {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFDFD2",
+ "movz x8, #65535, LSL #32",
+ ));
+ insns.push((
+ Inst::MovZ {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFFFD2",
+ "movz x8, #65535, LSL #48",
+ ));
+ insns.push((
+ Inst::MovZ {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(),
+ size: OperandSize::Size32,
+ },
+ "E8FFBF52",
+ "movz w8, #65535, LSL #16",
+ ));
+
+ insns.push((
+ Inst::MovN {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FF9F92",
+ "movn x8, #65535",
+ ));
+ insns.push((
+ Inst::MovN {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFBF92",
+ "movn x8, #65535, LSL #16",
+ ));
+ insns.push((
+ Inst::MovN {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFDF92",
+ "movn x8, #65535, LSL #32",
+ ));
+ insns.push((
+ Inst::MovN {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFFF92",
+ "movn x8, #65535, LSL #48",
+ ));
+ insns.push((
+ Inst::MovN {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(),
+ size: OperandSize::Size32,
+ },
+ "E8FF9F12",
+ "movn w8, #65535",
+ ));
+
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(12),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "0C0080F2",
+ "movk x12, #0",
+ ));
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(19),
+ imm: MoveWideConst::maybe_with_shift(0x0000, 16).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "1300A0F2",
+ "movk x19, #0, LSL #16",
+ ));
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(3),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E3FF9FF2",
+ "movk x3, #65535",
+ ));
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFBFF2",
+ "movk x8, #65535, LSL #16",
+ ));
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFDFF2",
+ "movk x8, #65535, LSL #32",
+ ));
+ insns.push((
+ Inst::MovK {
+ rd: writable_xreg(8),
+ imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(),
+ size: OperandSize::Size64,
+ },
+ "E8FFFFF2",
+ "movk x8, #65535, LSL #48",
+ ));
+
+ insns.push((
+ Inst::CSel {
+ rd: writable_xreg(10),
+ rn: xreg(12),
+ rm: xreg(14),
+ cond: Cond::Hs,
+ },
+ "8A218E9A",
+ "csel x10, x12, x14, hs",
+ ));
+ insns.push((
+ Inst::CSet {
+ rd: writable_xreg(15),
+ cond: Cond::Ge,
+ },
+ "EFB79F9A",
+ "cset x15, ge",
+ ));
+ insns.push((
+ Inst::CCmpImm {
+ size: OperandSize::Size64,
+ rn: xreg(22),
+ imm: UImm5::maybe_from_u8(5).unwrap(),
+ nzcv: NZCV::new(false, false, true, true),
+ cond: Cond::Eq,
+ },
+ "C30A45FA",
+ "ccmp x22, #5, #nzCV, eq",
+ ));
+ insns.push((
+ Inst::CCmpImm {
+ size: OperandSize::Size32,
+ rn: xreg(3),
+ imm: UImm5::maybe_from_u8(30).unwrap(),
+ nzcv: NZCV::new(true, true, true, true),
+ cond: Cond::Gt,
+ },
+ "6FC85E7A",
+ "ccmp w3, #30, #NZCV, gt",
+ ));
+ insns.push((
+ Inst::MovToFpu {
+ rd: writable_vreg(31),
+ rn: xreg(0),
+ size: ScalarSize::Size64,
+ },
+ "1F00679E",
+ "fmov d31, x0",
+ ));
+ insns.push((
+ Inst::MovToFpu {
+ rd: writable_vreg(1),
+ rn: xreg(28),
+ size: ScalarSize::Size32,
+ },
+ "8103271E",
+ "fmov s1, w28",
+ ));
+ insns.push((
+ Inst::MovToVec {
+ rd: writable_vreg(0),
+ rn: xreg(0),
+ idx: 7,
+ size: VectorSize::Size8x8,
+ },
+ "001C0F4E",
+ "mov v0.b[7], w0",
+ ));
+ insns.push((
+ Inst::MovToVec {
+ rd: writable_vreg(20),
+ rn: xreg(21),
+ idx: 0,
+ size: VectorSize::Size64x2,
+ },
+ "B41E084E",
+ "mov v20.d[0], x21",
+ ));
+ insns.push((
+ Inst::MovFromVec {
+ rd: writable_xreg(3),
+ rn: vreg(27),
+ idx: 14,
+ size: VectorSize::Size8x16,
+ },
+ "633F1D0E",
+ "umov w3, v27.b[14]",
+ ));
+ insns.push((
+ Inst::MovFromVec {
+ rd: writable_xreg(24),
+ rn: vreg(5),
+ idx: 3,
+ size: VectorSize::Size16x8,
+ },
+ "B83C0E0E",
+ "umov w24, v5.h[3]",
+ ));
+ insns.push((
+ Inst::MovFromVec {
+ rd: writable_xreg(12),
+ rn: vreg(17),
+ idx: 1,
+ size: VectorSize::Size32x4,
+ },
+ "2C3E0C0E",
+ "mov w12, v17.s[1]",
+ ));
+ insns.push((
+ Inst::MovFromVec {
+ rd: writable_xreg(21),
+ rn: vreg(20),
+ idx: 0,
+ size: VectorSize::Size64x2,
+ },
+ "953E084E",
+ "mov x21, v20.d[0]",
+ ));
+ insns.push((
+ Inst::MovFromVecSigned {
+ rd: writable_xreg(0),
+ rn: vreg(0),
+ idx: 15,
+ size: VectorSize::Size8x16,
+ scalar_size: OperandSize::Size32,
+ },
+ "002C1F0E",
+ "smov w0, v0.b[15]",
+ ));
+ insns.push((
+ Inst::MovFromVecSigned {
+ rd: writable_xreg(12),
+ rn: vreg(13),
+ idx: 7,
+ size: VectorSize::Size8x8,
+ scalar_size: OperandSize::Size64,
+ },
+ "AC2D0F4E",
+ "smov x12, v13.b[7]",
+ ));
+ insns.push((
+ Inst::MovFromVecSigned {
+ rd: writable_xreg(23),
+ rn: vreg(31),
+ idx: 7,
+ size: VectorSize::Size16x8,
+ scalar_size: OperandSize::Size32,
+ },
+ "F72F1E0E",
+ "smov w23, v31.h[7]",
+ ));
+ insns.push((
+ Inst::MovFromVecSigned {
+ rd: writable_xreg(24),
+ rn: vreg(5),
+ idx: 1,
+ size: VectorSize::Size32x2,
+ scalar_size: OperandSize::Size64,
+ },
+ "B82C0C4E",
+ "smov x24, v5.s[1]",
+ ));
+ insns.push((
+ Inst::MovToNZCV { rn: xreg(13) },
+ "0D421BD5",
+ "msr nzcv, x13",
+ ));
+ insns.push((
+ Inst::MovFromNZCV {
+ rd: writable_xreg(27),
+ },
+ "1B423BD5",
+ "mrs x27, nzcv",
+ ));
+ insns.push((
+ Inst::VecDup {
+ rd: writable_vreg(25),
+ rn: xreg(7),
+ size: VectorSize::Size8x16,
+ },
+ "F90C014E",
+ "dup v25.16b, w7",
+ ));
+ insns.push((
+ Inst::VecDup {
+ rd: writable_vreg(2),
+ rn: xreg(23),
+ size: VectorSize::Size16x8,
+ },
+ "E20E024E",
+ "dup v2.8h, w23",
+ ));
+ insns.push((
+ Inst::VecDup {
+ rd: writable_vreg(0),
+ rn: xreg(28),
+ size: VectorSize::Size32x4,
+ },
+ "800F044E",
+ "dup v0.4s, w28",
+ ));
+ insns.push((
+ Inst::VecDup {
+ rd: writable_vreg(31),
+ rn: xreg(5),
+ size: VectorSize::Size64x2,
+ },
+ "BF0C084E",
+ "dup v31.2d, x5",
+ ));
+ insns.push((
+ Inst::VecDupFromFpu {
+ rd: writable_vreg(14),
+ rn: vreg(19),
+ size: VectorSize::Size32x4,
+ },
+ "6E06044E",
+ "dup v14.4s, v19.s[0]",
+ ));
+ insns.push((
+ Inst::VecDupFromFpu {
+ rd: writable_vreg(18),
+ rn: vreg(10),
+ size: VectorSize::Size64x2,
+ },
+ "5205084E",
+ "dup v18.2d, v10.d[0]",
+ ));
+ insns.push((
+ Inst::VecDupImm {
+ rd: writable_vreg(31),
+ imm: ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap(),
+ invert: false,
+ size: VectorSize::Size8x16,
+ },
+ "FFE7074F",
+ "movi v31.16b, #255",
+ ));
+ insns.push((
+ Inst::VecDupImm {
+ rd: writable_vreg(0),
+ imm: ASIMDMovModImm::zero(),
+ invert: true,
+ size: VectorSize::Size16x4,
+ },
+ "0084002F",
+ "mvni v0.4h, #0",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Sxtl8,
+ rd: writable_vreg(4),
+ rn: vreg(27),
+ high_half: false,
+ },
+ "64A7080F",
+ "sxtl v4.8h, v27.8b",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Sxtl16,
+ rd: writable_vreg(17),
+ rn: vreg(19),
+ high_half: true,
+ },
+ "71A6104F",
+ "sxtl2 v17.4s, v19.8h",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Sxtl32,
+ rd: writable_vreg(30),
+ rn: vreg(6),
+ high_half: false,
+ },
+ "DEA4200F",
+ "sxtl v30.2d, v6.2s",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Uxtl8,
+ rd: writable_vreg(3),
+ rn: vreg(29),
+ high_half: true,
+ },
+ "A3A7086F",
+ "uxtl2 v3.8h, v29.16b",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Uxtl16,
+ rd: writable_vreg(15),
+ rn: vreg(12),
+ high_half: false,
+ },
+ "8FA5102F",
+ "uxtl v15.4s, v12.4h",
+ ));
+ insns.push((
+ Inst::VecExtend {
+ t: VecExtendOp::Uxtl32,
+ rd: writable_vreg(28),
+ rn: vreg(2),
+ high_half: true,
+ },
+ "5CA4206F",
+ "uxtl2 v28.2d, v2.4s",
+ ));
+
+ insns.push((
+ Inst::VecMovElement {
+ rd: writable_vreg(0),
+ rn: vreg(31),
+ dest_idx: 7,
+ src_idx: 7,
+ size: VectorSize::Size16x8,
+ },
+ "E0771E6E",
+ "mov v0.h[7], v31.h[7]",
+ ));
+
+ insns.push((
+ Inst::VecMovElement {
+ rd: writable_vreg(31),
+ rn: vreg(16),
+ dest_idx: 1,
+ src_idx: 0,
+ size: VectorSize::Size32x2,
+ },
+ "1F060C6E",
+ "mov v31.s[1], v16.s[0]",
+ ));
+
+ insns.push((
+ Inst::VecMiscNarrow {
+ op: VecMiscNarrowOp::Xtn,
+ rd: writable_vreg(22),
+ rn: vreg(8),
+ size: VectorSize::Size32x2,
+ high_half: false,
+ },
+ "1629A10E",
+ "xtn v22.2s, v8.2d",
+ ));
+
+ insns.push((
+ Inst::VecMiscNarrow {
+ op: VecMiscNarrowOp::Sqxtn,
+ rd: writable_vreg(31),
+ rn: vreg(0),
+ size: VectorSize::Size16x8,
+ high_half: true,
+ },
+ "1F48614E",
+ "sqxtn2 v31.8h, v0.4s",
+ ));
+
+ insns.push((
+ Inst::VecMiscNarrow {
+ op: VecMiscNarrowOp::Sqxtun,
+ rd: writable_vreg(16),
+ rn: vreg(23),
+ size: VectorSize::Size8x16,
+ high_half: false,
+ },
+ "F02A212E",
+ "sqxtun v16.8b, v23.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqadd,
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "410C284E",
+ "sqadd v1.16b, v2.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqadd,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(28),
+ size: VectorSize::Size16x8,
+ },
+ "810D7C4E",
+ "sqadd v1.8h, v12.8h, v28.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqadd,
+ rd: writable_vreg(12),
+ rn: vreg(2),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "4C0CA64E",
+ "sqadd v12.4s, v2.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqadd,
+ rd: writable_vreg(20),
+ rn: vreg(7),
+ rm: vreg(13),
+ size: VectorSize::Size64x2,
+ },
+ "F40CED4E",
+ "sqadd v20.2d, v7.2d, v13.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqsub,
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "412C284E",
+ "sqsub v1.16b, v2.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqsub,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(28),
+ size: VectorSize::Size16x8,
+ },
+ "812D7C4E",
+ "sqsub v1.8h, v12.8h, v28.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqsub,
+ rd: writable_vreg(12),
+ rn: vreg(2),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "4C2CA64E",
+ "sqsub v12.4s, v2.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sqsub,
+ rd: writable_vreg(20),
+ rn: vreg(7),
+ rm: vreg(13),
+ size: VectorSize::Size64x2,
+ },
+ "F42CED4E",
+ "sqsub v20.2d, v7.2d, v13.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqadd,
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "410C286E",
+ "uqadd v1.16b, v2.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqadd,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(28),
+ size: VectorSize::Size16x8,
+ },
+ "810D7C6E",
+ "uqadd v1.8h, v12.8h, v28.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqadd,
+ rd: writable_vreg(12),
+ rn: vreg(2),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "4C0CA66E",
+ "uqadd v12.4s, v2.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqadd,
+ rd: writable_vreg(20),
+ rn: vreg(7),
+ rm: vreg(13),
+ size: VectorSize::Size64x2,
+ },
+ "F40CED6E",
+ "uqadd v20.2d, v7.2d, v13.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqsub,
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "412C286E",
+ "uqsub v1.16b, v2.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqsub,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(28),
+ size: VectorSize::Size16x8,
+ },
+ "812D7C6E",
+ "uqsub v1.8h, v12.8h, v28.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqsub,
+ rd: writable_vreg(12),
+ rn: vreg(2),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "4C2CA66E",
+ "uqsub v12.4s, v2.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Uqsub,
+ rd: writable_vreg(20),
+ rn: vreg(7),
+ rm: vreg(13),
+ size: VectorSize::Size64x2,
+ },
+ "F42CED6E",
+ "uqsub v20.2d, v7.2d, v13.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmeq,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size8x16,
+ },
+ "E38E386E",
+ "cmeq v3.16b, v23.16b, v24.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmgt,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size8x16,
+ },
+ "E336384E",
+ "cmgt v3.16b, v23.16b, v24.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmge,
+ rd: writable_vreg(23),
+ rn: vreg(9),
+ rm: vreg(12),
+ size: VectorSize::Size8x16,
+ },
+ "373D2C4E",
+ "cmge v23.16b, v9.16b, v12.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhi,
+ rd: writable_vreg(5),
+ rn: vreg(1),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "2534216E",
+ "cmhi v5.16b, v1.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhs,
+ rd: writable_vreg(8),
+ rn: vreg(2),
+ rm: vreg(15),
+ size: VectorSize::Size8x16,
+ },
+ "483C2F6E",
+ "cmhs v8.16b, v2.16b, v15.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmeq,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size16x8,
+ },
+ "E38E786E",
+ "cmeq v3.8h, v23.8h, v24.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmgt,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size16x8,
+ },
+ "E336784E",
+ "cmgt v3.8h, v23.8h, v24.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmge,
+ rd: writable_vreg(23),
+ rn: vreg(9),
+ rm: vreg(12),
+ size: VectorSize::Size16x8,
+ },
+ "373D6C4E",
+ "cmge v23.8h, v9.8h, v12.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhi,
+ rd: writable_vreg(5),
+ rn: vreg(1),
+ rm: vreg(1),
+ size: VectorSize::Size16x8,
+ },
+ "2534616E",
+ "cmhi v5.8h, v1.8h, v1.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhs,
+ rd: writable_vreg(8),
+ rn: vreg(2),
+ rm: vreg(15),
+ size: VectorSize::Size16x8,
+ },
+ "483C6F6E",
+ "cmhs v8.8h, v2.8h, v15.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmeq,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size32x4,
+ },
+ "E38EB86E",
+ "cmeq v3.4s, v23.4s, v24.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmgt,
+ rd: writable_vreg(3),
+ rn: vreg(23),
+ rm: vreg(24),
+ size: VectorSize::Size32x4,
+ },
+ "E336B84E",
+ "cmgt v3.4s, v23.4s, v24.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmge,
+ rd: writable_vreg(23),
+ rn: vreg(9),
+ rm: vreg(12),
+ size: VectorSize::Size32x4,
+ },
+ "373DAC4E",
+ "cmge v23.4s, v9.4s, v12.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhi,
+ rd: writable_vreg(5),
+ rn: vreg(1),
+ rm: vreg(1),
+ size: VectorSize::Size32x4,
+ },
+ "2534A16E",
+ "cmhi v5.4s, v1.4s, v1.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Cmhs,
+ rd: writable_vreg(8),
+ rn: vreg(2),
+ rm: vreg(15),
+ size: VectorSize::Size32x4,
+ },
+ "483CAF6E",
+ "cmhs v8.4s, v2.4s, v15.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fcmeq,
+ rd: writable_vreg(28),
+ rn: vreg(12),
+ rm: vreg(4),
+ size: VectorSize::Size32x2,
+ },
+ "9CE5240E",
+ "fcmeq v28.2s, v12.2s, v4.2s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fcmgt,
+ rd: writable_vreg(3),
+ rn: vreg(16),
+ rm: vreg(31),
+ size: VectorSize::Size64x2,
+ },
+ "03E6FF6E",
+ "fcmgt v3.2d, v16.2d, v31.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fcmge,
+ rd: writable_vreg(18),
+ rn: vreg(23),
+ rm: vreg(0),
+ size: VectorSize::Size64x2,
+ },
+ "F2E6606E",
+ "fcmge v18.2d, v23.2d, v0.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::And,
+ rd: writable_vreg(20),
+ rn: vreg(19),
+ rm: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "741E324E",
+ "and v20.16b, v19.16b, v18.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Bic,
+ rd: writable_vreg(8),
+ rn: vreg(11),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "681D614E",
+ "bic v8.16b, v11.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Orr,
+ rd: writable_vreg(15),
+ rn: vreg(2),
+ rm: vreg(12),
+ size: VectorSize::Size16x8,
+ },
+ "4F1CAC4E",
+ "orr v15.16b, v2.16b, v12.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Eor,
+ rd: writable_vreg(18),
+ rn: vreg(3),
+ rm: vreg(22),
+ size: VectorSize::Size8x16,
+ },
+ "721C366E",
+ "eor v18.16b, v3.16b, v22.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Bsl,
+ rd: writable_vreg(8),
+ rn: vreg(9),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "281D616E",
+ "bsl v8.16b, v9.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umaxp,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "88A5216E",
+ "umaxp v8.16b, v12.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umaxp,
+ rd: writable_vreg(1),
+ rn: vreg(6),
+ rm: vreg(1),
+ size: VectorSize::Size16x8,
+ },
+ "C1A4616E",
+ "umaxp v1.8h, v6.8h, v1.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umaxp,
+ rd: writable_vreg(1),
+ rn: vreg(20),
+ rm: vreg(16),
+ size: VectorSize::Size32x4,
+ },
+ "81A6B06E",
+ "umaxp v1.4s, v20.4s, v16.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Add,
+ rd: writable_vreg(5),
+ rn: vreg(1),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "2584214E",
+ "add v5.16b, v1.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Add,
+ rd: writable_vreg(7),
+ rn: vreg(13),
+ rm: vreg(2),
+ size: VectorSize::Size16x8,
+ },
+ "A785624E",
+ "add v7.8h, v13.8h, v2.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Add,
+ rd: writable_vreg(18),
+ rn: vreg(9),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "3285A64E",
+ "add v18.4s, v9.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Add,
+ rd: writable_vreg(1),
+ rn: vreg(3),
+ rm: vreg(2),
+ size: VectorSize::Size64x2,
+ },
+ "6184E24E",
+ "add v1.2d, v3.2d, v2.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sub,
+ rd: writable_vreg(5),
+ rn: vreg(1),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "2584216E",
+ "sub v5.16b, v1.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sub,
+ rd: writable_vreg(7),
+ rn: vreg(13),
+ rm: vreg(2),
+ size: VectorSize::Size16x8,
+ },
+ "A785626E",
+ "sub v7.8h, v13.8h, v2.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sub,
+ rd: writable_vreg(18),
+ rn: vreg(9),
+ rm: vreg(6),
+ size: VectorSize::Size32x4,
+ },
+ "3285A66E",
+ "sub v18.4s, v9.4s, v6.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sub,
+ rd: writable_vreg(18),
+ rn: vreg(0),
+ rm: vreg(8),
+ size: VectorSize::Size64x2,
+ },
+ "1284E86E",
+ "sub v18.2d, v0.2d, v8.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Mul,
+ rd: writable_vreg(25),
+ rn: vreg(9),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "399D284E",
+ "mul v25.16b, v9.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Mul,
+ rd: writable_vreg(30),
+ rn: vreg(30),
+ rm: vreg(12),
+ size: VectorSize::Size16x8,
+ },
+ "DE9F6C4E",
+ "mul v30.8h, v30.8h, v12.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Mul,
+ rd: writable_vreg(18),
+ rn: vreg(18),
+ rm: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "529EB24E",
+ "mul v18.4s, v18.4s, v18.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Ushl,
+ rd: writable_vreg(18),
+ rn: vreg(18),
+ rm: vreg(18),
+ size: VectorSize::Size8x16,
+ },
+ "5246326E",
+ "ushl v18.16b, v18.16b, v18.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Ushl,
+ rd: writable_vreg(18),
+ rn: vreg(18),
+ rm: vreg(18),
+ size: VectorSize::Size16x8,
+ },
+ "5246726E",
+ "ushl v18.8h, v18.8h, v18.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Ushl,
+ rd: writable_vreg(18),
+ rn: vreg(1),
+ rm: vreg(21),
+ size: VectorSize::Size32x4,
+ },
+ "3244B56E",
+ "ushl v18.4s, v1.4s, v21.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Ushl,
+ rd: writable_vreg(5),
+ rn: vreg(7),
+ rm: vreg(19),
+ size: VectorSize::Size64x2,
+ },
+ "E544F36E",
+ "ushl v5.2d, v7.2d, v19.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sshl,
+ rd: writable_vreg(18),
+ rn: vreg(18),
+ rm: vreg(18),
+ size: VectorSize::Size8x16,
+ },
+ "5246324E",
+ "sshl v18.16b, v18.16b, v18.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sshl,
+ rd: writable_vreg(30),
+ rn: vreg(1),
+ rm: vreg(29),
+ size: VectorSize::Size16x8,
+ },
+ "3E447D4E",
+ "sshl v30.8h, v1.8h, v29.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sshl,
+ rd: writable_vreg(8),
+ rn: vreg(22),
+ rm: vreg(21),
+ size: VectorSize::Size32x4,
+ },
+ "C846B54E",
+ "sshl v8.4s, v22.4s, v21.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Sshl,
+ rd: writable_vreg(8),
+ rn: vreg(22),
+ rm: vreg(2),
+ size: VectorSize::Size64x2,
+ },
+ "C846E24E",
+ "sshl v8.2d, v22.2d, v2.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umin,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(3),
+ size: VectorSize::Size8x16,
+ },
+ "816D236E",
+ "umin v1.16b, v12.16b, v3.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umin,
+ rd: writable_vreg(30),
+ rn: vreg(20),
+ rm: vreg(10),
+ size: VectorSize::Size16x8,
+ },
+ "9E6E6A6E",
+ "umin v30.8h, v20.8h, v10.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umin,
+ rd: writable_vreg(8),
+ rn: vreg(22),
+ rm: vreg(21),
+ size: VectorSize::Size32x4,
+ },
+ "C86EB56E",
+ "umin v8.4s, v22.4s, v21.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smin,
+ rd: writable_vreg(1),
+ rn: vreg(12),
+ rm: vreg(3),
+ size: VectorSize::Size8x16,
+ },
+ "816D234E",
+ "smin v1.16b, v12.16b, v3.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smin,
+ rd: writable_vreg(30),
+ rn: vreg(20),
+ rm: vreg(10),
+ size: VectorSize::Size16x8,
+ },
+ "9E6E6A4E",
+ "smin v30.8h, v20.8h, v10.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smin,
+ rd: writable_vreg(8),
+ rn: vreg(22),
+ rm: vreg(21),
+ size: VectorSize::Size32x4,
+ },
+ "C86EB54E",
+ "smin v8.4s, v22.4s, v21.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umax,
+ rd: writable_vreg(6),
+ rn: vreg(9),
+ rm: vreg(8),
+ size: VectorSize::Size8x8,
+ },
+ "2665282E",
+ "umax v6.8b, v9.8b, v8.8b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umax,
+ rd: writable_vreg(11),
+ rn: vreg(13),
+ rm: vreg(2),
+ size: VectorSize::Size16x8,
+ },
+ "AB65626E",
+ "umax v11.8h, v13.8h, v2.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umax,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "8865AE6E",
+ "umax v8.4s, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smax,
+ rd: writable_vreg(6),
+ rn: vreg(9),
+ rm: vreg(8),
+ size: VectorSize::Size8x16,
+ },
+ "2665284E",
+ "smax v6.16b, v9.16b, v8.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smax,
+ rd: writable_vreg(11),
+ rn: vreg(13),
+ rm: vreg(2),
+ size: VectorSize::Size16x8,
+ },
+ "AB65624E",
+ "smax v11.8h, v13.8h, v2.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smax,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "8865AE4E",
+ "smax v8.4s, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Urhadd,
+ rd: writable_vreg(8),
+ rn: vreg(1),
+ rm: vreg(3),
+ size: VectorSize::Size8x16,
+ },
+ "2814236E",
+ "urhadd v8.16b, v1.16b, v3.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Urhadd,
+ rd: writable_vreg(2),
+ rn: vreg(13),
+ rm: vreg(6),
+ size: VectorSize::Size16x8,
+ },
+ "A215666E",
+ "urhadd v2.8h, v13.8h, v6.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Urhadd,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "8815AE6E",
+ "urhadd v8.4s, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fadd,
+ rd: writable_vreg(31),
+ rn: vreg(0),
+ rm: vreg(16),
+ size: VectorSize::Size32x4,
+ },
+ "1FD4304E",
+ "fadd v31.4s, v0.4s, v16.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fsub,
+ rd: writable_vreg(8),
+ rn: vreg(7),
+ rm: vreg(15),
+ size: VectorSize::Size64x2,
+ },
+ "E8D4EF4E",
+ "fsub v8.2d, v7.2d, v15.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fdiv,
+ rd: writable_vreg(1),
+ rn: vreg(3),
+ rm: vreg(4),
+ size: VectorSize::Size32x4,
+ },
+ "61FC246E",
+ "fdiv v1.4s, v3.4s, v4.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fmax,
+ rd: writable_vreg(31),
+ rn: vreg(16),
+ rm: vreg(0),
+ size: VectorSize::Size64x2,
+ },
+ "1FF6604E",
+ "fmax v31.2d, v16.2d, v0.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fmin,
+ rd: writable_vreg(5),
+ rn: vreg(19),
+ rm: vreg(26),
+ size: VectorSize::Size32x4,
+ },
+ "65F6BA4E",
+ "fmin v5.4s, v19.4s, v26.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Fmul,
+ rd: writable_vreg(2),
+ rn: vreg(0),
+ rm: vreg(5),
+ size: VectorSize::Size64x2,
+ },
+ "02DC656E",
+ "fmul v2.2d, v0.2d, v5.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Addp,
+ rd: writable_vreg(16),
+ rn: vreg(12),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "90BD214E",
+ "addp v16.16b, v12.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Addp,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "88BDAE4E",
+ "addp v8.4s, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Umlal,
+ rd: writable_vreg(9),
+ rn: vreg(20),
+ rm: vreg(17),
+ size: VectorSize::Size32x2,
+ },
+ "8982B12E",
+ "umlal v9.2d, v20.2s, v17.2s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Zip1,
+ rd: writable_vreg(16),
+ rn: vreg(12),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "9039014E",
+ "zip1 v16.16b, v12.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Zip1,
+ rd: writable_vreg(2),
+ rn: vreg(13),
+ rm: vreg(6),
+ size: VectorSize::Size16x8,
+ },
+ "A239464E",
+ "zip1 v2.8h, v13.8h, v6.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Zip1,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "88398E4E",
+ "zip1 v8.4s, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Zip1,
+ rd: writable_vreg(9),
+ rn: vreg(20),
+ rm: vreg(17),
+ size: VectorSize::Size64x2,
+ },
+ "893AD14E",
+ "zip1 v9.2d, v20.2d, v17.2d",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull,
+ rd: writable_vreg(16),
+ rn: vreg(12),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "90C1210E",
+ "smull v16.8h, v12.8b, v1.8b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull,
+ rd: writable_vreg(2),
+ rn: vreg(13),
+ rm: vreg(6),
+ size: VectorSize::Size16x8,
+ },
+ "A2C1660E",
+ "smull v2.4s, v13.4h, v6.4h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "88C1AE0E",
+ "smull v8.2d, v12.2s, v14.2s",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull2,
+ rd: writable_vreg(16),
+ rn: vreg(12),
+ rm: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "90C1214E",
+ "smull2 v16.8h, v12.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull2,
+ rd: writable_vreg(2),
+ rn: vreg(13),
+ rm: vreg(6),
+ size: VectorSize::Size16x8,
+ },
+ "A2C1664E",
+ "smull2 v2.4s, v13.8h, v6.8h",
+ ));
+
+ insns.push((
+ Inst::VecRRR {
+ alu_op: VecALUOp::Smull2,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ rm: vreg(14),
+ size: VectorSize::Size32x4,
+ },
+ "88C1AE4E",
+ "smull2 v8.2d, v12.4s, v14.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Not,
+ rd: writable_vreg(20),
+ rn: vreg(17),
+ size: VectorSize::Size8x8,
+ },
+ "345A202E",
+ "mvn v20.8b, v17.8b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Not,
+ rd: writable_vreg(2),
+ rn: vreg(1),
+ size: VectorSize::Size32x4,
+ },
+ "2258206E",
+ "mvn v2.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd: writable_vreg(3),
+ rn: vreg(7),
+ size: VectorSize::Size8x8,
+ },
+ "E3B8202E",
+ "neg v3.8b, v7.8b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd: writable_vreg(8),
+ rn: vreg(12),
+ size: VectorSize::Size8x16,
+ },
+ "88B9206E",
+ "neg v8.16b, v12.16b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd: writable_vreg(0),
+ rn: vreg(31),
+ size: VectorSize::Size16x8,
+ },
+ "E0BB606E",
+ "neg v0.8h, v31.8h",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd: writable_vreg(2),
+ rn: vreg(3),
+ size: VectorSize::Size32x4,
+ },
+ "62B8A06E",
+ "neg v2.4s, v3.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd: writable_vreg(10),
+ rn: vreg(8),
+ size: VectorSize::Size64x2,
+ },
+ "0AB9E06E",
+ "neg v10.2d, v8.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd: writable_vreg(3),
+ rn: vreg(1),
+ size: VectorSize::Size8x8,
+ },
+ "23B8200E",
+ "abs v3.8b, v1.8b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd: writable_vreg(1),
+ rn: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "21B8204E",
+ "abs v1.16b, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd: writable_vreg(29),
+ rn: vreg(28),
+ size: VectorSize::Size16x8,
+ },
+ "9DBB604E",
+ "abs v29.8h, v28.8h",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd: writable_vreg(7),
+ rn: vreg(8),
+ size: VectorSize::Size32x4,
+ },
+ "07B9A04E",
+ "abs v7.4s, v8.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd: writable_vreg(1),
+ rn: vreg(10),
+ size: VectorSize::Size64x2,
+ },
+ "41B9E04E",
+ "abs v1.2d, v10.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Fabs,
+ rd: writable_vreg(15),
+ rn: vreg(16),
+ size: VectorSize::Size32x4,
+ },
+ "0FFAA04E",
+ "fabs v15.4s, v16.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Fneg,
+ rd: writable_vreg(31),
+ rn: vreg(0),
+ size: VectorSize::Size32x4,
+ },
+ "1FF8A06E",
+ "fneg v31.4s, v0.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Fsqrt,
+ rd: writable_vreg(7),
+ rn: vreg(18),
+ size: VectorSize::Size64x2,
+ },
+ "47FAE16E",
+ "fsqrt v7.2d, v18.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Rev64,
+ rd: writable_vreg(1),
+ rn: vreg(10),
+ size: VectorSize::Size32x4,
+ },
+ "4109A04E",
+ "rev64 v1.4s, v10.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Shll,
+ rd: writable_vreg(12),
+ rn: vreg(5),
+ size: VectorSize::Size8x8,
+ },
+ "AC38212E",
+ "shll v12.8h, v5.8b, #8",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Shll,
+ rd: writable_vreg(9),
+ rn: vreg(1),
+ size: VectorSize::Size16x4,
+ },
+ "2938612E",
+ "shll v9.4s, v1.4h, #16",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Shll,
+ rd: writable_vreg(1),
+ rn: vreg(10),
+ size: VectorSize::Size32x2,
+ },
+ "4139A12E",
+ "shll v1.2d, v10.2s, #32",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Fcvtzs,
+ rd: writable_vreg(4),
+ rn: vreg(22),
+ size: VectorSize::Size32x4,
+ },
+ "C4BAA14E",
+ "fcvtzs v4.4s, v22.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Fcvtzu,
+ rd: writable_vreg(29),
+ rn: vreg(15),
+ size: VectorSize::Size64x2,
+ },
+ "FDB9E16E",
+ "fcvtzu v29.2d, v15.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Scvtf,
+ rd: writable_vreg(20),
+ rn: vreg(8),
+ size: VectorSize::Size32x4,
+ },
+ "14D9214E",
+ "scvtf v20.4s, v8.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Ucvtf,
+ rd: writable_vreg(10),
+ rn: vreg(19),
+ size: VectorSize::Size64x2,
+ },
+ "6ADA616E",
+ "ucvtf v10.2d, v19.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintn,
+ rd: writable_vreg(11),
+ rn: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "4B8A214E",
+ "frintn v11.4s, v18.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintn,
+ rd: writable_vreg(12),
+ rn: vreg(17),
+ size: VectorSize::Size64x2,
+ },
+ "2C8A614E",
+ "frintn v12.2d, v17.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintz,
+ rd: writable_vreg(11),
+ rn: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "4B9AA14E",
+ "frintz v11.4s, v18.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintz,
+ rd: writable_vreg(12),
+ rn: vreg(17),
+ size: VectorSize::Size64x2,
+ },
+ "2C9AE14E",
+ "frintz v12.2d, v17.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintm,
+ rd: writable_vreg(11),
+ rn: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "4B9A214E",
+ "frintm v11.4s, v18.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintm,
+ rd: writable_vreg(12),
+ rn: vreg(17),
+ size: VectorSize::Size64x2,
+ },
+ "2C9A614E",
+ "frintm v12.2d, v17.2d",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintp,
+ rd: writable_vreg(11),
+ rn: vreg(18),
+ size: VectorSize::Size32x4,
+ },
+ "4B8AA14E",
+ "frintp v11.4s, v18.4s",
+ ));
+
+ insns.push((
+ Inst::VecMisc {
+ op: VecMisc2::Frintp,
+ rd: writable_vreg(12),
+ rn: vreg(17),
+ size: VectorSize::Size64x2,
+ },
+ "2C8AE14E",
+ "frintp v12.2d, v17.2d",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Uminv,
+ rd: writable_vreg(2),
+ rn: vreg(1),
+ size: VectorSize::Size8x16,
+ },
+ "22A8316E",
+ "uminv b2, v1.16b",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Uminv,
+ rd: writable_vreg(3),
+ rn: vreg(11),
+ size: VectorSize::Size16x8,
+ },
+ "63A9716E",
+ "uminv h3, v11.8h",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Uminv,
+ rd: writable_vreg(18),
+ rn: vreg(4),
+ size: VectorSize::Size32x4,
+ },
+ "92A8B16E",
+ "uminv s18, v4.4s",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: writable_vreg(2),
+ rn: vreg(29),
+ size: VectorSize::Size8x16,
+ },
+ "A2BB314E",
+ "addv b2, v29.16b",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: writable_vreg(3),
+ rn: vreg(21),
+ size: VectorSize::Size16x8,
+ },
+ "A3BA714E",
+ "addv h3, v21.8h",
+ ));
+
+ insns.push((
+ Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: writable_vreg(18),
+ rn: vreg(5),
+ size: VectorSize::Size32x4,
+ },
+ "B2B8B14E",
+ "addv s18, v5.4s",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Shl,
+ rd: writable_vreg(27),
+ rn: vreg(5),
+ imm: 7,
+ size: VectorSize::Size8x16,
+ },
+ "BB540F4F",
+ "shl v27.16b, v5.16b, #7",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Shl,
+ rd: writable_vreg(1),
+ rn: vreg(30),
+ imm: 0,
+ size: VectorSize::Size8x16,
+ },
+ "C157084F",
+ "shl v1.16b, v30.16b, #0",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Sshr,
+ rd: writable_vreg(26),
+ rn: vreg(6),
+ imm: 16,
+ size: VectorSize::Size16x8,
+ },
+ "DA04104F",
+ "sshr v26.8h, v6.8h, #16",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Sshr,
+ rd: writable_vreg(3),
+ rn: vreg(19),
+ imm: 1,
+ size: VectorSize::Size16x8,
+ },
+ "63061F4F",
+ "sshr v3.8h, v19.8h, #1",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Ushr,
+ rd: writable_vreg(25),
+ rn: vreg(6),
+ imm: 32,
+ size: VectorSize::Size32x4,
+ },
+ "D904206F",
+ "ushr v25.4s, v6.4s, #32",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Ushr,
+ rd: writable_vreg(5),
+ rn: vreg(21),
+ imm: 1,
+ size: VectorSize::Size32x4,
+ },
+ "A5063F6F",
+ "ushr v5.4s, v21.4s, #1",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Shl,
+ rd: writable_vreg(22),
+ rn: vreg(13),
+ imm: 63,
+ size: VectorSize::Size64x2,
+ },
+ "B6557F4F",
+ "shl v22.2d, v13.2d, #63",
+ ));
+
+ insns.push((
+ Inst::VecShiftImm {
+ op: VecShiftImmOp::Shl,
+ rd: writable_vreg(23),
+ rn: vreg(9),
+ imm: 0,
+ size: VectorSize::Size64x2,
+ },
+ "3755404F",
+ "shl v23.2d, v9.2d, #0",
+ ));
+
+ insns.push((
+ Inst::VecExtract {
+ rd: writable_vreg(1),
+ rn: vreg(30),
+ rm: vreg(17),
+ imm4: 0,
+ },
+ "C103116E",
+ "ext v1.16b, v30.16b, v17.16b, #0",
+ ));
+
+ insns.push((
+ Inst::VecExtract {
+ rd: writable_vreg(1),
+ rn: vreg(30),
+ rm: vreg(17),
+ imm4: 8,
+ },
+ "C143116E",
+ "ext v1.16b, v30.16b, v17.16b, #8",
+ ));
+
+ insns.push((
+ Inst::VecExtract {
+ rd: writable_vreg(1),
+ rn: vreg(30),
+ rm: vreg(17),
+ imm4: 15,
+ },
+ "C17B116E",
+ "ext v1.16b, v30.16b, v17.16b, #15",
+ ));
+
+ insns.push((
+ Inst::VecTbl {
+ rd: writable_vreg(0),
+ rn: vreg(31),
+ rm: vreg(16),
+ is_extension: false,
+ },
+ "E003104E",
+ "tbl v0.16b, { v31.16b }, v16.16b",
+ ));
+
+ insns.push((
+ Inst::VecTbl {
+ rd: writable_vreg(4),
+ rn: vreg(12),
+ rm: vreg(23),
+ is_extension: true,
+ },
+ "8411174E",
+ "tbx v4.16b, { v12.16b }, v23.16b",
+ ));
+
+ insns.push((
+ Inst::VecTbl2 {
+ rd: writable_vreg(16),
+ rn: vreg(31),
+ rn2: vreg(0),
+ rm: vreg(26),
+ is_extension: false,
+ },
+ "F0231A4E",
+ "tbl v16.16b, { v31.16b, v0.16b }, v26.16b",
+ ));
+
+ insns.push((
+ Inst::VecTbl2 {
+ rd: writable_vreg(3),
+ rn: vreg(11),
+ rn2: vreg(12),
+ rm: vreg(19),
+ is_extension: true,
+ },
+ "6331134E",
+ "tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
+ ));
+
+ insns.push((
+ Inst::VecLoadReplicate {
+ rd: writable_vreg(31),
+ rn: xreg(0),
+ size: VectorSize::Size64x2,
+ },
+ "1FCC404D",
+ "ld1r { v31.2d }, [x0]",
+ ));
+
+ insns.push((
+ Inst::VecLoadReplicate {
+ rd: writable_vreg(0),
+ rn: xreg(25),
+ size: VectorSize::Size8x8,
+ },
+ "20C3400D",
+ "ld1r { v0.8b }, [x25]",
+ ));
+
+ insns.push((
+ Inst::VecCSel {
+ rd: writable_vreg(5),
+ rn: vreg(10),
+ rm: vreg(19),
+ cond: Cond::Gt,
+ },
+ "6C000054651EB34E02000014451DAA4E",
+ "vcsel v5.16b, v10.16b, v19.16b, gt (if-then-else diamond)",
+ ));
+
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: false,
+ from_bits: 8,
+ to_bits: 32,
+ },
+ "411C0053",
+ "uxtb w1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: true,
+ from_bits: 8,
+ to_bits: 32,
+ },
+ "411C0013",
+ "sxtb w1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: false,
+ from_bits: 16,
+ to_bits: 32,
+ },
+ "413C0053",
+ "uxth w1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: true,
+ from_bits: 16,
+ to_bits: 32,
+ },
+ "413C0013",
+ "sxth w1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: false,
+ from_bits: 8,
+ to_bits: 64,
+ },
+ "411C0053",
+ "uxtb x1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: true,
+ from_bits: 8,
+ to_bits: 64,
+ },
+ "411C4093",
+ "sxtb x1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: false,
+ from_bits: 16,
+ to_bits: 64,
+ },
+ "413C0053",
+ "uxth x1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: true,
+ from_bits: 16,
+ to_bits: 64,
+ },
+ "413C4093",
+ "sxth x1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: false,
+ from_bits: 32,
+ to_bits: 64,
+ },
+ "E103022A",
+ "mov w1, w2",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_xreg(1),
+ rn: xreg(2),
+ signed: true,
+ from_bits: 32,
+ to_bits: 64,
+ },
+ "417C4093",
+ "sxtw x1, w2",
+ ));
+
+ insns.push((
+ Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(64),
+ },
+ "10000014",
+ "b 64",
+ ));
+
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::NotZero(xreg(8)),
+ },
+ "480000B40000A0D4",
+ "cbz x8, 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Zero(xreg(8)),
+ },
+ "480000B50000A0D4",
+ "cbnz x8, 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Ne),
+ },
+ "400000540000A0D4",
+ "b.eq 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Eq),
+ },
+ "410000540000A0D4",
+ "b.ne 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Lo),
+ },
+ "420000540000A0D4",
+ "b.hs 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Hs),
+ },
+ "430000540000A0D4",
+ "b.lo 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Pl),
+ },
+ "440000540000A0D4",
+ "b.mi 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Mi),
+ },
+ "450000540000A0D4",
+ "b.pl 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Vc),
+ },
+ "460000540000A0D4",
+ "b.vs 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Vs),
+ },
+ "470000540000A0D4",
+ "b.vc 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Ls),
+ },
+ "480000540000A0D4",
+ "b.hi 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Hi),
+ },
+ "490000540000A0D4",
+ "b.ls 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Lt),
+ },
+ "4A0000540000A0D4",
+ "b.ge 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Ge),
+ },
+ "4B0000540000A0D4",
+ "b.lt 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Le),
+ },
+ "4C0000540000A0D4",
+ "b.gt 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Gt),
+ },
+ "4D0000540000A0D4",
+ "b.le 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Nv),
+ },
+ "4E0000540000A0D4",
+ "b.al 8 ; udf",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ trap_code: TrapCode::Interrupt,
+ kind: CondBrKind::Cond(Cond::Al),
+ },
+ "4F0000540000A0D4",
+ "b.nv 8 ; udf",
+ ));
+
+ insns.push((
+ Inst::CondBr {
+ taken: BranchTarget::ResolvedOffset(64),
+ not_taken: BranchTarget::ResolvedOffset(128),
+ kind: CondBrKind::Cond(Cond::Le),
+ },
+ "0D02005420000014",
+ "b.le 64 ; b 128",
+ ));
+
+ insns.push((
+ Inst::Call {
+ info: Box::new(CallInfo {
+ dest: ExternalName::testcase("test0"),
+ uses: Vec::new(),
+ defs: Vec::new(),
+ opcode: Opcode::Call,
+ caller_callconv: CallConv::SystemV,
+ callee_callconv: CallConv::SystemV,
+ }),
+ },
+ "00000094",
+ "bl 0",
+ ));
+
+ insns.push((
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rn: xreg(10),
+ uses: Vec::new(),
+ defs: Vec::new(),
+ opcode: Opcode::CallIndirect,
+ caller_callconv: CallConv::SystemV,
+ callee_callconv: CallConv::SystemV,
+ }),
+ },
+ "40013FD6",
+ "blr x10",
+ ));
+
+ insns.push((
+ Inst::IndirectBr {
+ rn: xreg(3),
+ targets: vec![],
+ },
+ "60001FD6",
+ "br x3",
+ ));
+
+ insns.push((Inst::Brk, "000020D4", "brk #0"));
+
+ insns.push((
+ Inst::Adr {
+ rd: writable_xreg(15),
+ off: (1 << 20) - 4,
+ },
+ "EFFF7F10",
+ "adr x15, pc+1048572",
+ ));
+
+ insns.push((
+ Inst::FpuMove64 {
+ rd: writable_vreg(8),
+ rn: vreg(4),
+ },
+ "881CA40E",
+ "mov v8.8b, v4.8b",
+ ));
+
+ insns.push((
+ Inst::FpuMove128 {
+ rd: writable_vreg(17),
+ rn: vreg(26),
+ },
+ "511FBA4E",
+ "mov v17.16b, v26.16b",
+ ));
+
+ insns.push((
+ Inst::FpuMoveFromVec {
+ rd: writable_vreg(1),
+ rn: vreg(30),
+ idx: 2,
+ size: VectorSize::Size32x4,
+ },
+ "C107145E",
+ "mov s1, v30.s[2]",
+ ));
+
+ insns.push((
+ Inst::FpuMoveFromVec {
+ rd: writable_vreg(23),
+ rn: vreg(11),
+ idx: 0,
+ size: VectorSize::Size64x2,
+ },
+ "7705085E",
+ "mov d23, v11.d[0]",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Abs32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CFC3201E",
+ "fabs s15, s30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Abs64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CFC3601E",
+ "fabs d15, d30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Neg32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CF43211E",
+ "fneg s15, s30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Neg64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CF43611E",
+ "fneg d15, d30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Sqrt32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CFC3211E",
+ "fsqrt s15, s30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Sqrt64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CFC3611E",
+ "fsqrt d15, d30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Cvt32To64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CFC3221E",
+ "fcvt d15, s30",
+ ));
+
+ insns.push((
+ Inst::FpuRR {
+ fpu_op: FPUOp1::Cvt64To32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ },
+ "CF43621E",
+ "fcvt s15, d30",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Add32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF2B3F1E",
+ "fadd s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Add64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF2B7F1E",
+ "fadd d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Sub32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF3B3F1E",
+ "fsub s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Sub64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF3B7F1E",
+ "fsub d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Mul32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF0B3F1E",
+ "fmul s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Mul64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF0B7F1E",
+ "fmul d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Div32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF1B3F1E",
+ "fdiv s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Div64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF1B7F1E",
+ "fdiv d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Max32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF4B3F1E",
+ "fmax s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Max64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF4B7F1E",
+ "fmax d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Min32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF5B3F1E",
+ "fmin s15, s30, s31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Min64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ },
+ "CF5B7F1E",
+ "fmin d15, d30, d31",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Uqadd64,
+ rd: writable_vreg(21),
+ rn: vreg(22),
+ rm: vreg(23),
+ },
+ "D50EF77E",
+ "uqadd d21, d22, d23",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Sqadd64,
+ rd: writable_vreg(21),
+ rn: vreg(22),
+ rm: vreg(23),
+ },
+ "D50EF75E",
+ "sqadd d21, d22, d23",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Uqsub64,
+ rd: writable_vreg(21),
+ rn: vreg(22),
+ rm: vreg(23),
+ },
+ "D52EF77E",
+ "uqsub d21, d22, d23",
+ ));
+
+ insns.push((
+ Inst::FpuRRR {
+ fpu_op: FPUOp2::Sqsub64,
+ rd: writable_vreg(21),
+ rn: vreg(22),
+ rm: vreg(23),
+ },
+ "D52EF75E",
+ "sqsub d21, d22, d23",
+ ));
+
+ insns.push((
+ Inst::FpuRRRR {
+ fpu_op: FPUOp3::MAdd32,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ ra: vreg(1),
+ },
+ "CF071F1F",
+ "fmadd s15, s30, s31, s1",
+ ));
+
+ insns.push((
+ Inst::FpuRRRR {
+ fpu_op: FPUOp3::MAdd64,
+ rd: writable_vreg(15),
+ rn: vreg(30),
+ rm: vreg(31),
+ ra: vreg(1),
+ },
+ "CF075F1F",
+ "fmadd d15, d30, d31, d1",
+ ));
+
+ insns.push((
+ Inst::FpuRRI {
+ fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
+ rd: writable_vreg(2),
+ rn: vreg(5),
+ },
+ "A204202F",
+ "ushr v2.2s, v5.2s, #32",
+ ));
+
+ insns.push((
+ Inst::FpuRRI {
+ fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
+ rd: writable_vreg(2),
+ rn: vreg(5),
+ },
+ "A204417F",
+ "ushr d2, d5, #63",
+ ));
+
+ insns.push((
+ Inst::FpuRRI {
+ fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
+ rd: writable_vreg(4),
+ rn: vreg(10),
+ },
+ "44553F2F",
+ "sli v4.2s, v10.2s, #31",
+ ));
+
+ insns.push((
+ Inst::FpuRRI {
+ fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
+ rd: writable_vreg(4),
+ rn: vreg(10),
+ },
+ "44557F7F",
+ "sli d4, d10, #63",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F32ToU32,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100391E",
+ "fcvtzu w1, s4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F32ToU64,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100399E",
+ "fcvtzu x1, s4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F32ToI32,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100381E",
+ "fcvtzs w1, s4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F32ToI64,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100389E",
+ "fcvtzs x1, s4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F64ToU32,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100791E",
+ "fcvtzu w1, d4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F64ToU64,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100799E",
+ "fcvtzu x1, d4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F64ToI32,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100781E",
+ "fcvtzs w1, d4",
+ ));
+
+ insns.push((
+ Inst::FpuToInt {
+ op: FpuToIntOp::F64ToI64,
+ rd: writable_xreg(1),
+ rn: vreg(4),
+ },
+ "8100789E",
+ "fcvtzs x1, d4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::U32ToF32,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100231E",
+ "ucvtf s1, w4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::I32ToF32,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100221E",
+ "scvtf s1, w4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::U32ToF64,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100631E",
+ "ucvtf d1, w4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::I32ToF64,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100621E",
+ "scvtf d1, w4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::U64ToF32,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100239E",
+ "ucvtf s1, x4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::I64ToF32,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100229E",
+ "scvtf s1, x4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::U64ToF64,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100639E",
+ "ucvtf d1, x4",
+ ));
+
+ insns.push((
+ Inst::IntToFpu {
+ op: IntToFpuOp::I64ToF64,
+ rd: writable_vreg(1),
+ rn: xreg(4),
+ },
+ "8100629E",
+ "scvtf d1, x4",
+ ));
+
+ insns.push((
+ Inst::FpuCmp32 {
+ rn: vreg(23),
+ rm: vreg(24),
+ },
+ "E022381E",
+ "fcmp s23, s24",
+ ));
+
+ insns.push((
+ Inst::FpuCmp64 {
+ rn: vreg(23),
+ rm: vreg(24),
+ },
+ "E022781E",
+ "fcmp d23, d24",
+ ));
+
+ insns.push((
+ Inst::FpuLoad32 {
+ rd: writable_vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), F32),
+ flags: MemFlags::trusted(),
+ },
+ "107969BC",
+ "ldr s16, [x8, x9, LSL #2]",
+ ));
+
+ insns.push((
+ Inst::FpuLoad64 {
+ rd: writable_vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), F64),
+ flags: MemFlags::trusted(),
+ },
+ "107969FC",
+ "ldr d16, [x8, x9, LSL #3]",
+ ));
+
+ insns.push((
+ Inst::FpuLoad128 {
+ rd: writable_vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), I128),
+ flags: MemFlags::trusted(),
+ },
+ "1079E93C",
+ "ldr q16, [x8, x9, LSL #4]",
+ ));
+
+ insns.push((
+ Inst::FpuLoad32 {
+ rd: writable_vreg(16),
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ },
+ "5000001C",
+ "ldr s16, pc+8",
+ ));
+
+ insns.push((
+ Inst::FpuLoad64 {
+ rd: writable_vreg(16),
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ },
+ "5000005C",
+ "ldr d16, pc+8",
+ ));
+
+ insns.push((
+ Inst::FpuLoad128 {
+ rd: writable_vreg(16),
+ mem: AMode::Label(MemLabel::PCRel(8)),
+ flags: MemFlags::trusted(),
+ },
+ "5000009C",
+ "ldr q16, pc+8",
+ ));
+
+ insns.push((
+ Inst::FpuStore32 {
+ rd: vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), F32),
+ flags: MemFlags::trusted(),
+ },
+ "107929BC",
+ "str s16, [x8, x9, LSL #2]",
+ ));
+
+ insns.push((
+ Inst::FpuStore64 {
+ rd: vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), F64),
+ flags: MemFlags::trusted(),
+ },
+ "107929FC",
+ "str d16, [x8, x9, LSL #3]",
+ ));
+
+ insns.push((
+ Inst::FpuStore128 {
+ rd: vreg(16),
+ mem: AMode::RegScaled(xreg(8), xreg(9), I128),
+ flags: MemFlags::trusted(),
+ },
+ "1079A93C",
+ "str q16, [x8, x9, LSL #4]",
+ ));
+
+ insns.push((
+ Inst::LoadFpuConst64 {
+ rd: writable_vreg(16),
+ const_data: 1.0_f64.to_bits(),
+ },
+ "5000005C03000014000000000000F03F",
+ "ldr d16, pc+8 ; b 12 ; data.f64 1",
+ ));
+
+ insns.push((
+ Inst::LoadFpuConst128 {
+ rd: writable_vreg(5),
+ const_data: 0x0f0e0d0c0b0a09080706050403020100,
+ },
+ "4500009C05000014000102030405060708090A0B0C0D0E0F",
+ "ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100",
+ ));
+
+ insns.push((
+ Inst::FpuCSel32 {
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(3),
+ cond: Cond::Hi,
+ },
+ "418C231E",
+ "fcsel s1, s2, s3, hi",
+ ));
+
+ insns.push((
+ Inst::FpuCSel64 {
+ rd: writable_vreg(1),
+ rn: vreg(2),
+ rm: vreg(3),
+ cond: Cond::Eq,
+ },
+ "410C631E",
+ "fcsel d1, d2, d3, eq",
+ ));
+
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Minus32,
+ },
+ "1743251E",
+ "frintm s23, s24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Minus64,
+ },
+ "1743651E",
+ "frintm d23, d24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Plus32,
+ },
+ "17C3241E",
+ "frintp s23, s24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Plus64,
+ },
+ "17C3641E",
+ "frintp d23, d24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Zero32,
+ },
+ "17C3251E",
+ "frintz s23, s24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Zero64,
+ },
+ "17C3651E",
+ "frintz d23, d24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Nearest32,
+ },
+ "1743241E",
+ "frintn s23, s24",
+ ));
+ insns.push((
+ Inst::FpuRound {
+ rd: writable_vreg(23),
+ rn: vreg(24),
+ op: FpuRoundMode::Nearest64,
+ },
+ "1743641E",
+ "frintn d23, d24",
+ ));
+
+ insns.push((
+ Inst::AtomicRMW {
+ ty: I16,
+ op: inst_common::AtomicRmwOp::Xor,
+ },
+ "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5",
+ "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
+ ));
+
+ insns.push((
+ Inst::AtomicRMW {
+ ty: I32,
+ op: inst_common::AtomicRmwOp::Xchg,
+ },
+ "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5",
+ "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
+ ));
+
+ insns.push((
+ Inst::AtomicCAS {
+ ty: I8,
+ },
+ "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5",
+ "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
+ ));
+
+ insns.push((
+ Inst::AtomicCAS {
+ ty: I64,
+ },
+ "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5",
+ "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
+ ));
+
+ insns.push((
+ Inst::AtomicLoad {
+ ty: I8,
+ r_data: writable_xreg(7),
+ r_addr: xreg(28),
+ },
+ "BF3B03D587034039",
+ "atomically { x7 = zero_extend_8_bits_at[x28] }",
+ ));
+
+ insns.push((
+ Inst::AtomicLoad {
+ ty: I64,
+ r_data: writable_xreg(28),
+ r_addr: xreg(7),
+ },
+ "BF3B03D5FC0040F9",
+ "atomically { x28 = zero_extend_64_bits_at[x7] }",
+ ));
+
+ insns.push((
+ Inst::AtomicStore {
+ ty: I16,
+ r_data: xreg(17),
+ r_addr: xreg(8),
+ },
+ "11010079BF3B03D5",
+ "atomically { 16_bits_at[x8] = x17 }",
+ ));
+
+ insns.push((
+ Inst::AtomicStore {
+ ty: I32,
+ r_data: xreg(18),
+ r_addr: xreg(7),
+ },
+ "F20000B9BF3B03D5",
+ "atomically { 32_bits_at[x7] = x18 }",
+ ));
+
+ insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish"));
+
+ let flags = settings::Flags::new(settings::builder());
+ let rru = create_reg_universe(&flags);
+ let emit_info = EmitInfo::new(flags);
+ for (insn, expected_encoding, expected_printing) in insns {
+ println!(
+ "AArch64: {:?}, {}, {}",
+ insn, expected_encoding, expected_printing
+ );
+
+ // Check the printed text is as expected.
+ let actual_printing = insn.show_rru(Some(&rru));
+ assert_eq!(expected_printing, actual_printing);
+
+ let mut sink = test_utils::TestCodeSink::new();
+ let mut buffer = MachBuffer::new();
+ insn.emit(&mut buffer, &emit_info, &mut Default::default());
+ let buffer = buffer.finish();
+ buffer.emit(&mut sink);
+ let actual_encoding = &sink.stringify();
+ assert_eq!(expected_encoding, actual_encoding);
+ }
+}
+
+#[test]
+fn test_cond_invert() {
+ for cond in vec![
+ Cond::Eq,
+ Cond::Ne,
+ Cond::Hs,
+ Cond::Lo,
+ Cond::Mi,
+ Cond::Pl,
+ Cond::Vs,
+ Cond::Vc,
+ Cond::Hi,
+ Cond::Ls,
+ Cond::Ge,
+ Cond::Lt,
+ Cond::Gt,
+ Cond::Le,
+ Cond::Al,
+ Cond::Nv,
+ ]
+ .into_iter()
+ {
+ assert_eq!(cond.invert().invert(), cond);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs
new file mode 100644
index 0000000000..b6da0402bc
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs
@@ -0,0 +1,1025 @@
+//! AArch64 ISA definitions: immediate constants.
+
+// Some variants are never constructed, but we still want them as options in the future.
+#[allow(dead_code)]
+use crate::ir::types::*;
+use crate::ir::Type;
+use crate::isa::aarch64::inst::{OperandSize, ScalarSize};
+
+use regalloc::{PrettyPrint, RealRegUniverse};
+
+use core::convert::TryFrom;
+use std::string::String;
+
+/// An immediate that represents the NZCV flags.
+#[derive(Clone, Copy, Debug)]
+pub struct NZCV {
+ /// The negative condition flag.
+ n: bool,
+ /// The zero condition flag.
+ z: bool,
+ /// The carry condition flag.
+ c: bool,
+ /// The overflow condition flag.
+ v: bool,
+}
+
+impl NZCV {
+ pub fn new(n: bool, z: bool, c: bool, v: bool) -> NZCV {
+ NZCV { n, z, c, v }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ (u32::from(self.n) << 3)
+ | (u32::from(self.z) << 2)
+ | (u32::from(self.c) << 1)
+ | u32::from(self.v)
+ }
+}
+
+/// An unsigned 5-bit immediate.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm5 {
+ /// The value.
+ value: u8,
+}
+
+impl UImm5 {
+ pub fn maybe_from_u8(value: u8) -> Option<UImm5> {
+ if value < 32 {
+ Some(UImm5 { value })
+ } else {
+ None
+ }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ u32::from(self.value)
+ }
+}
+
+/// A signed, scaled 7-bit offset.
+#[derive(Clone, Copy, Debug)]
+pub struct SImm7Scaled {
+ /// The value.
+ pub value: i16,
+ /// The value is a multiple of the size of this type.
+ pub scale_ty: Type,
+}
+
+impl SImm7Scaled {
+ /// Create a SImm7Scaled from a raw offset and the known scale type, if
+ /// possible.
+ pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<SImm7Scaled> {
+ assert!(scale_ty == I64 || scale_ty == I32);
+ let scale = scale_ty.bytes();
+ assert!(scale.is_power_of_two());
+ let scale = i64::from(scale);
+ let upper_limit = 63 * scale;
+ let lower_limit = -(64 * scale);
+ if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 {
+ Some(SImm7Scaled {
+ value: i16::try_from(value).unwrap(),
+ scale_ty,
+ })
+ } else {
+ None
+ }
+ }
+
+ /// Create a zero immediate of this format.
+ pub fn zero(scale_ty: Type) -> SImm7Scaled {
+ SImm7Scaled { value: 0, scale_ty }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ let ty_bytes: i16 = self.scale_ty.bytes() as i16;
+ let scaled: i16 = self.value / ty_bytes;
+ assert!(scaled <= 63 && scaled >= -64);
+ let scaled: i8 = scaled as i8;
+ let encoded: u32 = scaled as u32;
+ encoded & 0x7f
+ }
+}
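+
+// Illustrative sketch (a hypothetical test, not taken from upstream): with an
+// I64 scale the offset must be a multiple of 8 in the range -512..=504, and
+// `bits()` returns the offset divided by the scale.
+#[test]
+fn simm7_scaled_example() {
+ assert_eq!(SImm7Scaled::maybe_from_i64(24, I64).unwrap().bits(), 3);
+ // 512 exceeds the maximum reachable offset of 63 * 8 = 504.
+ assert!(SImm7Scaled::maybe_from_i64(512, I64).is_none());
+}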
+
+#[derive(Clone, Copy, Debug)]
+pub struct FPULeftShiftImm {
+ pub amount: u8,
+ pub lane_size_in_bits: u8,
+}
+
+impl FPULeftShiftImm {
+ pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+ debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+ if amount < lane_size_in_bits {
+ Some(Self {
+ amount,
+ lane_size_in_bits,
+ })
+ } else {
+ None
+ }
+ }
+
+ pub fn enc(&self) -> u32 {
+ debug_assert!(self.lane_size_in_bits.is_power_of_two());
+ debug_assert!(self.lane_size_in_bits > self.amount);
+ // The encoding of the immediate follows the table below,
+ // where xs encode the shift amount.
+ //
+ // | lane_size_in_bits | encoding |
+ // +------------------------------+
+ // | 8 | 0001xxx |
+ // | 16 | 001xxxx |
+ // | 32 | 01xxxxx |
+ // | 64 | 1xxxxxx |
+ //
+ // The highest one bit is represented by `lane_size_in_bits`. Since
+ // `lane_size_in_bits` is a power of 2 and `amount` is less
+ // than `lane_size_in_bits`, they can be ORed
+ // together to produce the encoded value.
+ u32::from(self.lane_size_in_bits | self.amount)
+ }
+}
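+
+// Illustrative sketch (a hypothetical test, not taken from upstream): the
+// OR-based encoding above maps a 32-bit lane shifted left by 5 to
+// 0b0100000 | 0b0000101 = 0b0100101.
+#[test]
+fn fpu_left_shift_imm_enc_example() {
+ assert_eq!(FPULeftShiftImm::maybe_from_u8(5, 32).unwrap().enc(), 0b0100101);
+ // A shift amount equal to the lane size is not representable.
+ assert!(FPULeftShiftImm::maybe_from_u8(32, 32).is_none());
+}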
+
+#[derive(Clone, Copy, Debug)]
+pub struct FPURightShiftImm {
+ pub amount: u8,
+ pub lane_size_in_bits: u8,
+}
+
+impl FPURightShiftImm {
+ pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+ debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+ if amount > 0 && amount <= lane_size_in_bits {
+ Some(Self {
+ amount,
+ lane_size_in_bits,
+ })
+ } else {
+ None
+ }
+ }
+
+ pub fn enc(&self) -> u32 {
+ debug_assert_ne!(0, self.amount);
+ // The encoding of the immediate follows the table below,
+ // where xs encode the negated shift amount.
+ //
+ // | lane_size_in_bits | encoding |
+ // +------------------------------+
+ // | 8 | 0001xxx |
+ // | 16 | 001xxxx |
+ // | 32 | 01xxxxx |
+ // | 64 | 1xxxxxx |
+ //
+ // The shift amount is negated such that a shift amount
+ // of 1 (in 64-bit) is encoded as 0b111111 and a shift
+ // amount of 64 is encoded as 0b000000,
+ // in the bottom 6 bits.
+ u32::from((self.lane_size_in_bits * 2) - self.amount)
+ }
+}
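+
+// Illustrative sketch (a hypothetical test, not taken from upstream): the
+// right-shift amount is encoded as `2 * lane_size - amount`, so for 64-bit
+// lanes a shift of 1 becomes 0b1111111 and a shift of 64 becomes 0b1000000.
+#[test]
+fn fpu_right_shift_imm_enc_example() {
+ assert_eq!(FPURightShiftImm::maybe_from_u8(1, 64).unwrap().enc(), 0b1111111);
+ assert_eq!(FPURightShiftImm::maybe_from_u8(64, 64).unwrap().enc(), 0b1000000);
+ // A zero shift is rejected.
+ assert!(FPURightShiftImm::maybe_from_u8(0, 64).is_none());
+}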
+
+/// A signed 9-bit offset.
+#[derive(Clone, Copy, Debug)]
+pub struct SImm9 {
+ /// The value.
+ pub value: i16,
+}
+
+impl SImm9 {
+ /// Create a signed 9-bit offset from a full-range value, if possible.
+ pub fn maybe_from_i64(value: i64) -> Option<SImm9> {
+ if value >= -256 && value <= 255 {
+ Some(SImm9 {
+ value: value as i16,
+ })
+ } else {
+ None
+ }
+ }
+
+ /// Create a zero immediate of this format.
+ pub fn zero() -> SImm9 {
+ SImm9 { value: 0 }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ (self.value as u32) & 0x1ff
+ }
+
+ /// Signed value of immediate.
+ pub fn value(&self) -> i32 {
+ self.value as i32
+ }
+}
+
+/// An unsigned, scaled 12-bit offset.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm12Scaled {
+ /// The value.
+ pub value: u16,
+ /// The value is a multiple of the size of this type.
+ pub scale_ty: Type,
+}
+
+impl UImm12Scaled {
+ /// Create a UImm12Scaled from a raw offset and the known scale type, if
+ /// possible.
+ pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<UImm12Scaled> {
+ // Ensure the type is at least one byte.
+ let scale_ty = if scale_ty == B1 { B8 } else { scale_ty };
+
+ let scale = scale_ty.bytes();
+ assert!(scale.is_power_of_two());
+ let scale = scale as i64;
+ let limit = 4095 * scale;
+ if value >= 0 && value <= limit && (value & (scale - 1)) == 0 {
+ Some(UImm12Scaled {
+ value: value as u16,
+ scale_ty,
+ })
+ } else {
+ None
+ }
+ }
+
+ /// Create a zero immediate of this format.
+ pub fn zero(scale_ty: Type) -> UImm12Scaled {
+ UImm12Scaled { value: 0, scale_ty }
+ }
+
+ /// Encoded bits.
+ pub fn bits(&self) -> u32 {
+ (self.value as u32 / self.scale_ty.bytes()) & 0xfff
+ }
+
+ /// Value after scaling.
+ pub fn value(&self) -> u32 {
+ self.value as u32
+ }
+
+ /// The value type which is the scaling base.
+ pub fn scale_ty(&self) -> Type {
+ self.scale_ty
+ }
+}
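+
+// Illustrative sketch (a hypothetical test, not taken from upstream): an I64
+// scale accepts multiples of 8 up to 4095 * 8 = 32760, and `bits()` returns
+// the offset divided by the scale.
+#[test]
+fn uimm12_scaled_example() {
+ assert_eq!(UImm12Scaled::maybe_from_i64(32, I64).unwrap().bits(), 4);
+ assert!(UImm12Scaled::maybe_from_i64(32768, I64).is_none());
+}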
+
+/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted
+/// left by 0 or 12 places.
+#[derive(Clone, Debug)]
+pub struct Imm12 {
+ /// The immediate bits.
+ pub bits: u16,
+ /// Whether the immediate bits are shifted left by 12 or not.
+ pub shift12: bool,
+}
+
+impl Imm12 {
+ /// Compute an Imm12 from raw bits, if possible.
+ pub fn maybe_from_u64(val: u64) -> Option<Imm12> {
+ if val == 0 {
+ Some(Imm12 {
+ bits: 0,
+ shift12: false,
+ })
+ } else if val < 0xfff {
+ Some(Imm12 {
+ bits: val as u16,
+ shift12: false,
+ })
+ } else if val < 0xfff_000 && (val & 0xfff == 0) {
+ Some(Imm12 {
+ bits: (val >> 12) as u16,
+ shift12: true,
+ })
+ } else {
+ None
+ }
+ }
+
+ /// Create a zero immediate of this format.
+ pub fn zero() -> Self {
+ Imm12 {
+ bits: 0,
+ shift12: false,
+ }
+ }
+
+ /// Bits for 2-bit "shift" field in e.g. AddI.
+ pub fn shift_bits(&self) -> u32 {
+ if self.shift12 {
+ 0b01
+ } else {
+ 0b00
+ }
+ }
+
+ /// Bits for 12-bit "imm" field in e.g. AddI.
+ pub fn imm_bits(&self) -> u32 {
+ self.bits as u32
+ }
+}
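+
+// Illustrative sketch (a hypothetical test, not taken from upstream): small
+// values are encoded unshifted, while multiples of 4096 up to 0xfff000 use
+// the shifted-by-12 form.
+#[test]
+fn imm12_example() {
+ let unshifted = Imm12::maybe_from_u64(0x123).unwrap();
+ assert_eq!((unshifted.imm_bits(), unshifted.shift_bits()), (0x123, 0b00));
+ let shifted = Imm12::maybe_from_u64(0x45_000).unwrap();
+ assert_eq!((shifted.imm_bits(), shifted.shift_bits()), (0x45, 0b01));
+ // Values needing bits both above and below bit 12 are not representable.
+ assert!(Imm12::maybe_from_u64(0x45_001).is_none());
+}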
+
+/// An immediate for logical instructions.
+#[derive(Clone, Debug, PartialEq)]
+pub struct ImmLogic {
+ /// The actual value.
+ value: u64,
+ /// `N` flag.
+ pub n: bool,
+ /// `R` field: rotate amount.
+ pub r: u8,
+ /// `S` field: element size and element bits.
+ pub s: u8,
+ /// Was this constructed for a 32-bit or 64-bit instruction?
+ pub size: OperandSize,
+}
+
+impl ImmLogic {
+ /// Compute an ImmLogic from raw bits, if possible.
+ pub fn maybe_from_u64(value: u64, ty: Type) -> Option<ImmLogic> {
+ // Note: This function is a port of VIXL's Assembler::IsImmLogical.
+
+ if ty != I64 && ty != I32 {
+ return None;
+ }
+ let operand_size = OperandSize::from_ty(ty);
+
+ let original_value = value;
+
+ let value = if ty == I32 {
+ // To handle 32-bit logical immediates, the very easiest thing is to repeat
+ // the input value twice to make a 64-bit word. The correct encoding of that
+ // as a logical immediate will also be the correct encoding of the 32-bit
+ // value.
+
+ // Avoid making the assumption that the most-significant 32 bits are zero by
+ // shifting the value left and duplicating it.
+ let value = value << 32;
+ value | value >> 32
+ } else {
+ value
+ };
+
+ // Logical immediates are encoded using parameters n, imm_s and imm_r using
+ // the following table:
+ //
+ // N imms immr size S R
+ // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr)
+ // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr)
+ // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr)
+ // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr)
+ // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr)
+ // 0 11110s xxxxxr 2 UInt(s) UInt(r)
+ // (s bits must not be all set)
+ //
+ // A pattern is constructed of size bits, where the least significant S+1 bits
+ // are set. The pattern is rotated right by R, and repeated across a 32 or
+ // 64-bit value, depending on destination register width.
+ //
+ // Put another way: the basic format of a logical immediate is a single
+ // contiguous stretch of 1 bits, repeated across the whole word at intervals
+ // given by a power of 2. To identify them quickly, we first locate the
+ // lowest stretch of 1 bits, then the next 1 bit above that; that combination
+ // is different for every logical immediate, so it gives us all the
+ // information we need to identify the only logical immediate that our input
+ // could be, and then we simply check if that's the value we actually have.
+ //
+ // (The rotation parameter does give the possibility of the stretch of 1 bits
+ // going 'round the end' of the word. To deal with that, we observe that in
+ // any situation where that happens the bitwise NOT of the value is also a
+ // valid logical immediate. So we simply invert the input whenever its low bit
+ // is set, and then we know that the rotated case can't arise.)
+ let (value, inverted) = if value & 1 == 1 {
+ (!value, true)
+ } else {
+ (value, false)
+ };
+
+ if value == 0 {
+ return None;
+ }
+
+ // The basic analysis idea: imagine our input word looks like this.
+ //
+ // 0011111000111110001111100011111000111110001111100011111000111110
+ // c b a
+ // |<--d-->|
+ //
+ // We find the lowest set bit (as an actual power-of-2 value, not its index)
+ // and call it a. Then we add a to our original number, which wipes out the
+ // bottommost stretch of set bits and replaces it with a 1 carried into the
+ // next zero bit. Then we look for the new lowest set bit, which is in
+ // position b, and subtract it, so now our number is just like the original
+ // but with the lowest stretch of set bits completely gone. Now we find the
+ // lowest set bit again, which is position c in the diagram above. Then we'll
+ // measure the distance d between bit positions a and c (using CLZ), and that
+ // tells us that the only valid logical immediate that could possibly be equal
+ // to this number is the one in which a stretch of bits running from a to just
+ // below b is replicated every d bits.
+ fn lowest_set_bit(value: u64) -> u64 {
+ let bit = value.trailing_zeros();
+ 1u64.checked_shl(bit).unwrap_or(0)
+ }
+ let a = lowest_set_bit(value);
+ assert_ne!(0, a);
+ let value_plus_a = value.wrapping_add(a);
+ let b = lowest_set_bit(value_plus_a);
+ let value_plus_a_minus_b = value_plus_a - b;
+ let c = lowest_set_bit(value_plus_a_minus_b);
+
+ let (d, clz_a, out_n, mask) = if c != 0 {
+ // The general case, in which there is more than one stretch of set bits.
+ // Compute the repeat distance d, and set up a bitmask covering the basic
+ // unit of repetition (i.e. a word with the bottom d bits set). Also, in all
+ // of these cases the N bit of the output will be zero.
+ let clz_a = a.leading_zeros();
+ let clz_c = c.leading_zeros();
+ let d = clz_a - clz_c;
+ let mask = (1 << d) - 1;
+ (d, clz_a, 0, mask)
+ } else {
+ (64, a.leading_zeros(), 1, u64::max_value())
+ };
+
+ // If the repeat period d is not a power of two, it can't be encoded.
+ if !d.is_power_of_two() {
+ return None;
+ }
+
+ if ((b.wrapping_sub(a)) & !mask) != 0 {
+ // If the bit stretch (b - a) does not fit within the mask derived from the
+ // repeat period, then fail.
+ return None;
+ }
+
+ // The only possible option is b - a repeated every d bits. Now we're going to
+ // actually construct the valid logical immediate derived from that
+ // specification, and see if it equals our original input.
+ //
+ // To repeat a value every d bits, we multiply it by a number of the form
+ // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
+ // be derived using a table lookup on CLZ(d).
+ const MULTIPLIERS: [u64; 6] = [
+ 0x0000000000000001,
+ 0x0000000100000001,
+ 0x0001000100010001,
+ 0x0101010101010101,
+ 0x1111111111111111,
+ 0x5555555555555555,
+ ];
+ let multiplier = MULTIPLIERS[(u64::from(d).leading_zeros() - 57) as usize];
+ let candidate = b.wrapping_sub(a) * multiplier;
+
+ if value != candidate {
+ // The candidate pattern doesn't match our input value, so fail.
+ return None;
+ }
+
+ // We have a match! This is a valid logical immediate, so now we have to
+ // construct the bits and pieces of the instruction encoding that generates
+ // it.
+
+ // Count the set bits in our basic stretch. The special case of clz(0) == -1
+ // makes the answer come out right for stretches that reach the very top of
+ // the word (e.g. numbers like 0xffffc00000000000).
+ let clz_b = if b == 0 {
+ u32::max_value() // -1
+ } else {
+ b.leading_zeros()
+ };
+ let s = clz_a.wrapping_sub(clz_b);
+
+ // Decide how many bits to rotate right by, to put the low bit of that basic
+ // stretch in position a.
+ let (s, r) = if inverted {
+ // If we inverted the input right at the start of this function, here's
+ // where we compensate: the number of set bits becomes the number of clear
+ // bits, and the rotation count is based on position b rather than position
+ // a (since b is the location of the 'lowest' 1 bit after inversion).
+ // Need wrapping for when clz_b is max_value() (for when b == 0).
+ (d - s, clz_b.wrapping_add(1) & (d - 1))
+ } else {
+ (s, (clz_a + 1) & (d - 1))
+ };
+
+ // Now we're done, except for having to encode the S output in such a way that
+ // it gives both the number of set bits and the length of the repeated
+ // segment. The s field is encoded like this:
+ //
+ // imms size S
+ // ssssss 64 UInt(ssssss)
+ // 0sssss 32 UInt(sssss)
+ // 10ssss 16 UInt(ssss)
+ // 110sss 8 UInt(sss)
+ // 1110ss 4 UInt(ss)
+ // 11110s 2 UInt(s)
+ //
+ // So we 'or' (2 * -d) with our computed s to form imms.
+ let s = ((d * 2).wrapping_neg() | (s - 1)) & 0x3f;
+ debug_assert!(u8::try_from(r).is_ok());
+ debug_assert!(u8::try_from(s).is_ok());
+ Some(ImmLogic {
+ value: original_value,
+ n: out_n != 0,
+ r: r as u8,
+ s: s as u8,
+ size: operand_size,
+ })
+ }
+
+ /// Returns bits ready for encoding: (N:1, R:6, S:6)
+ pub fn enc_bits(&self) -> u32 {
+ ((self.n as u32) << 12) | ((self.r as u32) << 6) | (self.s as u32)
+ }
+
+ /// Returns the value that this immediate represents.
+ pub fn value(&self) -> u64 {
+ self.value
+ }
+
+ /// Return an immediate for the bitwise-inverted value.
+ pub fn invert(&self) -> ImmLogic {
+ // For every ImmLogical immediate, the inverse can also be encoded.
+ Self::maybe_from_u64(!self.value, self.size.to_ty()).unwrap()
+ }
+
+ /// This provides a safe(ish) way to avoid the costs of `maybe_from_u64` when we want to
+ /// encode a constant that we know at compiler-build time. It constructs an `ImmLogic` from
+ /// the fields `n`, `r`, `s` and `size`, but in a debug build, checks that `value_to_check`
+ /// corresponds to those four fields. The intention is that, in a non-debug build, this
+ /// reduces to something small enough that it will be a candidate for inlining.
+ pub fn from_n_r_s(value_to_check: u64, n: bool, r: u8, s: u8, size: OperandSize) -> Self {
+ // Construct it from the components we got given.
+ let imml = Self {
+ value: value_to_check,
+ n,
+ r,
+ s,
+ size,
+ };
+
+ // In debug mode, check that `n`/`r`/`s` are correct, given `value` and `size`.
+ debug_assert!(match ImmLogic::maybe_from_u64(
+ value_to_check,
+ if size == OperandSize::Size64 {
+ I64
+ } else {
+ I32
+ }
+ ) {
+ None => false, // fail: `value` is unrepresentable
+ Some(imml_check) => imml_check == imml,
+ });
+
+ imml
+ }
+}
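+
+// Illustrative sketch (a hypothetical test, not taken from upstream): the
+// alternating pattern 0x5555...5555 repeats a single set bit every 2 bits, so
+// it encodes with N = 0, R = 0 and the size-2 `imms` prefix 11110, giving
+// S = 0b111100.
+#[test]
+fn imm_logic_example() {
+ let imml = ImmLogic::maybe_from_u64(0x5555_5555_5555_5555, I64).unwrap();
+ assert!(!imml.n);
+ assert_eq!(imml.r, 0);
+ assert_eq!(imml.s, 0b111100);
+ // A value whose runs of ones do not repeat regularly is not encodable.
+ assert!(ImmLogic::maybe_from_u64(0x1234_5678, I64).is_none());
+}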
+
+/// An immediate for shift instructions.
+#[derive(Clone, Debug)]
+pub struct ImmShift {
+ /// 6-bit shift amount.
+ pub imm: u8,
+}
+
+impl ImmShift {
+ /// Create an ImmShift from raw bits, if possible.
+ pub fn maybe_from_u64(val: u64) -> Option<ImmShift> {
+ if val < 64 {
+ Some(ImmShift { imm: val as u8 })
+ } else {
+ None
+ }
+ }
+
+ /// Get the immediate value.
+ pub fn value(&self) -> u8 {
+ self.imm
+ }
+}
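+
+// For illustration (a sketch that follows directly from `maybe_from_u64` above):
+// `ImmShift::maybe_from_u64(63)` yields `Some(ImmShift { imm: 63 })`, while
+// `ImmShift::maybe_from_u64(64)` yields `None`, since shift amounts must fit in 6 bits.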
+
+/// A 16-bit immediate for MOVZ/MOVN/MOVK instructions, with a {0,16,32,48}-bit shift.
+#[derive(Clone, Copy, Debug)]
+pub struct MoveWideConst {
+ /// The value.
+ pub bits: u16,
+ /// Result is `bits` shifted 16*shift bits to the left.
+ pub shift: u8,
+}
+
+impl MoveWideConst {
+ /// Construct a MoveWideConst from an arbitrary 64-bit constant if possible.
+ pub fn maybe_from_u64(value: u64) -> Option<MoveWideConst> {
+ let mask0 = 0x0000_0000_0000_ffffu64;
+ let mask1 = 0x0000_0000_ffff_0000u64;
+ let mask2 = 0x0000_ffff_0000_0000u64;
+ let mask3 = 0xffff_0000_0000_0000u64;
+
+ if value == (value & mask0) {
+ return Some(MoveWideConst {
+ bits: (value & mask0) as u16,
+ shift: 0,
+ });
+ }
+ if value == (value & mask1) {
+ return Some(MoveWideConst {
+ bits: ((value >> 16) & mask0) as u16,
+ shift: 1,
+ });
+ }
+ if value == (value & mask2) {
+ return Some(MoveWideConst {
+ bits: ((value >> 32) & mask0) as u16,
+ shift: 2,
+ });
+ }
+ if value == (value & mask3) {
+ return Some(MoveWideConst {
+ bits: ((value >> 48) & mask0) as u16,
+ shift: 3,
+ });
+ }
+ None
+ }
+
+    /// Create a `MoveWideConst` from a 16-bit immediate and a left-shift amount given in bits
+    /// (a multiple of 16, at most 48), if representable.
+    pub fn maybe_with_shift(imm: u16, shift: u8) -> Option<MoveWideConst> {
+ let shift_enc = shift / 16;
+ if shift_enc > 3 {
+ None
+ } else {
+ Some(MoveWideConst {
+ bits: imm,
+ shift: shift_enc,
+ })
+ }
+ }
+
+ /// Returns the value that this constant represents.
+ pub fn value(&self) -> u64 {
+ (self.bits as u64) << (16 * self.shift)
+ }
+}
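+
+// As a usage sketch (the values follow from `maybe_from_u64` above): the constant
+// 0x12_0000_0000 occupies only the third halfword, so it is representable as
+// `MoveWideConst { bits: 0x12, shift: 2 }`, i.e. a MOVZ with `LSL #32`, whereas a
+// constant such as 0x1_0001 straddles two halfwords and yields `None`.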
+
+/// Advanced SIMD modified immediate as used by MOVI/MVNI.
+#[derive(Clone, Copy, Debug)]
+pub struct ASIMDMovModImm {
+ imm: u8,
+ shift: u8,
+ shift_ones: bool,
+}
+
+impl ASIMDMovModImm {
+    /// Construct an `ASIMDMovModImm` from an arbitrary 64-bit constant, if possible.
+    /// Currently only 8-bit elements are handled; the low 8 bits of `value` are used.
+    pub fn maybe_from_u64(value: u64, size: ScalarSize) -> Option<ASIMDMovModImm> {
+ match size {
+ ScalarSize::Size8 => Some(ASIMDMovModImm {
+ imm: value as u8,
+ shift: 0,
+ shift_ones: false,
+ }),
+ _ => None,
+ }
+ }
+
+ /// Create a zero immediate of this format.
+ pub fn zero() -> Self {
+ ASIMDMovModImm {
+ imm: 0,
+ shift: 0,
+ shift_ones: false,
+ }
+ }
+
+    /// Returns the raw immediate, the shift amount and whether the shift is an MSL
+    /// (shift-ones) shift.
+    pub fn value(&self) -> (u8, u32, bool) {
+ (self.imm, self.shift as u32, self.shift_ones)
+ }
+}
+
+impl PrettyPrint for NZCV {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ let fmt = |c: char, v| if v { c.to_ascii_uppercase() } else { c };
+ format!(
+ "#{}{}{}{}",
+ fmt('n', self.n),
+ fmt('z', self.z),
+ fmt('c', self.c),
+ fmt('v', self.v)
+ )
+ }
+}
+
+impl PrettyPrint for UImm5 {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for Imm12 {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ let shift = if self.shift12 { 12 } else { 0 };
+ let value = u32::from(self.bits) << shift;
+ format!("#{}", value)
+ }
+}
+
+impl PrettyPrint for SImm7Scaled {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for FPULeftShiftImm {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.amount)
+ }
+}
+
+impl PrettyPrint for FPURightShiftImm {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.amount)
+ }
+}
+
+impl PrettyPrint for SImm9 {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for UImm12Scaled {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for ImmLogic {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value())
+ }
+}
+
+impl PrettyPrint for ImmShift {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.imm)
+ }
+}
+
+impl PrettyPrint for MoveWideConst {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ if self.shift == 0 {
+ format!("#{}", self.bits)
+ } else {
+ format!("#{}, LSL #{}", self.bits, self.shift * 16)
+ }
+ }
+}
+
+impl PrettyPrint for ASIMDMovModImm {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ if self.shift == 0 {
+ format!("#{}", self.imm)
+ } else {
+ let shift_type = if self.shift_ones { "MSL" } else { "LSL" };
+ format!("#{}, {} #{}", self.imm, shift_type, self.shift)
+ }
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn imm_logical_test() {
+ assert_eq!(None, ImmLogic::maybe_from_u64(0, I64));
+ assert_eq!(None, ImmLogic::maybe_from_u64(u64::max_value(), I64));
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 1,
+ n: true,
+ r: 0,
+ s: 0,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(1, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 2,
+ n: true,
+ r: 63,
+ s: 0,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(2, I64)
+ );
+
+ assert_eq!(None, ImmLogic::maybe_from_u64(5, I64));
+
+ assert_eq!(None, ImmLogic::maybe_from_u64(11, I64));
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 248,
+ n: true,
+ r: 61,
+ s: 4,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(248, I64)
+ );
+
+ assert_eq!(None, ImmLogic::maybe_from_u64(249, I64));
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 1920,
+ n: true,
+ r: 57,
+ s: 3,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(1920, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x7ffe,
+ n: true,
+ r: 63,
+ s: 13,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x7ffe, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x30000,
+ n: true,
+ r: 48,
+ s: 1,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x30000, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x100000,
+ n: true,
+ r: 44,
+ s: 0,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x100000, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: u64::max_value() - 1,
+ n: true,
+ r: 63,
+ s: 62,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(u64::max_value() - 1, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0xaaaaaaaaaaaaaaaa,
+ n: false,
+ r: 1,
+ s: 60,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x8181818181818181,
+ n: false,
+ r: 1,
+ s: 49,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x8181818181818181, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0xffc3ffc3ffc3ffc3,
+ n: false,
+ r: 10,
+ s: 43,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x100000001,
+ n: false,
+ r: 0,
+ s: 0,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x100000001, I64)
+ );
+
+ assert_eq!(
+ Some(ImmLogic {
+ value: 0x1111111111111111,
+ n: false,
+ r: 0,
+ s: 56,
+ size: OperandSize::Size64,
+ }),
+ ImmLogic::maybe_from_u64(0x1111111111111111, I64)
+ );
+
+ for n in 0..2 {
+ let types = if n == 0 { vec![I64, I32] } else { vec![I64] };
+ for s in 0..64 {
+ for r in 0..64 {
+ let imm = get_logical_imm(n, s, r);
+ for &ty in &types {
+ match ImmLogic::maybe_from_u64(imm, ty) {
+ Some(ImmLogic { value, .. }) => {
+ assert_eq!(imm, value);
+ ImmLogic::maybe_from_u64(!value, ty).unwrap();
+ }
+ None => assert_eq!(0, imm),
+ };
+ }
+ }
+ }
+ }
+ }
+
+ // Repeat a value that has `width` bits, across a 64-bit value.
+ fn repeat(value: u64, width: u64) -> u64 {
+ let mut result = value & ((1 << width) - 1);
+ let mut i = width;
+ while i < 64 {
+ result |= result << i;
+ i *= 2;
+ }
+ result
+ }
+
+ // Get the logical immediate, from the encoding N/R/S bits.
+ fn get_logical_imm(n: u32, s: u32, r: u32) -> u64 {
+ // An integer is constructed from the n, imm_s and imm_r bits according to
+ // the following table:
+ //
+ // N imms immr size S R
+ // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr)
+ // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr)
+ // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr)
+ // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr)
+ // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr)
+ // 0 11110s xxxxxr 2 UInt(s) UInt(r)
+ // (s bits must not be all set)
+ //
+ // A pattern is constructed of size bits, where the least significant S+1
+ // bits are set. The pattern is rotated right by R, and repeated across a
+ // 64-bit value.
+
+ if n == 1 {
+ if s == 0x3f {
+ return 0;
+ }
+ let bits = (1u64 << (s + 1)) - 1;
+ bits.rotate_right(r)
+ } else {
+ if (s >> 1) == 0x1f {
+ return 0;
+ }
+ let mut width = 0x20;
+ while width >= 0x2 {
+ if (s & width) == 0 {
+ let mask = width - 1;
+ if (s & mask) == mask {
+ return 0;
+ }
+ let bits = (1u64 << ((s & mask) + 1)) - 1;
+ return repeat(bits.rotate_right(r & mask), width.into());
+ }
+ width >>= 1;
+ }
+ unreachable!();
+ }
+ }
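+
+    // A brief extra check of the small immediate helpers above (a sketch; the expected
+    // values follow directly from the constructors defined in this file).
+    #[test]
+    fn move_wide_and_shift_imm_test() {
+        let c = MoveWideConst::maybe_from_u64(0xffff_0000).unwrap();
+        assert_eq!(c.bits, 0xffff);
+        assert_eq!(c.shift, 1);
+        assert_eq!(c.value(), 0xffff_0000);
+        // A constant spanning two halfwords cannot be encoded by a single MOVZ.
+        assert!(MoveWideConst::maybe_from_u64(0x1_0001).is_none());
+
+        // Shift amounts must fit in 6 bits.
+        assert_eq!(ImmShift::maybe_from_u64(63).unwrap().value(), 63);
+        assert!(ImmShift::maybe_from_u64(64).is_none());
+    }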
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs
new file mode 100644
index 0000000000..278302018e
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs
@@ -0,0 +1,4057 @@
+//! This module defines aarch64-specific machine instruction types.
+
+// Some variants are not constructed, but we still want them as options in the future.
+#![allow(dead_code)]
+
+use crate::binemit::CodeOffset;
+use crate::ir::types::{
+ B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
+ I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
+};
+use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, TrapCode, Type};
+use crate::isa::CallConv;
+use crate::machinst::*;
+use crate::{settings, CodegenError, CodegenResult};
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable};
+use regalloc::{RegUsageCollector, RegUsageMapper};
+
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use core::convert::TryFrom;
+use smallvec::{smallvec, SmallVec};
+use std::string::{String, ToString};
+
+pub mod regs;
+pub use self::regs::*;
+pub mod imms;
+pub use self::imms::*;
+pub mod args;
+pub use self::args::*;
+pub mod emit;
+pub use self::emit::*;
+pub mod unwind;
+
+#[cfg(test)]
+mod emit_tests;
+
+//=============================================================================
+// Instructions (top level): definition
+
+/// An ALU operation. This can be paired with several instruction formats
+/// below (see `Inst`) in any combination.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum ALUOp {
+ Add32,
+ Add64,
+ Sub32,
+ Sub64,
+ Orr32,
+ Orr64,
+ OrrNot32,
+ OrrNot64,
+ And32,
+ And64,
+ AndNot32,
+ AndNot64,
+ /// XOR (AArch64 calls this "EOR")
+ Eor32,
+ /// XOR (AArch64 calls this "EOR")
+ Eor64,
+ /// XNOR (AArch64 calls this "EOR-NOT")
+ EorNot32,
+ /// XNOR (AArch64 calls this "EOR-NOT")
+ EorNot64,
+ /// Add, setting flags
+ AddS32,
+ /// Add, setting flags
+ AddS64,
+ /// Sub, setting flags
+ SubS32,
+ /// Sub, setting flags
+ SubS64,
+ /// Signed multiply, high-word result
+ SMulH,
+ /// Unsigned multiply, high-word result
+ UMulH,
+ SDiv64,
+ UDiv64,
+ RotR32,
+ RotR64,
+ Lsr32,
+ Lsr64,
+ Asr32,
+ Asr64,
+ Lsl32,
+ Lsl64,
+}
+
+/// An ALU operation with three arguments.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum ALUOp3 {
+ /// Multiply-add
+ MAdd32,
+ /// Multiply-add
+ MAdd64,
+ /// Multiply-sub
+ MSub32,
+ /// Multiply-sub
+ MSub64,
+}
+
+/// A floating-point unit (FPU) operation with one arg.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum FPUOp1 {
+ Abs32,
+ Abs64,
+ Neg32,
+ Neg64,
+ Sqrt32,
+ Sqrt64,
+ Cvt32To64,
+ Cvt64To32,
+}
+
+/// A floating-point unit (FPU) operation with two args.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum FPUOp2 {
+ Add32,
+ Add64,
+ Sub32,
+ Sub64,
+ Mul32,
+ Mul64,
+ Div32,
+ Div64,
+ Max32,
+ Max64,
+ Min32,
+ Min64,
+ /// Signed saturating add
+ Sqadd64,
+ /// Unsigned saturating add
+ Uqadd64,
+ /// Signed saturating subtract
+ Sqsub64,
+ /// Unsigned saturating subtract
+ Uqsub64,
+}
+
+/// A floating-point unit (FPU) operation with two args, a register and an immediate.
+#[derive(Copy, Clone, Debug)]
+pub enum FPUOpRI {
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr32(FPURightShiftImm),
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr64(FPURightShiftImm),
+ /// Shift left and insert. Rd |= Rn << #imm
+ Sli32(FPULeftShiftImm),
+ /// Shift left and insert. Rd |= Rn << #imm
+ Sli64(FPULeftShiftImm),
+}
+
+/// A floating-point unit (FPU) operation with three args.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum FPUOp3 {
+ MAdd32,
+ MAdd64,
+}
+
+/// A conversion from an FP to an integer value.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum FpuToIntOp {
+ F32ToU32,
+ F32ToI32,
+ F32ToU64,
+ F32ToI64,
+ F64ToU32,
+ F64ToI32,
+ F64ToU64,
+ F64ToI64,
+}
+
+/// A conversion from an integer to an FP value.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum IntToFpuOp {
+ U32ToF32,
+ I32ToF32,
+ U32ToF64,
+ I32ToF64,
+ U64ToF32,
+ I64ToF32,
+ U64ToF64,
+ I64ToF64,
+}
+
+/// Modes for FP rounding ops: round down (floor) or up (ceil), or toward zero (trunc), or to
+/// nearest, and for 32- or 64-bit FP values.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum FpuRoundMode {
+ Minus32,
+ Minus64,
+ Plus32,
+ Plus64,
+ Zero32,
+ Zero64,
+ Nearest32,
+ Nearest64,
+}
+
+/// Type of vector element extensions.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecExtendOp {
+ /// Signed extension of 8-bit elements
+ Sxtl8,
+ /// Signed extension of 16-bit elements
+ Sxtl16,
+ /// Signed extension of 32-bit elements
+ Sxtl32,
+ /// Unsigned extension of 8-bit elements
+ Uxtl8,
+ /// Unsigned extension of 16-bit elements
+ Uxtl16,
+ /// Unsigned extension of 32-bit elements
+ Uxtl32,
+}
+
+/// A vector ALU operation.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecALUOp {
+ /// Signed saturating add
+ Sqadd,
+ /// Unsigned saturating add
+ Uqadd,
+ /// Signed saturating subtract
+ Sqsub,
+ /// Unsigned saturating subtract
+ Uqsub,
+ /// Compare bitwise equal
+ Cmeq,
+ /// Compare signed greater than or equal
+ Cmge,
+ /// Compare signed greater than
+ Cmgt,
+    /// Compare unsigned higher or same
+    Cmhs,
+    /// Compare unsigned higher
+    Cmhi,
+ /// Floating-point compare equal
+ Fcmeq,
+ /// Floating-point compare greater than
+ Fcmgt,
+ /// Floating-point compare greater than or equal
+ Fcmge,
+ /// Bitwise and
+ And,
+ /// Bitwise bit clear
+ Bic,
+ /// Bitwise inclusive or
+ Orr,
+ /// Bitwise exclusive or
+ Eor,
+ /// Bitwise select
+ Bsl,
+ /// Unsigned maximum pairwise
+ Umaxp,
+ /// Add
+ Add,
+ /// Subtract
+ Sub,
+ /// Multiply
+ Mul,
+ /// Signed shift left
+ Sshl,
+ /// Unsigned shift left
+ Ushl,
+ /// Unsigned minimum
+ Umin,
+ /// Signed minimum
+ Smin,
+ /// Unsigned maximum
+ Umax,
+ /// Signed maximum
+ Smax,
+ /// Unsigned rounding halving add
+ Urhadd,
+ /// Floating-point add
+ Fadd,
+ /// Floating-point subtract
+ Fsub,
+ /// Floating-point divide
+ Fdiv,
+ /// Floating-point maximum
+ Fmax,
+ /// Floating-point minimum
+ Fmin,
+ /// Floating-point multiply
+ Fmul,
+ /// Add pairwise
+ Addp,
+ /// Unsigned multiply add long
+ Umlal,
+    /// Zip vectors (primary) [meaning, lower halves]
+ Zip1,
+ /// Signed multiply long (low halves)
+ Smull,
+ /// Signed multiply long (high halves)
+ Smull2,
+}
+
+/// A Vector miscellaneous operation with two registers.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecMisc2 {
+ /// Bitwise NOT
+ Not,
+ /// Negate
+ Neg,
+ /// Absolute value
+ Abs,
+ /// Floating-point absolute value
+ Fabs,
+ /// Floating-point negate
+ Fneg,
+ /// Floating-point square root
+ Fsqrt,
+ /// Reverse elements in 64-bit doublewords
+ Rev64,
+ /// Shift left long (by element size)
+ Shll,
+ /// Floating-point convert to signed integer, rounding toward zero
+ Fcvtzs,
+ /// Floating-point convert to unsigned integer, rounding toward zero
+ Fcvtzu,
+ /// Signed integer convert to floating-point
+ Scvtf,
+ /// Unsigned integer convert to floating-point
+ Ucvtf,
+ /// Floating point round to integral, rounding towards nearest
+ Frintn,
+ /// Floating point round to integral, rounding towards zero
+ Frintz,
+ /// Floating point round to integral, rounding towards minus infinity
+ Frintm,
+ /// Floating point round to integral, rounding towards plus infinity
+ Frintp,
+}
+
+/// A Vector narrowing operation with two registers.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecMiscNarrowOp {
+ /// Extract Narrow
+ Xtn,
+ /// Signed saturating extract narrow
+ Sqxtn,
+ /// Signed saturating extract unsigned narrow
+ Sqxtun,
+}
+
+/// An operation across the lanes of vectors.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecLanesOp {
+ /// Integer addition across a vector
+ Addv,
+ /// Unsigned minimum across a vector
+ Uminv,
+}
+
+/// A shift-by-immediate operation on each lane of a vector.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecShiftImmOp {
+    /// Shift left
+    Shl,
+    /// Unsigned shift right
+    Ushr,
+    /// Signed shift right
+    Sshr,
+}
+
+/// An operation on the bits of a register. This can be paired with several instruction formats
+/// below (see `Inst`) in any combination.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum BitOp {
+ /// Bit reverse
+ RBit32,
+ /// Bit reverse
+ RBit64,
+ Clz32,
+ Clz64,
+ Cls32,
+ Cls64,
+}
+
+impl BitOp {
+ /// What is the opcode's native width?
+ pub fn operand_size(&self) -> OperandSize {
+ match self {
+ BitOp::RBit32 | BitOp::Clz32 | BitOp::Cls32 => OperandSize::Size32,
+ _ => OperandSize::Size64,
+ }
+ }
+
+ /// Get the assembly mnemonic for this opcode.
+ pub fn op_str(&self) -> &'static str {
+ match self {
+ BitOp::RBit32 | BitOp::RBit64 => "rbit",
+ BitOp::Clz32 | BitOp::Clz64 => "clz",
+ BitOp::Cls32 | BitOp::Cls64 => "cls",
+ }
+ }
+}
+
+impl From<(Opcode, Type)> for BitOp {
+ /// Get the BitOp from the IR opcode.
+ fn from(op_ty: (Opcode, Type)) -> BitOp {
+ match op_ty {
+ (Opcode::Bitrev, I32) => BitOp::RBit32,
+ (Opcode::Bitrev, I64) => BitOp::RBit64,
+ (Opcode::Clz, I32) => BitOp::Clz32,
+ (Opcode::Clz, I64) => BitOp::Clz64,
+ (Opcode::Cls, I32) => BitOp::Cls32,
+ (Opcode::Cls, I64) => BitOp::Cls64,
+ _ => unreachable!("Called with non-bit op!: {:?}", op_ty),
+ }
+ }
+}
+
+/// Additional information for (direct) Call instructions, left out of line to lower the size of
+/// the Inst enum.
+#[derive(Clone, Debug)]
+pub struct CallInfo {
+ pub dest: ExternalName,
+ pub uses: Vec<Reg>,
+ pub defs: Vec<Writable<Reg>>,
+ pub opcode: Opcode,
+ pub caller_callconv: CallConv,
+ pub callee_callconv: CallConv,
+}
+
+/// Additional information for CallInd instructions, left out of line to lower the size of the Inst
+/// enum.
+#[derive(Clone, Debug)]
+pub struct CallIndInfo {
+ pub rn: Reg,
+ pub uses: Vec<Reg>,
+ pub defs: Vec<Writable<Reg>>,
+ pub opcode: Opcode,
+ pub caller_callconv: CallConv,
+ pub callee_callconv: CallConv,
+}
+
+/// Additional information for JTSequence instructions, left out of line to lower the size of the Inst
+/// enum.
+#[derive(Clone, Debug)]
+pub struct JTSequenceInfo {
+ pub targets: Vec<BranchTarget>,
+ pub default_target: BranchTarget,
+ pub targets_for_term: Vec<MachLabel>, // needed for MachTerminator.
+}
+
+/// Instruction formats.
+#[derive(Clone, Debug)]
+pub enum Inst {
+ /// A no-op of zero size.
+ Nop0,
+
+ /// A no-op that is one instruction large.
+ Nop4,
+
+ /// An ALU operation with two register sources and a register destination.
+ AluRRR {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ },
+ /// An ALU operation with three register sources and a register destination.
+ AluRRRR {
+ alu_op: ALUOp3,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ ra: Reg,
+ },
+ /// An ALU operation with a register source and an immediate-12 source, and a register
+ /// destination.
+ AluRRImm12 {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ imm12: Imm12,
+ },
+ /// An ALU operation with a register source and an immediate-logic source, and a register destination.
+ AluRRImmLogic {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ imml: ImmLogic,
+ },
+ /// An ALU operation with a register source and an immediate-shiftamt source, and a register destination.
+ AluRRImmShift {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ immshift: ImmShift,
+ },
+ /// An ALU operation with two register sources, one of which can be shifted, and a register
+ /// destination.
+ AluRRRShift {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ shiftop: ShiftOpAndAmt,
+ },
+ /// An ALU operation with two register sources, one of which can be {zero,sign}-extended and
+ /// shifted, and a register destination.
+ AluRRRExtend {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ extendop: ExtendOp,
+ },
+
+ /// A bit op instruction with a single register source.
+ BitRR {
+ op: BitOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// An unsigned (zero-extending) 8-bit load.
+ ULoad8 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A signed (sign-extending) 8-bit load.
+ SLoad8 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// An unsigned (zero-extending) 16-bit load.
+ ULoad16 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A signed (sign-extending) 16-bit load.
+ SLoad16 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// An unsigned (zero-extending) 32-bit load.
+ ULoad32 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A signed (sign-extending) 32-bit load.
+ SLoad32 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A 64-bit load.
+ ULoad64 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+
+ /// An 8-bit store.
+ Store8 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A 16-bit store.
+ Store16 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A 32-bit store.
+ Store32 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// A 64-bit store.
+ Store64 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+
+ /// A store of a pair of registers.
+ StoreP64 {
+ rt: Reg,
+ rt2: Reg,
+ mem: PairAMode,
+ flags: MemFlags,
+ },
+ /// A load of a pair of registers.
+ LoadP64 {
+ rt: Writable<Reg>,
+ rt2: Writable<Reg>,
+ mem: PairAMode,
+ flags: MemFlags,
+ },
+
+ /// A MOV instruction. These are encoded as ORR's (AluRRR form) but we
+ /// keep them separate at the `Inst` level for better pretty-printing
+ /// and faster `is_move()` logic.
+ Mov64 {
+ rd: Writable<Reg>,
+ rm: Reg,
+ },
+
+ /// A 32-bit MOV. Zeroes the top 32 bits of the destination. This is
+ /// effectively an alias for an unsigned 32-to-64-bit extension.
+ Mov32 {
+ rd: Writable<Reg>,
+ rm: Reg,
+ },
+
+ /// A MOVZ with a 16-bit immediate.
+ MovZ {
+ rd: Writable<Reg>,
+ imm: MoveWideConst,
+ size: OperandSize,
+ },
+
+ /// A MOVN with a 16-bit immediate.
+ MovN {
+ rd: Writable<Reg>,
+ imm: MoveWideConst,
+ size: OperandSize,
+ },
+
+ /// A MOVK with a 16-bit immediate.
+ MovK {
+ rd: Writable<Reg>,
+ imm: MoveWideConst,
+ size: OperandSize,
+ },
+
+ /// A sign- or zero-extend operation.
+ Extend {
+ rd: Writable<Reg>,
+ rn: Reg,
+ signed: bool,
+ from_bits: u8,
+ to_bits: u8,
+ },
+
+ /// A conditional-select operation.
+ CSel {
+ rd: Writable<Reg>,
+ cond: Cond,
+ rn: Reg,
+ rm: Reg,
+ },
+
+ /// A conditional-set operation.
+ CSet {
+ rd: Writable<Reg>,
+ cond: Cond,
+ },
+
+ /// A conditional comparison with an immediate.
+ CCmpImm {
+ size: OperandSize,
+ rn: Reg,
+ imm: UImm5,
+ nzcv: NZCV,
+ cond: Cond,
+ },
+
+ /// A synthetic insn, which is a load-linked store-conditional loop, that has the overall
+ /// effect of atomically modifying a memory location in a particular way. Because we have
+    /// no way to describe earlyclobber registers to the register allocator, this instruction has
+ /// completely fixed operand registers, and we rely on the RA's coalescing to remove copies
+ /// in the surrounding code to the extent it can. The sequence is both preceded and
+ /// followed by a fence which is at least as comprehensive as that of the `Fence`
+ /// instruction below. This instruction is sequentially consistent. The operand
+ /// conventions are:
+ ///
+ /// x25 (rd) address
+ /// x26 (rd) second operand for `op`
+ /// x27 (wr) old value
+ /// x24 (wr) scratch reg; value afterwards has no meaning
+ /// x28 (wr) scratch reg; value afterwards has no meaning
+ AtomicRMW {
+ ty: Type, // I8, I16, I32 or I64
+ op: inst_common::AtomicRmwOp,
+ },
+
+ /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
+    /// store-conditional loop. (Although we could possibly implement it more directly using the
+    /// CAS instructions available from ARMv8.1 onwards.) The sequence is
+ /// both preceded and followed by a fence which is at least as comprehensive as that of the
+ /// `Fence` instruction below. This instruction is sequentially consistent. Note that the
+ /// operand conventions, although very similar to AtomicRMW, are different:
+ ///
+ /// x25 (rd) address
+ /// x26 (rd) expected value
+ /// x28 (rd) replacement value
+ /// x27 (wr) old value
+ /// x24 (wr) scratch reg; value afterwards has no meaning
+ AtomicCAS {
+ ty: Type, // I8, I16, I32 or I64
+ },
+
+ /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it
+ /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as
+ /// that of the `Fence` instruction below. This instruction is sequentially consistent.
+ AtomicLoad {
+ ty: Type, // I8, I16, I32 or I64
+ r_data: Writable<Reg>,
+ r_addr: Reg,
+ },
+
+ /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence
+ /// instruction following the store. The fence is at least as comprehensive as that of the
+ /// `Fence` instruction below. This instruction is sequentially consistent.
+ AtomicStore {
+ ty: Type, // I8, I16, I32 or I64
+ r_data: Reg,
+ r_addr: Reg,
+ },
+
+ /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads
+ /// nor stores may move forwards or backwards across the fence. Currently emitted as "dmb
+ /// ish". This instruction is sequentially consistent.
+ Fence,
+
+ /// FPU move. Note that this is distinct from a vector-register
+ /// move; moving just 64 bits seems to be significantly faster.
+ FpuMove64 {
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// Vector register move.
+ FpuMove128 {
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// Move to scalar from a vector element.
+ FpuMoveFromVec {
+ rd: Writable<Reg>,
+ rn: Reg,
+ idx: u8,
+ size: VectorSize,
+ },
+
+ /// 1-op FPU instruction.
+ FpuRR {
+ fpu_op: FPUOp1,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// 2-op FPU instruction.
+ FpuRRR {
+ fpu_op: FPUOp2,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ },
+
+ FpuRRI {
+ fpu_op: FPUOpRI,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// 3-op FPU instruction.
+ FpuRRRR {
+ fpu_op: FPUOp3,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ ra: Reg,
+ },
+
+ /// FPU comparison, single-precision (32 bit).
+ FpuCmp32 {
+ rn: Reg,
+ rm: Reg,
+ },
+
+ /// FPU comparison, double-precision (64 bit).
+ FpuCmp64 {
+ rn: Reg,
+ rm: Reg,
+ },
+
+ /// Floating-point load, single-precision (32 bit).
+ FpuLoad32 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// Floating-point store, single-precision (32 bit).
+ FpuStore32 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// Floating-point load, double-precision (64 bit).
+ FpuLoad64 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// Floating-point store, double-precision (64 bit).
+ FpuStore64 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// Floating-point/vector load, 128 bit.
+ FpuLoad128 {
+ rd: Writable<Reg>,
+ mem: AMode,
+ flags: MemFlags,
+ },
+ /// Floating-point/vector store, 128 bit.
+ FpuStore128 {
+ rd: Reg,
+ mem: AMode,
+ flags: MemFlags,
+ },
+
+ LoadFpuConst64 {
+ rd: Writable<Reg>,
+ const_data: u64,
+ },
+
+ LoadFpuConst128 {
+ rd: Writable<Reg>,
+ const_data: u128,
+ },
+
+ /// Conversion: FP -> integer.
+ FpuToInt {
+ op: FpuToIntOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// Conversion: integer -> FP.
+ IntToFpu {
+ op: IntToFpuOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// FP conditional select, 32 bit.
+ FpuCSel32 {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ cond: Cond,
+ },
+ /// FP conditional select, 64 bit.
+ FpuCSel64 {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ cond: Cond,
+ },
+
+ /// Round to integer.
+ FpuRound {
+ op: FpuRoundMode,
+ rd: Writable<Reg>,
+ rn: Reg,
+ },
+
+ /// Move from a GPR to a vector register. The scalar value is parked in the lowest lane
+ /// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
+ /// transactions are supported.
+ MovToFpu {
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: ScalarSize,
+ },
+
+ /// Move to a vector element from a GPR.
+ MovToVec {
+ rd: Writable<Reg>,
+ rn: Reg,
+ idx: u8,
+ size: VectorSize,
+ },
+
+ /// Unsigned move from a vector element to a GPR.
+ MovFromVec {
+ rd: Writable<Reg>,
+ rn: Reg,
+ idx: u8,
+ size: VectorSize,
+ },
+
+ /// Signed move from a vector element to a GPR.
+ MovFromVecSigned {
+ rd: Writable<Reg>,
+ rn: Reg,
+ idx: u8,
+ size: VectorSize,
+ scalar_size: OperandSize,
+ },
+
+ /// Duplicate general-purpose register to vector.
+ VecDup {
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ },
+
+ /// Duplicate scalar to vector.
+ VecDupFromFpu {
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ },
+
+ /// Duplicate immediate to vector.
+ VecDupImm {
+ rd: Writable<Reg>,
+ imm: ASIMDMovModImm,
+ invert: bool,
+ size: VectorSize,
+ },
+
+ /// Vector extend.
+ VecExtend {
+ t: VecExtendOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ high_half: bool,
+ },
+
+ /// Move vector element to another vector element.
+ VecMovElement {
+ rd: Writable<Reg>,
+ rn: Reg,
+ dest_idx: u8,
+ src_idx: u8,
+ size: VectorSize,
+ },
+
+ /// Vector narrowing operation.
+ VecMiscNarrow {
+ op: VecMiscNarrowOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ high_half: bool,
+ },
+
+ /// A vector ALU op.
+ VecRRR {
+ alu_op: VecALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ size: VectorSize,
+ },
+
+ /// Vector two register miscellaneous instruction.
+ VecMisc {
+ op: VecMisc2,
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ },
+
+ /// Vector instruction across lanes.
+ VecLanes {
+ op: VecLanesOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ },
+
+ /// Vector shift by immediate: Shift Left (immediate), Unsigned Shift Right (immediate),
+ /// Signed Shift Right (immediate). These are somewhat unusual in that, for right shifts,
+ /// the allowed range of `imm` values is 1 to lane-size-in-bits, inclusive. A zero
+ /// right-shift cannot be encoded. Left shifts are "normal", though, having valid `imm`
+ /// values from 0 to lane-size-in-bits - 1 inclusive.
+ VecShiftImm {
+ op: VecShiftImmOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ imm: u8,
+ },
+
+ /// Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes
+ /// of `rm` followed by the uppermost `16 - imm4` bytes of `rn`.
+ VecExtract {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ imm4: u8,
+ },
+
+ /// Table vector lookup - single register table. The table consists of 8-bit elements and is
+ /// stored in `rn`, while `rm` contains 8-bit element indices. `is_extension` specifies whether
+ /// to emit a TBX or a TBL instruction, i.e. whether to leave the elements in the destination
+ /// vector that correspond to out-of-range indices (greater than 15) unmodified or to set them
+ /// to 0.
+ VecTbl {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ is_extension: bool,
+ },
+
+ /// Table vector lookup - two register table. The table consists of 8-bit elements and is
+ /// stored in `rn` and `rn2`, while `rm` contains 8-bit element indices. `is_extension`
+ /// specifies whether to emit a TBX or a TBL instruction, i.e. whether to leave the elements in
+ /// the destination vector that correspond to out-of-range indices (greater than 31) unmodified
+ /// or to set them to 0. The table registers `rn` and `rn2` must have consecutive numbers
+ /// modulo 32, that is v31 and v0 (in that order) are consecutive registers.
+ VecTbl2 {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rn2: Reg,
+ rm: Reg,
+ is_extension: bool,
+ },
+
+ /// Load an element and replicate to all lanes of a vector.
+ VecLoadReplicate {
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ },
+
+ /// Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn
+ /// control-flow diamond.
+ VecCSel {
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ cond: Cond,
+ },
+
+ /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
+ MovToNZCV {
+ rn: Reg,
+ },
+
+ /// Move from the NZCV flags (actually a `MRS Xn, NZCV` insn).
+ MovFromNZCV {
+ rd: Writable<Reg>,
+ },
+
+ /// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation
+ /// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the
+ /// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit
+ /// target.
+ Call {
+ info: Box<CallInfo>,
+ },
+ /// A machine indirect-call instruction.
+ CallInd {
+ info: Box<CallIndInfo>,
+ },
+
+ // ---- branches (exactly one must appear at end of BB) ----
+ /// A machine return instruction.
+ Ret,
+
+ /// A placeholder instruction, generating no code, meaning that a function epilogue must be
+ /// inserted there.
+ EpiloguePlaceholder,
+
+ /// An unconditional branch.
+ Jump {
+ dest: BranchTarget,
+ },
+
+ /// A conditional branch. Contains two targets; at emission time, both are emitted, but
+ /// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the
+ /// choice of taken/not_taken (inverting the branch polarity as needed) based on the
+ /// fallthrough at the time of lowering.
+ CondBr {
+ taken: BranchTarget,
+ not_taken: BranchTarget,
+ kind: CondBrKind,
+ },
+
+ /// A conditional trap: execute a `udf` if the condition is true. This is
+ /// one VCode instruction because it uses embedded control flow; it is
+ /// logically a single-in, single-out region, but needs to appear as one
+ /// unit to the register allocator.
+ ///
+ /// The `CondBrKind` gives the conditional-branch condition that will
+ /// *execute* the embedded `Inst`. (In the emitted code, we use the inverse
+ /// of this condition in a branch that skips the trap instruction.)
+ TrapIf {
+ kind: CondBrKind,
+ trap_code: TrapCode,
+ },
+
+ /// An indirect branch through a register, augmented with set of all
+ /// possible successors.
+ IndirectBr {
+ rn: Reg,
+ targets: Vec<MachLabel>,
+ },
+
+ /// A "break" instruction, used for e.g. traps and debug breakpoints.
+ Brk,
+
+ /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at
+ /// runtime.
+ Udf {
+ trap_code: TrapCode,
+ },
+
+ /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR`
+ /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
+ /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
+ /// need full `MemLabel` support.
+ Adr {
+ rd: Writable<Reg>,
+ /// Offset in range -2^20 .. 2^20.
+ off: i32,
+ },
+
+ /// Raw 32-bit word, used for inline constants and jump-table entries.
+ Word4 {
+ data: u32,
+ },
+
+ /// Raw 64-bit word, used for inline constants.
+ Word8 {
+ data: u64,
+ },
+
+ /// Jump-table sequence, as one compound instruction (see note in lower_inst.rs for rationale).
+ JTSequence {
+ info: Box<JTSequenceInfo>,
+ ridx: Reg,
+ rtmp1: Writable<Reg>,
+ rtmp2: Writable<Reg>,
+ },
+
+ /// Load an inline symbol reference.
+ LoadExtName {
+ rd: Writable<Reg>,
+ name: Box<ExternalName>,
+ offset: i64,
+ },
+
+ /// Load address referenced by `mem` into `rd`.
+ LoadAddr {
+ rd: Writable<Reg>,
+ mem: AMode,
+ },
+
+ /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
+ /// controls how AMode::NominalSPOffset args are lowered.
+ VirtualSPOffsetAdj {
+ offset: i64,
+ },
+
+ /// Meta-insn, no-op in generated code: emit constant/branch veneer island
+ /// at this point (with a guard jump around it) if less than the needed
+ /// space is available before the next branch deadline. See the `MachBuffer`
+ /// implementation in `machinst/buffer.rs` for the overall algorithm. In
+ /// brief, we retain a set of "pending/unresolved label references" from
+ /// branches as we scan forward through instructions to emit machine code;
+ /// if we notice we're about to go out of range on an unresolved reference,
+ /// we stop, emit a bunch of "veneers" (branches in a form that has a longer
+ /// range, e.g. a 26-bit-offset unconditional jump), and point the original
+ /// label references to those. This is an "island" because it comes in the
+ /// middle of the code.
+ ///
+ /// This meta-instruction is a necessary part of the logic that determines
+ /// where to place islands. Ordinarily, we want to place them between basic
+ /// blocks, so we compute the worst-case size of each block, and emit the
+ /// island before starting a block if we would exceed a deadline before the
+ /// end of the block. However, some sequences (such as an inline jumptable)
+ /// are variable-length and not accounted for by this logic; so these
+ /// lowered sequences include an `EmitIsland` to trigger island generation
+ /// where necessary.
+ EmitIsland {
+ /// The needed space before the next deadline.
+ needed_space: CodeOffset,
+ },
+}
+
+fn count_zero_half_words(mut value: u64, num_half_words: u8) -> usize {
+ let mut count = 0;
+ for _ in 0..num_half_words {
+ if value & 0xffff == 0 {
+ count += 1;
+ }
+ value >>= 16;
+ }
+
+ count
+}
+
+#[test]
+fn inst_size_test() {
+    // This test guards against unintentionally growing the size of the Inst enum.
+ assert_eq!(32, std::mem::size_of::<Inst>());
+}
+
+impl Inst {
+ /// Create a move instruction.
+ pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
+ assert!(to_reg.to_reg().get_class() == from_reg.get_class());
+ if from_reg.get_class() == RegClass::I64 {
+ Inst::Mov64 {
+ rd: to_reg,
+ rm: from_reg,
+ }
+ } else if from_reg.get_class() == RegClass::V128 {
+ Inst::FpuMove128 {
+ rd: to_reg,
+ rn: from_reg,
+ }
+ } else {
+ Inst::FpuMove64 {
+ rd: to_reg,
+ rn: from_reg,
+ }
+ }
+ }
+
+ /// Create a 32-bit move instruction.
+ pub fn mov32(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
+ Inst::Mov32 {
+ rd: to_reg,
+ rm: from_reg,
+ }
+ }
+
+    /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
+    /// a logical immediate, or a MOVZ/MOVN followed by MOVK instructions).
+ pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
+ if let Some(imm) = MoveWideConst::maybe_from_u64(value) {
+ // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVZ
+ smallvec![Inst::MovZ {
+ rd,
+ imm,
+ size: OperandSize::Size64
+ }]
+ } else if let Some(imm) = MoveWideConst::maybe_from_u64(!value) {
+ // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVN
+ smallvec![Inst::MovN {
+ rd,
+ imm,
+ size: OperandSize::Size64
+ }]
+ } else if let Some(imml) = ImmLogic::maybe_from_u64(value, I64) {
+            // Weird logical-instruction immediate in ORR using the zero register
+ smallvec![Inst::AluRRImmLogic {
+ alu_op: ALUOp::Orr64,
+ rd,
+ rn: zero_reg(),
+ imml,
+ }]
+ } else {
+ let mut insts = smallvec![];
+
+ // If the top 32 bits are zero, use 32-bit `mov` operations.
+ let (num_half_words, size, negated) = if value >> 32 == 0 {
+ (2, OperandSize::Size32, (!value << 32) >> 32)
+ } else {
+ (4, OperandSize::Size64, !value)
+ };
+ // If the number of 0xffff half words is greater than the number of 0x0000 half words
+ // it is more efficient to use `movn` for the first instruction.
+ let first_is_inverted = count_zero_half_words(negated, num_half_words)
+ > count_zero_half_words(value, num_half_words);
+ // Either 0xffff or 0x0000 half words can be skipped, depending on the first
+ // instruction used.
+ let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
+ let mut first_mov_emitted = false;
+
+ for i in 0..num_half_words {
+ let imm16 = (value >> (16 * i)) & 0xffff;
+ if imm16 != ignored_halfword {
+ if !first_mov_emitted {
+ first_mov_emitted = true;
+ if first_is_inverted {
+ let imm =
+ MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
+ .unwrap();
+ insts.push(Inst::MovN { rd, imm, size });
+ } else {
+ let imm =
+ MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
+ insts.push(Inst::MovZ { rd, imm, size });
+ }
+ } else {
+ let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
+ insts.push(Inst::MovK { rd, imm, size });
+ }
+ }
+ }
+
+ assert!(first_mov_emitted);
+
+ insts
+ }
+ }
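+
+    // As a worked sketch of the general path above: 0xffff_ffff_0000_1234 is not encodable as
+    // a single MOVZ/MOVN or as a logical immediate, and since two of its halfwords are 0xffff
+    // the inverted form wins, so `load_constant` emits a `MovN` with immediate 0xedcb followed
+    // by a `MovK` that writes 0 into the second halfword.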
+
+ /// Create instructions that load a 32-bit floating-point constant.
+ pub fn load_fp_constant32<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ rd: Writable<Reg>,
+ value: u32,
+ mut alloc_tmp: F,
+ ) -> SmallVec<[Inst; 4]> {
+ if value == 0 {
+ smallvec![Inst::VecDupImm {
+ rd,
+ imm: ASIMDMovModImm::zero(),
+ invert: false,
+ size: VectorSize::Size8x8
+ }]
+ } else {
+ // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
+ // bits.
+ let tmp = alloc_tmp(RegClass::I64, I32);
+ let mut insts = Inst::load_constant(tmp, value as u64);
+
+ insts.push(Inst::MovToFpu {
+ rd,
+ rn: tmp.to_reg(),
+ size: ScalarSize::Size64,
+ });
+
+ insts
+ }
+ }
+
+ /// Create instructions that load a 64-bit floating-point constant.
+ pub fn load_fp_constant64<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ rd: Writable<Reg>,
+ const_data: u64,
+ mut alloc_tmp: F,
+ ) -> SmallVec<[Inst; 4]> {
+ if let Ok(const_data) = u32::try_from(const_data) {
+ Inst::load_fp_constant32(rd, const_data, alloc_tmp)
+ // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
+ // bits. Also, treat it as half of a 128-bit vector and consider replicated
+ // patterns. Scalar MOVI might also be an option.
+ } else if const_data & (u32::MAX as u64) == 0 {
+ let tmp = alloc_tmp(RegClass::I64, I64);
+ let mut insts = Inst::load_constant(tmp, const_data);
+
+ insts.push(Inst::MovToFpu {
+ rd,
+ rn: tmp.to_reg(),
+ size: ScalarSize::Size64,
+ });
+
+ insts
+ } else {
+ smallvec![Inst::LoadFpuConst64 { rd, const_data }]
+ }
+ }
+
+ /// Create instructions that load a 128-bit vector constant.
+ pub fn load_fp_constant128<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ rd: Writable<Reg>,
+ const_data: u128,
+ alloc_tmp: F,
+ ) -> SmallVec<[Inst; 5]> {
+ if let Ok(const_data) = u64::try_from(const_data) {
+ SmallVec::from(&Inst::load_fp_constant64(rd, const_data, alloc_tmp)[..])
+ } else if let Some((pattern, size)) =
+ Inst::get_replicated_vector_pattern(const_data, ScalarSize::Size64)
+ {
+ Inst::load_replicated_vector_pattern(
+ rd,
+ pattern,
+ VectorSize::from_lane_size(size, true),
+ alloc_tmp,
+ )
+ } else {
+ smallvec![Inst::LoadFpuConst128 { rd, const_data }]
+ }
+ }
+
+ /// Determine whether a 128-bit constant represents a vector consisting of elements with
+ /// the same value.
+ pub fn get_replicated_vector_pattern(
+ value: u128,
+ size: ScalarSize,
+ ) -> Option<(u64, ScalarSize)> {
+ let (mask, shift, next_size) = match size {
+ ScalarSize::Size8 => (u8::MAX as u128, 8, ScalarSize::Size128),
+ ScalarSize::Size16 => (u16::MAX as u128, 16, ScalarSize::Size8),
+ ScalarSize::Size32 => (u32::MAX as u128, 32, ScalarSize::Size16),
+ ScalarSize::Size64 => (u64::MAX as u128, 64, ScalarSize::Size32),
+ _ => return None,
+ };
+ let mut r = None;
+ let v = value & mask;
+
+ if (value >> shift) & mask == v {
+ r = Inst::get_replicated_vector_pattern(v, next_size);
+
+ if r.is_none() {
+ r = Some((v as u64, size));
+ }
+ }
+
+ r
+ }
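+
+    // For example (an illustrative trace of the recursion above): a 128-bit constant made of
+    // sixteen 0x42 bytes, starting at `ScalarSize::Size64`, narrows all the way down and
+    // returns `Some((0x42, ScalarSize::Size8))`, the smallest element size that replicates it.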
+
+ /// Create instructions that load a 128-bit vector constant consisting of elements with
+ /// the same value.
+ pub fn load_replicated_vector_pattern<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ rd: Writable<Reg>,
+ pattern: u64,
+ size: VectorSize,
+ mut alloc_tmp: F,
+ ) -> SmallVec<[Inst; 5]> {
+ let lane_size = size.lane_size();
+
+ if let Some(imm) = ASIMDMovModImm::maybe_from_u64(pattern, lane_size) {
+ smallvec![Inst::VecDupImm {
+ rd,
+ imm,
+ invert: false,
+ size
+ }]
+ } else if let Some(imm) = ASIMDMovModImm::maybe_from_u64(!pattern, lane_size) {
+ debug_assert_ne!(lane_size, ScalarSize::Size8);
+ debug_assert_ne!(lane_size, ScalarSize::Size64);
+
+ smallvec![Inst::VecDupImm {
+ rd,
+ imm,
+ invert: true,
+ size
+ }]
+ } else {
+ let tmp = alloc_tmp(RegClass::I64, I64);
+ let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
+
+ insts.push(Inst::VecDup {
+ rd,
+ rn: tmp.to_reg(),
+ size,
+ });
+
+ insts
+ }
+ }
+
+ /// Generic constructor for a load (zero-extending where appropriate).
+ pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst {
+ match ty {
+ B1 | B8 | I8 => Inst::ULoad8 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ B16 | I16 => Inst::ULoad16 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ B32 | I32 | R32 => Inst::ULoad32 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ B64 | I64 | R64 => Inst::ULoad64 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ F32 => Inst::FpuLoad32 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ F64 => Inst::FpuLoad64 {
+ rd: into_reg,
+ mem,
+ flags,
+ },
+ _ => {
+ if ty.is_vector() {
+ let bits = ty_bits(ty);
+ let rd = into_reg;
+
+ if bits == 128 {
+ Inst::FpuLoad128 { rd, mem, flags }
+ } else {
+ assert_eq!(bits, 64);
+ Inst::FpuLoad64 { rd, mem, flags }
+ }
+ } else {
+ unimplemented!("gen_load({})", ty);
+ }
+ }
+ }
+ }
+
+ /// Generic constructor for a store.
+ pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst {
+ match ty {
+ B1 | B8 | I8 => Inst::Store8 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ B16 | I16 => Inst::Store16 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ B32 | I32 | R32 => Inst::Store32 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ B64 | I64 | R64 => Inst::Store64 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ F32 => Inst::FpuStore32 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ F64 => Inst::FpuStore64 {
+ rd: from_reg,
+ mem,
+ flags,
+ },
+ _ => {
+ if ty.is_vector() {
+ let bits = ty_bits(ty);
+ let rd = from_reg;
+
+ if bits == 128 {
+ Inst::FpuStore128 { rd, mem, flags }
+ } else {
+ assert_eq!(bits, 64);
+ Inst::FpuStore64 { rd, mem, flags }
+ }
+ } else {
+ unimplemented!("gen_store({})", ty);
+ }
+ }
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: get_regs
+
+fn memarg_regs(memarg: &AMode, collector: &mut RegUsageCollector) {
+ match memarg {
+ &AMode::Unscaled(reg, ..) | &AMode::UnsignedOffset(reg, ..) => {
+ collector.add_use(reg);
+ }
+ &AMode::RegReg(r1, r2, ..)
+ | &AMode::RegScaled(r1, r2, ..)
+ | &AMode::RegScaledExtended(r1, r2, ..)
+ | &AMode::RegExtended(r1, r2, ..) => {
+ collector.add_use(r1);
+ collector.add_use(r2);
+ }
+ &AMode::Label(..) => {}
+ &AMode::PreIndexed(reg, ..) | &AMode::PostIndexed(reg, ..) => {
+ collector.add_mod(reg);
+ }
+ &AMode::FPOffset(..) => {
+ collector.add_use(fp_reg());
+ }
+ &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => {
+ collector.add_use(stack_reg());
+ }
+ &AMode::RegOffset(r, ..) => {
+ collector.add_use(r);
+ }
+ }
+}
+
+fn pairmemarg_regs(pairmemarg: &PairAMode, collector: &mut RegUsageCollector) {
+ match pairmemarg {
+ &PairAMode::SignedOffset(reg, ..) => {
+ collector.add_use(reg);
+ }
+ &PairAMode::PreIndexed(reg, ..) | &PairAMode::PostIndexed(reg, ..) => {
+ collector.add_mod(reg);
+ }
+ }
+}
+
+fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
+ match inst {
+ &Inst::AluRRR { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRRR { rd, rn, rm, ra, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ collector.add_use(ra);
+ }
+ &Inst::AluRRImm12 { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::AluRRImmLogic { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::AluRRImmShift { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::AluRRRShift { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRRExtend { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::BitRR { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::ULoad8 { rd, ref mem, .. }
+ | &Inst::SLoad8 { rd, ref mem, .. }
+ | &Inst::ULoad16 { rd, ref mem, .. }
+ | &Inst::SLoad16 { rd, ref mem, .. }
+ | &Inst::ULoad32 { rd, ref mem, .. }
+ | &Inst::SLoad32 { rd, ref mem, .. }
+ | &Inst::ULoad64 { rd, ref mem, .. } => {
+ collector.add_def(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::Store8 { rd, ref mem, .. }
+ | &Inst::Store16 { rd, ref mem, .. }
+ | &Inst::Store32 { rd, ref mem, .. }
+ | &Inst::Store64 { rd, ref mem, .. } => {
+ collector.add_use(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::StoreP64 {
+ rt, rt2, ref mem, ..
+ } => {
+ collector.add_use(rt);
+ collector.add_use(rt2);
+ pairmemarg_regs(mem, collector);
+ }
+ &Inst::LoadP64 {
+ rt, rt2, ref mem, ..
+ } => {
+ collector.add_def(rt);
+ collector.add_def(rt2);
+ pairmemarg_regs(mem, collector);
+ }
+ &Inst::Mov64 { rd, rm } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::Mov32 { rd, rm } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::MovZ { rd, .. } | &Inst::MovN { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::MovK { rd, .. } => {
+ collector.add_mod(rd);
+ }
+ &Inst::CSel { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::CSet { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::CCmpImm { rn, .. } => {
+ collector.add_use(rn);
+ }
+ &Inst::AtomicRMW { .. } => {
+ collector.add_use(xreg(25));
+ collector.add_use(xreg(26));
+ collector.add_def(writable_xreg(24));
+ collector.add_def(writable_xreg(27));
+ collector.add_def(writable_xreg(28));
+ }
+ &Inst::AtomicCAS { .. } => {
+ collector.add_use(xreg(25));
+ collector.add_use(xreg(26));
+ collector.add_use(xreg(28));
+ collector.add_def(writable_xreg(24));
+ collector.add_def(writable_xreg(27));
+ }
+ &Inst::AtomicLoad { r_data, r_addr, .. } => {
+ collector.add_use(r_addr);
+ collector.add_def(r_data);
+ }
+ &Inst::AtomicStore { r_data, r_addr, .. } => {
+ collector.add_use(r_addr);
+ collector.add_use(r_data);
+ }
+ &Inst::Fence {} => {}
+ &Inst::FpuMove64 { rd, rn } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::FpuMove128 { rd, rn } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::FpuMoveFromVec { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::FpuRR { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::FpuRRR { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::FpuRRI { fpu_op, rd, rn, .. } => {
+ match fpu_op {
+ FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd),
+ FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd),
+ }
+ collector.add_use(rn);
+ }
+ &Inst::FpuRRRR { rd, rn, rm, ra, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ collector.add_use(ra);
+ }
+ &Inst::VecMisc { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+
+ &Inst::VecLanes { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecShiftImm { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecExtract { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::VecTbl {
+ rd,
+ rn,
+ rm,
+ is_extension,
+ } => {
+ collector.add_use(rn);
+ collector.add_use(rm);
+
+ if is_extension {
+ collector.add_mod(rd);
+ } else {
+ collector.add_def(rd);
+ }
+ }
+ &Inst::VecTbl2 {
+ rd,
+ rn,
+ rn2,
+ rm,
+ is_extension,
+ } => {
+ collector.add_use(rn);
+ collector.add_use(rn2);
+ collector.add_use(rm);
+
+ if is_extension {
+ collector.add_mod(rd);
+ } else {
+ collector.add_def(rd);
+ }
+ }
+ &Inst::VecLoadReplicate { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecCSel { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::FpuLoad32 { rd, ref mem, .. } => {
+ collector.add_def(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::FpuLoad64 { rd, ref mem, .. } => {
+ collector.add_def(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::FpuLoad128 { rd, ref mem, .. } => {
+ collector.add_def(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::FpuStore32 { rd, ref mem, .. } => {
+ collector.add_use(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::FpuStore64 { rd, ref mem, .. } => {
+ collector.add_use(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::FpuStore128 { rd, ref mem, .. } => {
+ collector.add_use(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::FpuToInt { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::IntToFpu { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::FpuCSel32 { rd, rn, rm, .. } | &Inst::FpuCSel64 { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::FpuRound { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::MovToFpu { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::MovToVec { rd, rn, .. } => {
+ collector.add_mod(rd);
+ collector.add_use(rn);
+ }
+ &Inst::MovFromVec { rd, rn, .. } | &Inst::MovFromVecSigned { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecDup { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecDupFromFpu { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecDupImm { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::VecExtend { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecMovElement { rd, rn, .. } => {
+ collector.add_mod(rd);
+ collector.add_use(rn);
+ }
+ &Inst::VecMiscNarrow {
+ rd, rn, high_half, ..
+ } => {
+ collector.add_use(rn);
+
+ if high_half {
+ collector.add_mod(rd);
+ } else {
+ collector.add_def(rd);
+ }
+ }
+ &Inst::VecRRR {
+ alu_op, rd, rn, rm, ..
+ } => {
+ if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+ collector.add_mod(rd);
+ } else {
+ collector.add_def(rd);
+ }
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::MovToNZCV { rn } => {
+ collector.add_use(rn);
+ }
+ &Inst::MovFromNZCV { rd } => {
+ collector.add_def(rd);
+ }
+ &Inst::Extend { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::Jump { .. } | &Inst::Ret | &Inst::EpiloguePlaceholder => {}
+ &Inst::Call { ref info, .. } => {
+ collector.add_uses(&*info.uses);
+ collector.add_defs(&*info.defs);
+ }
+ &Inst::CallInd { ref info, .. } => {
+ collector.add_uses(&*info.uses);
+ collector.add_defs(&*info.defs);
+ collector.add_use(info.rn);
+ }
+ &Inst::CondBr { ref kind, .. } => match kind {
+ CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => {
+ collector.add_use(*rt);
+ }
+ CondBrKind::Cond(_) => {}
+ },
+ &Inst::IndirectBr { rn, .. } => {
+ collector.add_use(rn);
+ }
+ &Inst::Nop0 | Inst::Nop4 => {}
+ &Inst::Brk => {}
+ &Inst::Udf { .. } => {}
+ &Inst::TrapIf { ref kind, .. } => match kind {
+ CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => {
+ collector.add_use(*rt);
+ }
+ CondBrKind::Cond(_) => {}
+ },
+ &Inst::Adr { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::Word4 { .. } | &Inst::Word8 { .. } => {}
+ &Inst::JTSequence {
+ ridx, rtmp1, rtmp2, ..
+ } => {
+ collector.add_use(ridx);
+ collector.add_def(rtmp1);
+ collector.add_def(rtmp2);
+ }
+ &Inst::LoadExtName { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::LoadAddr { rd, ref mem } => {
+ collector.add_def(rd);
+ memarg_regs(mem, collector);
+ }
+ &Inst::VirtualSPOffsetAdj { .. } => {}
+ &Inst::EmitIsland { .. } => {}
+ }
+}
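+
+// Illustrative note (an editorial sketch, not upstream logic): the collector
+// distinguishes defs (fully written), uses (read), and mods (read-modify-write).
+// For example, in the match above `MovToVec` registers `rd` with `add_mod`
+// because inserting one lane preserves the destination vector's other lanes,
+// whereas the store forms pass `rd` to `add_use` since their "destination"
+// operand is only read.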
+
+//=============================================================================
+// Instructions: map_regs
+
+fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
+ fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
+ if r.is_virtual() {
+ let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg();
+ *r = new;
+ }
+ }
+
+ fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if r.to_reg().is_virtual() {
+ let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+ }
+
+ fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if r.to_reg().is_virtual() {
+ let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+ }
+
+ fn map_mem<RUM: RegUsageMapper>(m: &RUM, mem: &mut AMode) {
+ // N.B.: we take only the pre-map here, but this is OK because the
+ // only addressing modes that update registers (pre/post-increment on
+ // AArch64) both read and write registers, so they are "mods" rather
+ // than "defs", so must be the same in both the pre- and post-map.
+ match mem {
+ &mut AMode::Unscaled(ref mut reg, ..) => map_use(m, reg),
+ &mut AMode::UnsignedOffset(ref mut reg, ..) => map_use(m, reg),
+ &mut AMode::RegReg(ref mut r1, ref mut r2)
+ | &mut AMode::RegScaled(ref mut r1, ref mut r2, ..)
+ | &mut AMode::RegScaledExtended(ref mut r1, ref mut r2, ..)
+ | &mut AMode::RegExtended(ref mut r1, ref mut r2, ..) => {
+ map_use(m, r1);
+ map_use(m, r2);
+ }
+ &mut AMode::Label(..) => {}
+ &mut AMode::PreIndexed(ref mut r, ..) => map_mod(m, r),
+ &mut AMode::PostIndexed(ref mut r, ..) => map_mod(m, r),
+ &mut AMode::FPOffset(..)
+ | &mut AMode::SPOffset(..)
+ | &mut AMode::NominalSPOffset(..) => {}
+ &mut AMode::RegOffset(ref mut r, ..) => map_use(m, r),
+ };
+ }
+
+ fn map_pairmem<RUM: RegUsageMapper>(m: &RUM, mem: &mut PairAMode) {
+ match mem {
+ &mut PairAMode::SignedOffset(ref mut reg, ..) => map_use(m, reg),
+ &mut PairAMode::PreIndexed(ref mut reg, ..) => map_def(m, reg),
+ &mut PairAMode::PostIndexed(ref mut reg, ..) => map_def(m, reg),
+ }
+ }
+
+ fn map_br<RUM: RegUsageMapper>(m: &RUM, br: &mut CondBrKind) {
+ match br {
+ &mut CondBrKind::Zero(ref mut reg) => map_use(m, reg),
+ &mut CondBrKind::NotZero(ref mut reg) => map_use(m, reg),
+ &mut CondBrKind::Cond(..) => {}
+ };
+ }
+
+ match inst {
+ &mut Inst::AluRRR {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRRR {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ref mut ra,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ map_use(mapper, ra);
+ }
+ &mut Inst::AluRRImm12 {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::AluRRImmLogic {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::AluRRImmShift {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::AluRRRShift {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRRExtend {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::BitRR {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::ULoad8 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::SLoad8 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::ULoad16 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::SLoad16 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::ULoad32 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::SLoad32 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+
+ &mut Inst::ULoad64 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Store8 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Store16 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Store32 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Store64 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+
+ &mut Inst::StoreP64 {
+ ref mut rt,
+ ref mut rt2,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rt);
+ map_use(mapper, rt2);
+ map_pairmem(mapper, mem);
+ }
+ &mut Inst::LoadP64 {
+ ref mut rt,
+ ref mut rt2,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rt);
+ map_def(mapper, rt2);
+ map_pairmem(mapper, mem);
+ }
+ &mut Inst::Mov64 {
+ ref mut rd,
+ ref mut rm,
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::Mov32 {
+ ref mut rd,
+ ref mut rm,
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::MovZ { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::MovN { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::MovK { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::CSel {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::CSet { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::CCmpImm { ref mut rn, .. } => {
+ map_use(mapper, rn);
+ }
+ &mut Inst::AtomicRMW { .. } => {
+ // There are no vregs to map in this insn.
+ }
+ &mut Inst::AtomicCAS { .. } => {
+ // There are no vregs to map in this insn.
+ }
+ &mut Inst::AtomicLoad {
+ ref mut r_data,
+ ref mut r_addr,
+ ..
+ } => {
+ map_def(mapper, r_data);
+ map_use(mapper, r_addr);
+ }
+ &mut Inst::AtomicStore {
+ ref mut r_data,
+ ref mut r_addr,
+ ..
+ } => {
+ map_use(mapper, r_data);
+ map_use(mapper, r_addr);
+ }
+ &mut Inst::Fence {} => {}
+ &mut Inst::FpuMove64 {
+ ref mut rd,
+ ref mut rn,
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuMove128 {
+ ref mut rd,
+ ref mut rn,
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuMoveFromVec {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuRR {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuRRR {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuRRI {
+ fpu_op,
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ match fpu_op {
+ FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => map_def(mapper, rd),
+ FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => map_mod(mapper, rd),
+ }
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuRRRR {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ref mut ra,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ map_use(mapper, ra);
+ }
+ &mut Inst::VecMisc {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecLanes {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecShiftImm {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecExtract {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::VecTbl {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ is_extension,
+ } => {
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+
+ if is_extension {
+ map_mod(mapper, rd);
+ } else {
+ map_def(mapper, rd);
+ }
+ }
+ &mut Inst::VecTbl2 {
+ ref mut rd,
+ ref mut rn,
+ ref mut rn2,
+ ref mut rm,
+ is_extension,
+ } => {
+ map_use(mapper, rn);
+ map_use(mapper, rn2);
+ map_use(mapper, rm);
+
+ if is_extension {
+ map_mod(mapper, rd);
+ } else {
+ map_def(mapper, rd);
+ }
+ }
+ &mut Inst::VecLoadReplicate {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecCSel {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuCmp32 {
+ ref mut rn,
+ ref mut rm,
+ } => {
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuCmp64 {
+ ref mut rn,
+ ref mut rm,
+ } => {
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuLoad32 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::FpuLoad64 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::FpuLoad128 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::FpuStore32 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::FpuStore64 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::FpuStore128 {
+ ref mut rd,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::LoadFpuConst128 { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::FpuToInt {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::IntToFpu {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::FpuCSel32 {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuCSel64 {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::FpuRound {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::MovToFpu {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::MovToVec {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_mod(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::MovFromVec {
+ ref mut rd,
+ ref mut rn,
+ ..
+ }
+ | &mut Inst::MovFromVecSigned {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecDup {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecDupFromFpu {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecDupImm { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::VecExtend {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecMovElement {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_mod(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::VecMiscNarrow {
+ ref mut rd,
+ ref mut rn,
+ high_half,
+ ..
+ } => {
+ map_use(mapper, rn);
+
+ if high_half {
+ map_mod(mapper, rd);
+ } else {
+ map_def(mapper, rd);
+ }
+ }
+ &mut Inst::VecRRR {
+ alu_op,
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+ map_mod(mapper, rd);
+ } else {
+ map_def(mapper, rd);
+ }
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::MovToNZCV { ref mut rn } => {
+ map_use(mapper, rn);
+ }
+ &mut Inst::MovFromNZCV { ref mut rd } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::Extend {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::Jump { .. } => {}
+ &mut Inst::Call { ref mut info } => {
+ for r in info.uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in info.defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ }
+ &mut Inst::Ret | &mut Inst::EpiloguePlaceholder => {}
+ &mut Inst::CallInd { ref mut info, .. } => {
+ for r in info.uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in info.defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ map_use(mapper, &mut info.rn);
+ }
+ &mut Inst::CondBr { ref mut kind, .. } => {
+ map_br(mapper, kind);
+ }
+ &mut Inst::IndirectBr { ref mut rn, .. } => {
+ map_use(mapper, rn);
+ }
+ &mut Inst::Nop0 | &mut Inst::Nop4 | &mut Inst::Brk | &mut Inst::Udf { .. } => {}
+ &mut Inst::TrapIf { ref mut kind, .. } => {
+ map_br(mapper, kind);
+ }
+ &mut Inst::Adr { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::Word4 { .. } | &mut Inst::Word8 { .. } => {}
+ &mut Inst::JTSequence {
+ ref mut ridx,
+ ref mut rtmp1,
+ ref mut rtmp2,
+ ..
+ } => {
+ map_use(mapper, ridx);
+ map_def(mapper, rtmp1);
+ map_def(mapper, rtmp2);
+ }
+ &mut Inst::LoadExtName { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::LoadAddr {
+ ref mut rd,
+ ref mut mem,
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::VirtualSPOffsetAdj { .. } => {}
+ &mut Inst::EmitIsland { .. } => {}
+ }
+}
+
+//=============================================================================
+// Instructions: misc functions and external interface
+
+impl MachInst for Inst {
+ type LabelUse = LabelUse;
+
+ fn get_regs(&self, collector: &mut RegUsageCollector) {
+ aarch64_get_regs(self, collector)
+ }
+
+ fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ aarch64_map_regs(self, mapper);
+ }
+
+ fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
+ match self {
+ &Inst::Mov64 { rd, rm } => Some((rd, rm)),
+ &Inst::FpuMove64 { rd, rn } => Some((rd, rn)),
+ &Inst::FpuMove128 { rd, rn } => Some((rd, rn)),
+ _ => None,
+ }
+ }
+
+ fn is_epilogue_placeholder(&self) -> bool {
+ if let Inst::EpiloguePlaceholder = self {
+ true
+ } else {
+ false
+ }
+ }
+
+ fn is_included_in_clobbers(&self) -> bool {
+ // We exclude call instructions from the clobber-set when they are calls
+ // from caller to callee with the same ABI. Such calls cannot possibly
+ // force any new registers to be saved in the prologue, because anything
+ // that the callee clobbers, the caller is also allowed to clobber. This
+ // both saves work and enables us to more precisely follow the
+ // half-caller-save, half-callee-save SysV ABI for some vector
+ // registers.
+ //
+ // See the note in [crate::isa::aarch64::abi::is_caller_save_reg] for
+ // more information on this ABI-implementation hack.
+ match self {
+ &Inst::Call { ref info } => info.caller_callconv != info.callee_callconv,
+ &Inst::CallInd { ref info } => info.caller_callconv != info.callee_callconv,
+ _ => true,
+ }
+ }
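+
+ // Illustrative example (not from upstream): a call whose caller and callee
+ // both use, say, `CallConv::SystemV` makes the comparison above false and is
+ // left out of the clobber set, while a cross-ABI call keeps the conservative
+ // default and reports its clobbers.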
+
+ fn is_term<'a>(&'a self) -> MachTerminator<'a> {
+ match self {
+ &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret,
+ &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
+ &Inst::CondBr {
+ taken, not_taken, ..
+ } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
+ &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]),
+ &Inst::JTSequence { ref info, .. } => {
+ MachTerminator::Indirect(&info.targets_for_term[..])
+ }
+ _ => MachTerminator::None,
+ }
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
+ assert!(ty.bits() <= 128);
+ Inst::mov(to_reg, from_reg)
+ }
+
+ fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ to_reg: Writable<Reg>,
+ value: u64,
+ ty: Type,
+ alloc_tmp: F,
+ ) -> SmallVec<[Inst; 4]> {
+ if ty == F64 {
+ Inst::load_fp_constant64(to_reg, value, alloc_tmp)
+ } else if ty == F32 {
+ Inst::load_fp_constant32(to_reg, value as u32, alloc_tmp)
+ } else {
+ // Must be an integer type.
+ debug_assert!(
+ ty == B1
+ || ty == I8
+ || ty == B8
+ || ty == I16
+ || ty == B16
+ || ty == I32
+ || ty == B32
+ || ty == I64
+ || ty == B64
+ || ty == R32
+ || ty == R64
+ );
+ Inst::load_constant(to_reg, value)
+ }
+ }
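+
+ // Usage sketch (illustrative; `rd` and `alloc_tmp` come from the caller's
+ // lowering context): `Inst::gen_constant(rd, 0x3ff0_0000_0000_0000, F64,
+ // alloc_tmp)` hits the F64 arm above and forwards the raw bit pattern of 1.0
+ // to `load_fp_constant64`.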
+
+ fn gen_zero_len_nop() -> Inst {
+ Inst::Nop0
+ }
+
+ fn gen_nop(preferred_size: usize) -> Inst {
+ // We can't give a NOP (or any insn) < 4 bytes.
+ assert!(preferred_size >= 4);
+ Inst::Nop4
+ }
+
+ fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
+ None
+ }
+
+ fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
+ match ty {
+ I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64),
+ F32 | F64 => Ok(RegClass::V128),
+ IFLAGS | FFLAGS => Ok(RegClass::I64),
+ B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => {
+ Ok(RegClass::V128)
+ }
+ _ => Err(CodegenError::Unsupported(format!(
+ "Unexpected SSA-value type: {}",
+ ty
+ ))),
+ }
+ }
+
+ fn gen_jump(target: MachLabel) -> Inst {
+ Inst::Jump {
+ dest: BranchTarget::Label(target),
+ }
+ }
+
+ fn reg_universe(flags: &settings::Flags) -> RealRegUniverse {
+ create_reg_universe(flags)
+ }
+
+ fn worst_case_size() -> CodeOffset {
+ // The maximum size, in bytes, of any `Inst`'s emitted code. We have at least one case of
+ // an 8-instruction sequence (saturating int-to-float conversions) with three embedded
+ // 64-bit f64 constants.
+ //
+ // Note that inline jump-tables handle island/pool insertion separately, so we do not need
+ // to account for them here (otherwise the worst case would be 2^31 * 4, clearly not
+ // feasible for other reasons).
+ 44
+ }
+
+ fn ref_type_regclass(_: &settings::Flags) -> RegClass {
+ RegClass::I64
+ }
+}
+
+//=============================================================================
+// Pretty-printing of instructions.
+
+fn mem_finalize_for_show(
+ mem: &AMode,
+ mb_rru: Option<&RealRegUniverse>,
+ state: &EmitState,
+) -> (String, AMode) {
+ let (mem_insts, mem) = mem_finalize(0, mem, state);
+ let mut mem_str = mem_insts
+ .into_iter()
+ .map(|inst| inst.show_rru(mb_rru))
+ .collect::<Vec<_>>()
+ .join(" ; ");
+ if !mem_str.is_empty() {
+ mem_str += " ; ";
+ }
+
+ (mem_str, mem)
+}
+
+impl PrettyPrint for Inst {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.pretty_print(mb_rru, &mut EmitState::default())
+ }
+}
+
+impl Inst {
+ fn print_with_state(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String {
+ fn op_name_size(alu_op: ALUOp) -> (&'static str, OperandSize) {
+ match alu_op {
+ ALUOp::Add32 => ("add", OperandSize::Size32),
+ ALUOp::Add64 => ("add", OperandSize::Size64),
+ ALUOp::Sub32 => ("sub", OperandSize::Size32),
+ ALUOp::Sub64 => ("sub", OperandSize::Size64),
+ ALUOp::Orr32 => ("orr", OperandSize::Size32),
+ ALUOp::Orr64 => ("orr", OperandSize::Size64),
+ ALUOp::And32 => ("and", OperandSize::Size32),
+ ALUOp::And64 => ("and", OperandSize::Size64),
+ ALUOp::Eor32 => ("eor", OperandSize::Size32),
+ ALUOp::Eor64 => ("eor", OperandSize::Size64),
+ ALUOp::AddS32 => ("adds", OperandSize::Size32),
+ ALUOp::AddS64 => ("adds", OperandSize::Size64),
+ ALUOp::SubS32 => ("subs", OperandSize::Size32),
+ ALUOp::SubS64 => ("subs", OperandSize::Size64),
+ ALUOp::SMulH => ("smulh", OperandSize::Size64),
+ ALUOp::UMulH => ("umulh", OperandSize::Size64),
+ ALUOp::SDiv64 => ("sdiv", OperandSize::Size64),
+ ALUOp::UDiv64 => ("udiv", OperandSize::Size64),
+ ALUOp::AndNot32 => ("bic", OperandSize::Size32),
+ ALUOp::AndNot64 => ("bic", OperandSize::Size64),
+ ALUOp::OrrNot32 => ("orn", OperandSize::Size32),
+ ALUOp::OrrNot64 => ("orn", OperandSize::Size64),
+ ALUOp::EorNot32 => ("eon", OperandSize::Size32),
+ ALUOp::EorNot64 => ("eon", OperandSize::Size64),
+ ALUOp::RotR32 => ("ror", OperandSize::Size32),
+ ALUOp::RotR64 => ("ror", OperandSize::Size64),
+ ALUOp::Lsr32 => ("lsr", OperandSize::Size32),
+ ALUOp::Lsr64 => ("lsr", OperandSize::Size64),
+ ALUOp::Asr32 => ("asr", OperandSize::Size32),
+ ALUOp::Asr64 => ("asr", OperandSize::Size64),
+ ALUOp::Lsl32 => ("lsl", OperandSize::Size32),
+ ALUOp::Lsl64 => ("lsl", OperandSize::Size64),
+ }
+ }
+
+ match self {
+ &Inst::Nop0 => "nop-zero-len".to_string(),
+ &Inst::Nop4 => "nop".to_string(),
+ &Inst::AluRRR { alu_op, rd, rn, rm } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let rm = show_ireg_sized(rm, mb_rru, size);
+ format!("{} {}, {}, {}", op, rd, rn, rm)
+ }
+ &Inst::AluRRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ra,
+ } => {
+ let (op, size) = match alu_op {
+ ALUOp3::MAdd32 => ("madd", OperandSize::Size32),
+ ALUOp3::MAdd64 => ("madd", OperandSize::Size64),
+ ALUOp3::MSub32 => ("msub", OperandSize::Size32),
+ ALUOp3::MSub64 => ("msub", OperandSize::Size64),
+ };
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let rm = show_ireg_sized(rm, mb_rru, size);
+ let ra = show_ireg_sized(ra, mb_rru, size);
+
+ format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra)
+ }
+ &Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn,
+ ref imm12,
+ } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+
+ if imm12.bits == 0 && alu_op == ALUOp::Add64 {
+ // special-case MOV (used for moving into SP).
+ format!("mov {}, {}", rd, rn)
+ } else {
+ let imm12 = imm12.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, imm12)
+ }
+ }
+ &Inst::AluRRImmLogic {
+ alu_op,
+ rd,
+ rn,
+ ref imml,
+ } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let imml = imml.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, imml)
+ }
+ &Inst::AluRRImmShift {
+ alu_op,
+ rd,
+ rn,
+ ref immshift,
+ } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let immshift = immshift.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, immshift)
+ }
+ &Inst::AluRRRShift {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ref shiftop,
+ } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let rm = show_ireg_sized(rm, mb_rru, size);
+ let shiftop = shiftop.show_rru(mb_rru);
+ format!("{} {}, {}, {}, {}", op, rd, rn, rm, shiftop)
+ }
+ &Inst::AluRRRExtend {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ref extendop,
+ } => {
+ let (op, size) = op_name_size(alu_op);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let rm = show_ireg_sized(rm, mb_rru, size);
+ let extendop = extendop.show_rru(mb_rru);
+ format!("{} {}, {}, {}, {}", op, rd, rn, rm, extendop)
+ }
+ &Inst::BitRR { op, rd, rn } => {
+ let size = op.operand_size();
+ let op = op.op_str();
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::ULoad8 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::SLoad8 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::ULoad16 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::SLoad16 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::ULoad32 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::SLoad32 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::ULoad64 {
+ rd,
+ ref mem,
+ ..
+ } => {
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+
+ let is_unscaled = match &mem {
+ &AMode::Unscaled(..) => true,
+ _ => false,
+ };
+ let (op, size) = match (self, is_unscaled) {
+ (&Inst::ULoad8 { .. }, false) => ("ldrb", OperandSize::Size32),
+ (&Inst::ULoad8 { .. }, true) => ("ldurb", OperandSize::Size32),
+ (&Inst::SLoad8 { .. }, false) => ("ldrsb", OperandSize::Size64),
+ (&Inst::SLoad8 { .. }, true) => ("ldursb", OperandSize::Size64),
+ (&Inst::ULoad16 { .. }, false) => ("ldrh", OperandSize::Size32),
+ (&Inst::ULoad16 { .. }, true) => ("ldurh", OperandSize::Size32),
+ (&Inst::SLoad16 { .. }, false) => ("ldrsh", OperandSize::Size64),
+ (&Inst::SLoad16 { .. }, true) => ("ldursh", OperandSize::Size64),
+ (&Inst::ULoad32 { .. }, false) => ("ldr", OperandSize::Size32),
+ (&Inst::ULoad32 { .. }, true) => ("ldur", OperandSize::Size32),
+ (&Inst::SLoad32 { .. }, false) => ("ldrsw", OperandSize::Size64),
+ (&Inst::SLoad32 { .. }, true) => ("ldursw", OperandSize::Size64),
+ (&Inst::ULoad64 { .. }, false) => ("ldr", OperandSize::Size64),
+ (&Inst::ULoad64 { .. }, true) => ("ldur", OperandSize::Size64),
+ _ => unreachable!(),
+ };
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}{} {}, {}", mem_str, op, rd, mem)
+ }
+ &Inst::Store8 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::Store16 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::Store32 {
+ rd,
+ ref mem,
+ ..
+ }
+ | &Inst::Store64 {
+ rd,
+ ref mem,
+ ..
+ } => {
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+
+ let is_unscaled = match &mem {
+ &AMode::Unscaled(..) => true,
+ _ => false,
+ };
+ let (op, size) = match (self, is_unscaled) {
+ (&Inst::Store8 { .. }, false) => ("strb", OperandSize::Size32),
+ (&Inst::Store8 { .. }, true) => ("sturb", OperandSize::Size32),
+ (&Inst::Store16 { .. }, false) => ("strh", OperandSize::Size32),
+ (&Inst::Store16 { .. }, true) => ("sturh", OperandSize::Size32),
+ (&Inst::Store32 { .. }, false) => ("str", OperandSize::Size32),
+ (&Inst::Store32 { .. }, true) => ("stur", OperandSize::Size32),
+ (&Inst::Store64 { .. }, false) => ("str", OperandSize::Size64),
+ (&Inst::Store64 { .. }, true) => ("stur", OperandSize::Size64),
+ _ => unreachable!(),
+ };
+ let rd = show_ireg_sized(rd, mb_rru, size);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}{} {}, {}", mem_str, op, rd, mem)
+ }
+ &Inst::StoreP64 { rt, rt2, ref mem, .. } => {
+ let rt = rt.show_rru(mb_rru);
+ let rt2 = rt2.show_rru(mb_rru);
+ let mem = mem.show_rru(mb_rru);
+ format!("stp {}, {}, {}", rt, rt2, mem)
+ }
+ &Inst::LoadP64 { rt, rt2, ref mem, .. } => {
+ let rt = rt.to_reg().show_rru(mb_rru);
+ let rt2 = rt2.to_reg().show_rru(mb_rru);
+ let mem = mem.show_rru(mb_rru);
+ format!("ldp {}, {}, {}", rt, rt2, mem)
+ }
+ &Inst::Mov64 { rd, rm } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("mov {}, {}", rd, rm)
+ }
+ &Inst::Mov32 { rd, rm } => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
+ let rm = show_ireg_sized(rm, mb_rru, OperandSize::Size32);
+ format!("mov {}, {}", rd, rm)
+ }
+ &Inst::MovZ { rd, ref imm, size } => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let imm = imm.show_rru(mb_rru);
+ format!("movz {}, {}", rd, imm)
+ }
+ &Inst::MovN { rd, ref imm, size } => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let imm = imm.show_rru(mb_rru);
+ format!("movn {}, {}", rd, imm)
+ }
+ &Inst::MovK { rd, ref imm, size } => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size);
+ let imm = imm.show_rru(mb_rru);
+ format!("movk {}, {}", rd, imm)
+ }
+ &Inst::CSel { rd, rn, rm, cond } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ let cond = cond.show_rru(mb_rru);
+ format!("csel {}, {}, {}, {}", rd, rn, rm, cond)
+ }
+ &Inst::CSet { rd, cond } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let cond = cond.show_rru(mb_rru);
+ format!("cset {}, {}", rd, cond)
+ }
+ &Inst::CCmpImm {
+ size,
+ rn,
+ imm,
+ nzcv,
+ cond,
+ } => {
+ let rn = show_ireg_sized(rn, mb_rru, size);
+ let imm = imm.show_rru(mb_rru);
+ let nzcv = nzcv.show_rru(mb_rru);
+ let cond = cond.show_rru(mb_rru);
+ format!("ccmp {}, {}, {}, {}", rn, imm, nzcv, cond)
+ }
+ &Inst::AtomicRMW { ty, op, .. } => {
+ format!(
+ "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}",
+ ty.bits(), op)
+ }
+ &Inst::AtomicCAS { ty, .. } => {
+ format!(
+ "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}",
+ ty.bits())
+ }
+ &Inst::AtomicLoad { ty, r_data, r_addr, .. } => {
+ format!(
+ "atomically {{ {} = zero_extend_{}_bits_at[{}] }}",
+ r_data.show_rru(mb_rru), ty.bits(), r_addr.show_rru(mb_rru))
+ }
+ &Inst::AtomicStore { ty, r_data, r_addr, .. } => {
+ format!(
+ "atomically {{ {}_bits_at[{}] = {} }}", ty.bits(), r_addr.show_rru(mb_rru), r_data.show_rru(mb_rru))
+ }
+ &Inst::Fence {} => {
+ format!("dmb ish")
+ }
+ &Inst::FpuMove64 { rd, rn } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ format!("mov {}.8b, {}.8b", rd, rn)
+ }
+ &Inst::FpuMove128 { rd, rn } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ format!("mov {}.16b, {}.16b", rd, rn)
+ }
+ &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+ let rn = show_vreg_element(rn, mb_rru, idx, size);
+ format!("mov {}, {}", rd, rn)
+ }
+ &Inst::FpuRR { fpu_op, rd, rn } => {
+ let (op, sizesrc, sizedest) = match fpu_op {
+ FPUOp1::Abs32 => ("fabs", ScalarSize::Size32, ScalarSize::Size32),
+ FPUOp1::Abs64 => ("fabs", ScalarSize::Size64, ScalarSize::Size64),
+ FPUOp1::Neg32 => ("fneg", ScalarSize::Size32, ScalarSize::Size32),
+ FPUOp1::Neg64 => ("fneg", ScalarSize::Size64, ScalarSize::Size64),
+ FPUOp1::Sqrt32 => ("fsqrt", ScalarSize::Size32, ScalarSize::Size32),
+ FPUOp1::Sqrt64 => ("fsqrt", ScalarSize::Size64, ScalarSize::Size64),
+ FPUOp1::Cvt32To64 => ("fcvt", ScalarSize::Size32, ScalarSize::Size64),
+ FPUOp1::Cvt64To32 => ("fcvt", ScalarSize::Size64, ScalarSize::Size32),
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, sizedest);
+ let rn = show_vreg_scalar(rn, mb_rru, sizesrc);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::FpuRRR { fpu_op, rd, rn, rm } => {
+ let (op, size) = match fpu_op {
+ FPUOp2::Add32 => ("fadd", ScalarSize::Size32),
+ FPUOp2::Add64 => ("fadd", ScalarSize::Size64),
+ FPUOp2::Sub32 => ("fsub", ScalarSize::Size32),
+ FPUOp2::Sub64 => ("fsub", ScalarSize::Size64),
+ FPUOp2::Mul32 => ("fmul", ScalarSize::Size32),
+ FPUOp2::Mul64 => ("fmul", ScalarSize::Size64),
+ FPUOp2::Div32 => ("fdiv", ScalarSize::Size32),
+ FPUOp2::Div64 => ("fdiv", ScalarSize::Size64),
+ FPUOp2::Max32 => ("fmax", ScalarSize::Size32),
+ FPUOp2::Max64 => ("fmax", ScalarSize::Size64),
+ FPUOp2::Min32 => ("fmin", ScalarSize::Size32),
+ FPUOp2::Min64 => ("fmin", ScalarSize::Size64),
+ FPUOp2::Sqadd64 => ("sqadd", ScalarSize::Size64),
+ FPUOp2::Uqadd64 => ("uqadd", ScalarSize::Size64),
+ FPUOp2::Sqsub64 => ("sqsub", ScalarSize::Size64),
+ FPUOp2::Uqsub64 => ("uqsub", ScalarSize::Size64),
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+ let rn = show_vreg_scalar(rn, mb_rru, size);
+ let rm = show_vreg_scalar(rm, mb_rru, size);
+ format!("{} {}, {}, {}", op, rd, rn, rm)
+ }
+ &Inst::FpuRRI { fpu_op, rd, rn } => {
+ let (op, imm, vector) = match fpu_op {
+ FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true),
+ FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false),
+ FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true),
+ FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false),
+ };
+
+ let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
+ |reg, mb_rru| show_vreg_vector(reg, mb_rru, VectorSize::Size32x2)
+ } else {
+ |reg, mb_rru| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64)
+ };
+ let rd = show_vreg_fn(rd.to_reg(), mb_rru);
+ let rn = show_vreg_fn(rn, mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, imm)
+ }
+ &Inst::FpuRRRR {
+ fpu_op,
+ rd,
+ rn,
+ rm,
+ ra,
+ } => {
+ let (op, size) = match fpu_op {
+ FPUOp3::MAdd32 => ("fmadd", ScalarSize::Size32),
+ FPUOp3::MAdd64 => ("fmadd", ScalarSize::Size64),
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+ let rn = show_vreg_scalar(rn, mb_rru, size);
+ let rm = show_vreg_scalar(rm, mb_rru, size);
+ let ra = show_vreg_scalar(ra, mb_rru, size);
+ format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra)
+ }
+ &Inst::FpuCmp32 { rn, rm } => {
+ let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size32);
+ let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size32);
+ format!("fcmp {}, {}", rn, rm)
+ }
+ &Inst::FpuCmp64 { rn, rm } => {
+ let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64);
+ let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size64);
+ format!("fcmp {}, {}", rn, rm)
+ }
+ &Inst::FpuLoad32 { rd, ref mem, .. } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}ldr {}, {}", mem_str, rd, mem)
+ }
+ &Inst::FpuLoad64 { rd, ref mem, .. } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}ldr {}, {}", mem_str, rd, mem)
+ }
+ &Inst::FpuLoad128 { rd, ref mem, .. } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ let rd = "q".to_string() + &rd[1..];
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}ldr {}, {}", mem_str, rd, mem)
+ }
+ &Inst::FpuStore32 { rd, ref mem, .. } => {
+ let rd = show_vreg_scalar(rd, mb_rru, ScalarSize::Size32);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}str {}, {}", mem_str, rd, mem)
+ }
+ &Inst::FpuStore64 { rd, ref mem, .. } => {
+ let rd = show_vreg_scalar(rd, mb_rru, ScalarSize::Size64);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}str {}, {}", mem_str, rd, mem)
+ }
+ &Inst::FpuStore128 { rd, ref mem, .. } => {
+ let rd = rd.show_rru(mb_rru);
+ let rd = "q".to_string() + &rd[1..];
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}str {}, {}", mem_str, rd, mem)
+ }
+ &Inst::LoadFpuConst64 { rd, const_data } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
+ format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, f64::from_bits(const_data))
+ }
+ &Inst::LoadFpuConst128 { rd, const_data } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size128);
+ format!("ldr {}, pc+8 ; b 20 ; data.f128 0x{:032x}", rd, const_data)
+ }
+ &Inst::FpuToInt { op, rd, rn } => {
+ let (op, sizesrc, sizedest) = match op {
+ FpuToIntOp::F32ToI32 => ("fcvtzs", ScalarSize::Size32, OperandSize::Size32),
+ FpuToIntOp::F32ToU32 => ("fcvtzu", ScalarSize::Size32, OperandSize::Size32),
+ FpuToIntOp::F32ToI64 => ("fcvtzs", ScalarSize::Size32, OperandSize::Size64),
+ FpuToIntOp::F32ToU64 => ("fcvtzu", ScalarSize::Size32, OperandSize::Size64),
+ FpuToIntOp::F64ToI32 => ("fcvtzs", ScalarSize::Size64, OperandSize::Size32),
+ FpuToIntOp::F64ToU32 => ("fcvtzu", ScalarSize::Size64, OperandSize::Size32),
+ FpuToIntOp::F64ToI64 => ("fcvtzs", ScalarSize::Size64, OperandSize::Size64),
+ FpuToIntOp::F64ToU64 => ("fcvtzu", ScalarSize::Size64, OperandSize::Size64),
+ };
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, sizedest);
+ let rn = show_vreg_scalar(rn, mb_rru, sizesrc);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::IntToFpu { op, rd, rn } => {
+ let (op, sizesrc, sizedest) = match op {
+ IntToFpuOp::I32ToF32 => ("scvtf", OperandSize::Size32, ScalarSize::Size32),
+ IntToFpuOp::U32ToF32 => ("ucvtf", OperandSize::Size32, ScalarSize::Size32),
+ IntToFpuOp::I64ToF32 => ("scvtf", OperandSize::Size64, ScalarSize::Size32),
+ IntToFpuOp::U64ToF32 => ("ucvtf", OperandSize::Size64, ScalarSize::Size32),
+ IntToFpuOp::I32ToF64 => ("scvtf", OperandSize::Size32, ScalarSize::Size64),
+ IntToFpuOp::U32ToF64 => ("ucvtf", OperandSize::Size32, ScalarSize::Size64),
+ IntToFpuOp::I64ToF64 => ("scvtf", OperandSize::Size64, ScalarSize::Size64),
+ IntToFpuOp::U64ToF64 => ("ucvtf", OperandSize::Size64, ScalarSize::Size64),
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, sizedest);
+ let rn = show_ireg_sized(rn, mb_rru, sizesrc);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::FpuCSel32 { rd, rn, rm, cond } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32);
+ let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size32);
+ let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size32);
+ let cond = cond.show_rru(mb_rru);
+ format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond)
+ }
+ &Inst::FpuCSel64 { rd, rn, rm, cond } => {
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
+ let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64);
+ let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size64);
+ let cond = cond.show_rru(mb_rru);
+ format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond)
+ }
+ &Inst::FpuRound { op, rd, rn } => {
+ let (inst, size) = match op {
+ FpuRoundMode::Minus32 => ("frintm", ScalarSize::Size32),
+ FpuRoundMode::Minus64 => ("frintm", ScalarSize::Size64),
+ FpuRoundMode::Plus32 => ("frintp", ScalarSize::Size32),
+ FpuRoundMode::Plus64 => ("frintp", ScalarSize::Size64),
+ FpuRoundMode::Zero32 => ("frintz", ScalarSize::Size32),
+ FpuRoundMode::Zero64 => ("frintz", ScalarSize::Size64),
+ FpuRoundMode::Nearest32 => ("frintn", ScalarSize::Size32),
+ FpuRoundMode::Nearest64 => ("frintn", ScalarSize::Size64),
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+ let rn = show_vreg_scalar(rn, mb_rru, size);
+ format!("{} {}, {}", inst, rd, rn)
+ }
+ &Inst::MovToFpu { rd, rn, size } => {
+ let operand_size = size.operand_size();
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, operand_size);
+ format!("fmov {}, {}", rd, rn)
+ }
+ &Inst::MovToVec { rd, rn, idx, size } => {
+ let rd = show_vreg_element(rd.to_reg(), mb_rru, idx, size);
+ let rn = show_ireg_sized(rn, mb_rru, size.operand_size());
+ format!("mov {}, {}", rd, rn)
+ }
+ &Inst::MovFromVec { rd, rn, idx, size } => {
+ let op = match size {
+ VectorSize::Size8x16 => "umov",
+ VectorSize::Size16x8 => "umov",
+ VectorSize::Size32x4 => "mov",
+ VectorSize::Size64x2 => "mov",
+ _ => unimplemented!(),
+ };
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, size.operand_size());
+ let rn = show_vreg_element(rn, mb_rru, idx, size);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::MovFromVecSigned {
+ rd,
+ rn,
+ idx,
+ size,
+ scalar_size,
+ } => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, scalar_size);
+ let rn = show_vreg_element(rn, mb_rru, idx, size);
+ format!("smov {}, {}", rd, rn)
+ }
+ &Inst::VecDup { rd, rn, size } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+ let rn = show_ireg_sized(rn, mb_rru, size.operand_size());
+ format!("dup {}, {}", rd, rn)
+ }
+ &Inst::VecDupFromFpu { rd, rn, size } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+ let rn = show_vreg_element(rn, mb_rru, 0, size);
+ format!("dup {}, {}", rd, rn)
+ }
+ &Inst::VecDupImm { rd, imm, invert, size } => {
+ let imm = imm.show_rru(mb_rru);
+ let op = if invert {
+ "mvni"
+ } else {
+ "movi"
+ };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+
+ format!("{} {}, {}", op, rd, imm)
+ }
+ &Inst::VecExtend { t, rd, rn, high_half } => {
+ let (op, dest, src) = match (t, high_half) {
+ (VecExtendOp::Sxtl8, false) => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+ (VecExtendOp::Sxtl8, true) => ("sxtl2", VectorSize::Size16x8, VectorSize::Size8x16),
+ (VecExtendOp::Sxtl16, false) => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+ (VecExtendOp::Sxtl16, true) => ("sxtl2", VectorSize::Size32x4, VectorSize::Size16x8),
+ (VecExtendOp::Sxtl32, false) => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2),
+ (VecExtendOp::Sxtl32, true) => ("sxtl2", VectorSize::Size64x2, VectorSize::Size32x4),
+ (VecExtendOp::Uxtl8, false) => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+ (VecExtendOp::Uxtl8, true) => ("uxtl2", VectorSize::Size16x8, VectorSize::Size8x16),
+ (VecExtendOp::Uxtl16, false) => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+ (VecExtendOp::Uxtl16, true) => ("uxtl2", VectorSize::Size32x4, VectorSize::Size16x8),
+ (VecExtendOp::Uxtl32, false) => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2),
+ (VecExtendOp::Uxtl32, true) => ("uxtl2", VectorSize::Size64x2, VectorSize::Size32x4),
+ };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
+ let rn = show_vreg_vector(rn, mb_rru, src);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::VecMovElement {
+ rd,
+ rn,
+ dest_idx,
+ src_idx,
+ size,
+ } => {
+ let rd = show_vreg_element(rd.to_reg(), mb_rru, dest_idx, size);
+ let rn = show_vreg_element(rn, mb_rru, src_idx, size);
+ format!("mov {}, {}", rd, rn)
+ }
+ &Inst::VecMiscNarrow { op, rd, rn, size, high_half } => {
+ let dest_size = if high_half {
+ assert!(size.is_128bits());
+ size
+ } else {
+ size.halve()
+ };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size);
+ let rn = show_vreg_vector(rn, mb_rru, size.widen());
+ let op = match (op, high_half) {
+ (VecMiscNarrowOp::Xtn, false) => "xtn",
+ (VecMiscNarrowOp::Xtn, true) => "xtn2",
+ (VecMiscNarrowOp::Sqxtn, false) => "sqxtn",
+ (VecMiscNarrowOp::Sqxtn, true) => "sqxtn2",
+ (VecMiscNarrowOp::Sqxtun, false) => "sqxtun",
+ (VecMiscNarrowOp::Sqxtun, true) => "sqxtun2",
+ };
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op,
+ size,
+ } => {
+ let (op, size) = match alu_op {
+ VecALUOp::Sqadd => ("sqadd", size),
+ VecALUOp::Uqadd => ("uqadd", size),
+ VecALUOp::Sqsub => ("sqsub", size),
+ VecALUOp::Uqsub => ("uqsub", size),
+ VecALUOp::Cmeq => ("cmeq", size),
+ VecALUOp::Cmge => ("cmge", size),
+ VecALUOp::Cmgt => ("cmgt", size),
+ VecALUOp::Cmhs => ("cmhs", size),
+ VecALUOp::Cmhi => ("cmhi", size),
+ VecALUOp::Fcmeq => ("fcmeq", size),
+ VecALUOp::Fcmgt => ("fcmgt", size),
+ VecALUOp::Fcmge => ("fcmge", size),
+ VecALUOp::And => ("and", VectorSize::Size8x16),
+ VecALUOp::Bic => ("bic", VectorSize::Size8x16),
+ VecALUOp::Orr => ("orr", VectorSize::Size8x16),
+ VecALUOp::Eor => ("eor", VectorSize::Size8x16),
+ VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
+ VecALUOp::Umaxp => ("umaxp", size),
+ VecALUOp::Add => ("add", size),
+ VecALUOp::Sub => ("sub", size),
+ VecALUOp::Mul => ("mul", size),
+ VecALUOp::Sshl => ("sshl", size),
+ VecALUOp::Ushl => ("ushl", size),
+ VecALUOp::Umin => ("umin", size),
+ VecALUOp::Smin => ("smin", size),
+ VecALUOp::Umax => ("umax", size),
+ VecALUOp::Smax => ("smax", size),
+ VecALUOp::Urhadd => ("urhadd", size),
+ VecALUOp::Fadd => ("fadd", size),
+ VecALUOp::Fsub => ("fsub", size),
+ VecALUOp::Fdiv => ("fdiv", size),
+ VecALUOp::Fmax => ("fmax", size),
+ VecALUOp::Fmin => ("fmin", size),
+ VecALUOp::Fmul => ("fmul", size),
+ VecALUOp::Addp => ("addp", size),
+ VecALUOp::Umlal => ("umlal", size),
+ VecALUOp::Zip1 => ("zip1", size),
+ VecALUOp::Smull => ("smull", size),
+ VecALUOp::Smull2 => ("smull2", size),
+ };
+ let rd_size = match alu_op {
+ VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
+ _ => size
+ };
+ let rn_size = match alu_op {
+ VecALUOp::Smull => size.halve(),
+ _ => size
+ };
+ let rm_size = rn_size;
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
+ let rn = show_vreg_vector(rn, mb_rru, rn_size);
+ let rm = show_vreg_vector(rm, mb_rru, rm_size);
+ format!("{} {}, {}, {}", op, rd, rn, rm)
+ }
+ &Inst::VecMisc { op, rd, rn, size } => {
+ let is_shll = op == VecMisc2::Shll;
+ let suffix = match (is_shll, size) {
+ (true, VectorSize::Size8x8) => ", #8",
+ (true, VectorSize::Size16x4) => ", #16",
+ (true, VectorSize::Size32x2) => ", #32",
+ _ => "",
+ };
+
+ let (op, size) = match op {
+ VecMisc2::Not => (
+ "mvn",
+ if size.is_128bits() {
+ VectorSize::Size8x16
+ } else {
+ VectorSize::Size8x8
+ },
+ ),
+ VecMisc2::Neg => ("neg", size),
+ VecMisc2::Abs => ("abs", size),
+ VecMisc2::Fabs => ("fabs", size),
+ VecMisc2::Fneg => ("fneg", size),
+ VecMisc2::Fsqrt => ("fsqrt", size),
+ VecMisc2::Rev64 => ("rev64", size),
+ VecMisc2::Shll => ("shll", size),
+ VecMisc2::Fcvtzs => ("fcvtzs", size),
+ VecMisc2::Fcvtzu => ("fcvtzu", size),
+ VecMisc2::Scvtf => ("scvtf", size),
+ VecMisc2::Ucvtf => ("ucvtf", size),
+ VecMisc2::Frintn => ("frintn", size),
+ VecMisc2::Frintz => ("frintz", size),
+ VecMisc2::Frintm => ("frintm", size),
+ VecMisc2::Frintp => ("frintp", size),
+ };
+
+ let rd_size = if is_shll { size.widen() } else { size };
+
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
+ let rn = show_vreg_vector(rn, mb_rru, size);
+ format!("{} {}, {}{}", op, rd, rn, suffix)
+ }
+ &Inst::VecLanes { op, rd, rn, size } => {
+ let op = match op {
+ VecLanesOp::Uminv => "uminv",
+ VecLanesOp::Addv => "addv",
+ };
+ let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+ let rn = show_vreg_vector(rn, mb_rru, size);
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::VecShiftImm { op, rd, rn, size, imm } => {
+ let op = match op {
+ VecShiftImmOp::Shl => "shl",
+ VecShiftImmOp::Ushr => "ushr",
+ VecShiftImmOp::Sshr => "sshr",
+ };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+ let rn = show_vreg_vector(rn, mb_rru, size);
+ format!("{} {}, {}, #{}", op, rd, rn, imm)
+ }
+ &Inst::VecExtract { rd, rn, rm, imm4 } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16);
+ let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16);
+ let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
+ format!("ext {}, {}, {}, #{}", rd, rn, rm, imm4)
+ }
+ &Inst::VecTbl {
+ rd,
+ rn,
+ rm,
+ is_extension,
+ } => {
+ let op = if is_extension { "tbx" } else { "tbl" };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16);
+ let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16);
+ let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
+ format!("{} {}, {{ {} }}, {}", op, rd, rn, rm)
+ }
+ &Inst::VecTbl2 {
+ rd,
+ rn,
+ rn2,
+ rm,
+ is_extension,
+ } => {
+ let op = if is_extension { "tbx" } else { "tbl" };
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16);
+ let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16);
+ let rn2 = show_vreg_vector(rn2, mb_rru, VectorSize::Size8x16);
+ let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
+ format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
+ }
+ &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+ let rn = rn.show_rru(mb_rru);
+
+ format!("ld1r {{ {} }}, [{}]", rd, rn)
+ }
+ &Inst::VecCSel { rd, rn, rm, cond } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16);
+ let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16);
+ let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
+ let cond = cond.show_rru(mb_rru);
+ format!("vcsel {}, {}, {}, {} (if-then-else diamond)", rd, rn, rm, cond)
+ }
+ &Inst::MovToNZCV { rn } => {
+ let rn = rn.show_rru(mb_rru);
+ format!("msr nzcv, {}", rn)
+ }
+ &Inst::MovFromNZCV { rd } => {
+ let rd = rd.to_reg().show_rru(mb_rru);
+ format!("mrs {}, nzcv", rd)
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ } if from_bits >= 8 => {
+ // Is the destination a 32-bit register? Corresponds to whether
+ // extend-to width is <= 32 bits, *unless* we have an unsigned
+ // 32-to-64-bit extension, which is implemented with a "mov" to a
+ // 32-bit (W-reg) dest, because this zeroes the top 32 bits.
+ let dest_size = if !signed && from_bits == 32 && to_bits == 64 {
+ OperandSize::Size32
+ } else {
+ OperandSize::from_bits(to_bits)
+ };
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
+ let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_bits(from_bits));
+ let op = match (signed, from_bits, to_bits) {
+ (false, 8, 32) => "uxtb",
+ (true, 8, 32) => "sxtb",
+ (false, 16, 32) => "uxth",
+ (true, 16, 32) => "sxth",
+ (false, 8, 64) => "uxtb",
+ (true, 8, 64) => "sxtb",
+ (false, 16, 64) => "uxth",
+ (true, 16, 64) => "sxth",
+ (false, 32, 64) => "mov", // special case (see above).
+ (true, 32, 64) => "sxtw",
+ _ => panic!("Unsupported Extend case: {:?}", self),
+ };
+ format!("{} {}, {}", op, rd, rn)
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ } if from_bits == 1 && signed => {
+ let dest_size = OperandSize::from_bits(to_bits);
+ let zr = if dest_size.is32() { "wzr" } else { "xzr" };
+ let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
+ let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32);
+ format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd)
+ }
+ &Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ ..
+ } if from_bits == 1 && !signed => {
+ let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
+ let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32);
+ format!("and {}, {}, #1", rd, rn)
+ }
+ &Inst::Extend { .. } => {
+ panic!("Unsupported Extend case");
+ }
+ &Inst::Call { .. } => format!("bl 0"),
+ &Inst::CallInd { ref info, .. } => {
+ let rn = info.rn.show_rru(mb_rru);
+ format!("blr {}", rn)
+ }
+ &Inst::Ret => "ret".to_string(),
+ &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
+ &Inst::Jump { ref dest } => {
+ let dest = dest.show_rru(mb_rru);
+ format!("b {}", dest)
+ }
+ &Inst::CondBr {
+ ref taken,
+ ref not_taken,
+ ref kind,
+ } => {
+ let taken = taken.show_rru(mb_rru);
+ let not_taken = not_taken.show_rru(mb_rru);
+ match kind {
+ &CondBrKind::Zero(reg) => {
+ let reg = reg.show_rru(mb_rru);
+ format!("cbz {}, {} ; b {}", reg, taken, not_taken)
+ }
+ &CondBrKind::NotZero(reg) => {
+ let reg = reg.show_rru(mb_rru);
+ format!("cbnz {}, {} ; b {}", reg, taken, not_taken)
+ }
+ &CondBrKind::Cond(c) => {
+ let c = c.show_rru(mb_rru);
+ format!("b.{} {} ; b {}", c, taken, not_taken)
+ }
+ }
+ }
+ &Inst::IndirectBr { rn, .. } => {
+ let rn = rn.show_rru(mb_rru);
+ format!("br {}", rn)
+ }
+ &Inst::Brk => "brk #0".to_string(),
+ &Inst::Udf { .. } => "udf".to_string(),
+ &Inst::TrapIf { ref kind, .. } => match kind {
+ &CondBrKind::Zero(reg) => {
+ let reg = reg.show_rru(mb_rru);
+ format!("cbnz {}, 8 ; udf", reg)
+ }
+ &CondBrKind::NotZero(reg) => {
+ let reg = reg.show_rru(mb_rru);
+ format!("cbz {}, 8 ; udf", reg)
+ }
+ &CondBrKind::Cond(c) => {
+ let c = c.invert().show_rru(mb_rru);
+ format!("b.{} 8 ; udf", c)
+ }
+ },
+ &Inst::Adr { rd, off } => {
+ let rd = rd.show_rru(mb_rru);
+ format!("adr {}, pc+{}", rd, off)
+ }
+ &Inst::Word4 { data } => format!("data.i32 {}", data),
+ &Inst::Word8 { data } => format!("data.i64 {}", data),
+ &Inst::JTSequence {
+ ref info,
+ ridx,
+ rtmp1,
+ rtmp2,
+ ..
+ } => {
+ let ridx = ridx.show_rru(mb_rru);
+ let rtmp1 = rtmp1.show_rru(mb_rru);
+ let rtmp2 = rtmp2.show_rru(mb_rru);
+ let default_target = info.default_target.show_rru(mb_rru);
+ format!(
+ concat!(
+ "b.hs {} ; ",
+ "adr {}, pc+16 ; ",
+ "ldrsw {}, [{}, {}, LSL 2] ; ",
+ "add {}, {}, {} ; ",
+ "br {} ; ",
+ "jt_entries {:?}"
+ ),
+ default_target,
+ rtmp1,
+ rtmp2,
+ rtmp1,
+ ridx,
+ rtmp1,
+ rtmp1,
+ rtmp2,
+ rtmp1,
+ info.targets
+ )
+ }
+ &Inst::LoadExtName {
+ rd,
+ ref name,
+ offset,
+ } => {
+ let rd = rd.show_rru(mb_rru);
+ format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset)
+ }
+ &Inst::LoadAddr { rd, ref mem } => {
+ // TODO: we really should find a better way to avoid duplication of
+ // this logic between `emit()` and `show_rru()` -- a separate 1-to-N
+ // expansion stage (i.e., legalization, but without the slow edit-in-place
+ // of the existing legalization framework).
+ let (mem_insts, mem) = mem_finalize(0, mem, state);
+ let mut ret = String::new();
+ for inst in mem_insts.into_iter() {
+ ret.push_str(&inst.show_rru(mb_rru));
+ }
+ let (reg, offset) = match mem {
+ AMode::Unscaled(r, simm9) => (r, simm9.value()),
+ AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+ _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
+ };
+ let abs_offset = if offset < 0 {
+ -offset as u64
+ } else {
+ offset as u64
+ };
+ let alu_op = if offset < 0 {
+ ALUOp::Sub64
+ } else {
+ ALUOp::Add64
+ };
+
+ if offset == 0 {
+ let mov = Inst::mov(rd, reg);
+ ret.push_str(&mov.show_rru(mb_rru));
+ } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
+ let add = Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn: reg,
+ imm12,
+ };
+ ret.push_str(&add.show_rru(mb_rru));
+ } else {
+ let tmp = writable_spilltmp_reg();
+ for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
+ ret.push_str(&inst.show_rru(mb_rru));
+ }
+ let add = Inst::AluRRR {
+ alu_op,
+ rd,
+ rn: reg,
+ rm: tmp.to_reg(),
+ };
+ ret.push_str(&add.show_rru(mb_rru));
+ }
+ ret
+ }
+ &Inst::VirtualSPOffsetAdj { offset } => {
+ state.virtual_sp_offset += offset;
+ format!("virtual_sp_offset_adjust {}", offset)
+ }
+ &Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space),
+ }
+ }
+}
+
+//=============================================================================
+// Label fixups and jump veneers.
+
+/// Different forms of label references for different instruction formats.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LabelUse {
+ /// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19
+ /// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond.
+ Branch19,
+ /// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26
+ /// signed bits, in bits 25:0. Used by b, bl.
+ Branch26,
+ /// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits,
+ /// in bits 23:5.
+ Ldr19,
+ /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
+ /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
+ Adr21,
+ /// 32-bit PC relative constant offset (from address of constant itself),
+ /// signed. Used in jump tables.
+ PCRel32,
+}
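+
+// Worked example (illustrative): `Branch19` holds a 19-bit signed immediate that
+// the CPU scales by 4, so a conditional branch reaches roughly +/-1 MiB (2^20
+// bytes) from its own address; `Branch26` scales a 26-bit immediate the same
+// way, for roughly +/-128 MiB. `max_pos_range`/`max_neg_range` below report
+// these bounds.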
+
+impl MachInstLabelUse for LabelUse {
+ /// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned.
+ const ALIGN: CodeOffset = 4;
+
+ /// Maximum PC-relative range (positive), inclusive.
+ fn max_pos_range(self) -> CodeOffset {
+ match self {
+ // 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20
+ // from zero. Likewise for two other shifted cases below.
+ LabelUse::Branch19 => (1 << 20) - 1,
+ LabelUse::Branch26 => (1 << 27) - 1,
+ LabelUse::Ldr19 => (1 << 20) - 1,
+ // Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total
+ // range.
+ LabelUse::Adr21 => (1 << 20) - 1,
+ LabelUse::PCRel32 => 0x7fffffff,
+ }
+ }
+
+ /// Maximum PC-relative range (negative).
+ fn max_neg_range(self) -> CodeOffset {
+ // All forms are two's-complement signed offsets, so the negative limit is one
+ // more than the positive limit.
+ self.max_pos_range() + 1
+ }
+
+ /// Size of window into code needed to do the patch.
+ fn patch_size(self) -> CodeOffset {
+ // Patch is on one instruction only for all of these label reference types.
+ 4
+ }
+
+ /// Perform the patch.
+ fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
+ let pc_rel = (label_offset as i64) - (use_offset as i64);
+ debug_assert!(pc_rel <= self.max_pos_range() as i64);
+ debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
+ let pc_rel = pc_rel as u32;
+ let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+ let mask = match self {
+ LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive
+ LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive
+ LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive
+ LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive
+ LabelUse::PCRel32 => 0xffffffff,
+ };
+ let pc_rel_shifted = match self {
+ LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel,
+ _ => {
+ debug_assert!(pc_rel & 3 == 0);
+ pc_rel >> 2
+ }
+ };
+ let pc_rel_inserted = match self {
+ LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5,
+ LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff,
+ LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10,
+ LabelUse::PCRel32 => pc_rel_shifted,
+ };
+ let is_add = match self {
+ LabelUse::PCRel32 => true,
+ _ => false,
+ };
+ let insn_word = if is_add {
+ insn_word.wrapping_add(pc_rel_inserted)
+ } else {
+ (insn_word & !mask) | pc_rel_inserted
+ };
+ buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
+ }
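+
+ // Worked example (illustrative): patching a `b.cond` (`Branch19`) at offset
+ // 0x1000 whose label resolves to 0x1040 gives pc_rel = 0x40; shifting right
+ // by 2 yields 0x10, which is masked to 19 bits and placed in bits 23:5 of the
+ // instruction word (0x10 << 5 = 0x200), leaving the bits outside the mask
+ // untouched.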
+
+ /// Is a veneer supported for this label reference type?
+ fn supports_veneer(self) -> bool {
+ match self {
+ LabelUse::Branch19 => true, // veneer is a Branch26
+ _ => false,
+ }
+ }
+
+ /// How large is the veneer, if supported?
+ fn veneer_size(self) -> CodeOffset {
+ 4
+ }
+
+ /// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return
+ /// an offset and label-use for the veneer's use of the original label.
+ fn generate_veneer(
+ self,
+ buffer: &mut [u8],
+ veneer_offset: CodeOffset,
+ ) -> (CodeOffset, LabelUse) {
+ match self {
+ LabelUse::Branch19 => {
+ // veneer is a Branch26 (unconditional branch). Just encode directly here -- don't
+ // bother with constructing an Inst.
+ let insn_word = 0b000101 << 26;
+ buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word));
+ (veneer_offset, LabelUse::Branch26)
+ }
+ _ => panic!("Unsupported label-reference type for veneer generation!"),
+ }
+ }
+}
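+// Illustrative sketch, not part of the upstream change: a worked example of the
+// `patch` arithmetic above for a `Branch26` use, plus the `Branch19` veneer. It
+// assumes the `MachInstLabelUse` trait is reachable via this module's imports
+// (as the `impl` above implies).
+#[cfg(test)]
+mod label_use_patch_example {
+    use super::*;
+
+    #[test]
+    fn branch26_patch_and_branch19_veneer() {
+        // An unconditional branch (`b`) placeholder: opcode 0b000101 in bits 31:26, imm26 = 0.
+        let mut buf = u32::to_le_bytes(0b000101 << 26);
+        // Branch at code offset 0x10, label at 0x30: pc-rel = 0x20, so imm26 = 0x20 >> 2 = 8.
+        LabelUse::Branch26.patch(&mut buf, 0x10, 0x30);
+        assert_eq!(u32::from_le_bytes(buf) & 0x03ff_ffff, 8);
+
+        // A Branch19 veneer is just a `b` whose own label use is a Branch26.
+        let mut veneer = [0u8; 4];
+        let (_, label_use) = LabelUse::Branch19.generate_veneer(&mut veneer, 0x100);
+        assert_eq!(u32::from_le_bytes(veneer), 0b000101 << 26);
+        assert!(matches!(label_use, LabelUse::Branch26));
+    }
+}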
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs
new file mode 100644
index 0000000000..0b4babe04a
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs
@@ -0,0 +1,351 @@
+//! AArch64 ISA definitions: registers.
+
+use crate::isa::aarch64::inst::OperandSize;
+use crate::isa::aarch64::inst::ScalarSize;
+use crate::isa::aarch64::inst::VectorSize;
+use crate::settings;
+
+use regalloc::{
+ PrettyPrint, RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES,
+};
+
+use std::string::{String, ToString};
+
+//=============================================================================
+// Registers, the Universe thereof, and printing
+
+/// The pinned register on this architecture.
+/// It must be the same as SpiderMonkey's HeapReg, which is defined here:
+/// https://searchfox.org/mozilla-central/source/js/src/jit/arm64/Assembler-arm64.h#103
+pub const PINNED_REG: u8 = 21;
+
+#[rustfmt::skip]
+const XREG_INDICES: [u8; 31] = [
+ // X0 - X7
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ // X8 - X15
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ // X16, X17
+ 58, 59,
+ // X18
+ 60,
+ // X19, X20
+ 48, 49,
+ // X21, put aside because it's the pinned register.
+ 57,
+ // X22 - X28
+ 50, 51, 52, 53, 54, 55, 56,
+ // X29 (FP)
+ 61,
+ // X30 (LR)
+ 62,
+];
+
+const ZERO_REG_INDEX: u8 = 63;
+
+const SP_REG_INDEX: u8 = 64;
+
+/// Get a reference to an X-register (integer register).
+pub fn xreg(num: u8) -> Reg {
+ assert!(num < 31);
+ Reg::new_real(
+ RegClass::I64,
+ /* enc = */ num,
+ /* index = */ XREG_INDICES[num as usize],
+ )
+}
+
+/// Get a writable reference to an X-register.
+pub fn writable_xreg(num: u8) -> Writable<Reg> {
+ Writable::from_reg(xreg(num))
+}
+
+/// Get a reference to a V-register (vector/FP register).
+pub fn vreg(num: u8) -> Reg {
+ assert!(num < 32);
+ Reg::new_real(RegClass::V128, /* enc = */ num, /* index = */ num)
+}
+
+/// Get a writable reference to a V-register.
+pub fn writable_vreg(num: u8) -> Writable<Reg> {
+ Writable::from_reg(vreg(num))
+}
+
+/// Get a reference to the zero-register.
+pub fn zero_reg() -> Reg {
+ // This should be the same as what xreg(31) returns, except that
+ // we use the special index into the register index space.
+ Reg::new_real(
+ RegClass::I64,
+ /* enc = */ 31,
+ /* index = */ ZERO_REG_INDEX,
+ )
+}
+
+/// Get a writable reference to the zero-register (this discards a result).
+pub fn writable_zero_reg() -> Writable<Reg> {
+ Writable::from_reg(zero_reg())
+}
+
+/// Get a reference to the stack-pointer register.
+pub fn stack_reg() -> Reg {
+ // XSP (stack) and XZR (zero) are logically different registers which have
+ // the same hardware encoding, and whose meaning, in real aarch64
+ // instructions, is context-dependent. For convenience of
+ // universe-construction and for correct printing, we make them be two
+ // different real registers.
+ Reg::new_real(
+ RegClass::I64,
+ /* enc = */ 31,
+ /* index = */ SP_REG_INDEX,
+ )
+}
+
+/// Get a writable reference to the stack-pointer register.
+pub fn writable_stack_reg() -> Writable<Reg> {
+ Writable::from_reg(stack_reg())
+}
+
+/// Get a reference to the link register (x30).
+pub fn link_reg() -> Reg {
+ xreg(30)
+}
+
+/// Get a writable reference to the link register.
+pub fn writable_link_reg() -> Writable<Reg> {
+ Writable::from_reg(link_reg())
+}
+
+/// Get a reference to the frame pointer (x29).
+pub fn fp_reg() -> Reg {
+ xreg(29)
+}
+
+/// Get a writable reference to the frame pointer.
+pub fn writable_fp_reg() -> Writable<Reg> {
+ Writable::from_reg(fp_reg())
+}
+
+/// Get a reference to the first temporary register, sometimes called the "spill temporary". This
+/// register is used to compute the address of a spill slot when a direct offset addressing mode
+/// from FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve
+/// it for this purpose for simplicity; otherwise we would need a multi-stage analysis where we
+/// first determine how many spill slots we have, then perhaps remove the reg from the pool and
+/// recompute regalloc.
+///
+/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is
+/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it
+/// to live through call instructions.
+pub fn spilltmp_reg() -> Reg {
+ xreg(16)
+}
+
+/// Get a writable reference to the spilltmp reg.
+pub fn writable_spilltmp_reg() -> Writable<Reg> {
+ Writable::from_reg(spilltmp_reg())
+}
+
+/// Get a reference to the second temp register. We need this in some edge cases
+/// where we need both the spilltmp and another temporary.
+///
+/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is
+/// free to use otherwise.
+pub fn tmp2_reg() -> Reg {
+ xreg(17)
+}
+
+/// Get a writable reference to the tmp2 reg.
+pub fn writable_tmp2_reg() -> Writable<Reg> {
+ Writable::from_reg(tmp2_reg())
+}
+
+/// Create the register universe for AArch64.
+pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse {
+ let mut regs = vec![];
+ let mut allocable_by_class = [None; NUM_REG_CLASSES];
+
+ // Numbering scheme: we put V-regs first, then X-regs. The allocatable X-regs exclude several
+ // registers: x16 and x17 (the spilltmp/tmp2 scratch registers, aka IP0/IP1), x18 (globally
+ // reserved for platform-specific purposes), x21 (the pinned register, handled separately
+ // below), x29 (frame pointer), x30 (link register), and x31 (stack pointer or zero register,
+ // depending on context).
+
+ let v_reg_base = 0u8; // in contiguous real-register index space
+ let v_reg_count = 32;
+ for i in 0u8..v_reg_count {
+ let reg = Reg::new_real(
+ RegClass::V128,
+ /* enc = */ i,
+ /* index = */ v_reg_base + i,
+ )
+ .to_real_reg();
+ let name = format!("v{}", i);
+ regs.push((reg, name));
+ }
+ let v_reg_last = v_reg_base + v_reg_count - 1;
+
+ // Add the X registers. N.B.: the order here must match the order implied
+ // by XREG_INDICES, ZERO_REG_INDEX, and SP_REG_INDEX above.
+
+ let x_reg_base = 32u8; // in contiguous real-register index space
+ let mut x_reg_count = 0;
+
+ let uses_pinned_reg = flags.enable_pinned_reg();
+
+ for i in 0u8..32u8 {
+ // See above for excluded registers.
+ if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG {
+ continue;
+ }
+ let reg = Reg::new_real(
+ RegClass::I64,
+ /* enc = */ i,
+ /* index = */ x_reg_base + x_reg_count,
+ )
+ .to_real_reg();
+ let name = format!("x{}", i);
+ regs.push((reg, name));
+ x_reg_count += 1;
+ }
+ let x_reg_last = x_reg_base + x_reg_count - 1;
+
+ allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
+ first: x_reg_base as usize,
+ last: x_reg_last as usize,
+ suggested_scratch: Some(XREG_INDICES[19] as usize),
+ });
+ allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
+ first: v_reg_base as usize,
+ last: v_reg_last as usize,
+ suggested_scratch: Some(/* V31: */ 31),
+ });
+
+ // Other regs, not available to the allocator.
+ let allocable = if uses_pinned_reg {
+ // The pinned register is not allocatable in this case, so record the length before adding
+ // it.
+ let len = regs.len();
+ regs.push((xreg(PINNED_REG).to_real_reg(), "x21/pinned_reg".to_string()));
+ len
+ } else {
+ regs.push((xreg(PINNED_REG).to_real_reg(), "x21".to_string()));
+ regs.len()
+ };
+
+ regs.push((xreg(16).to_real_reg(), "x16".to_string()));
+ regs.push((xreg(17).to_real_reg(), "x17".to_string()));
+ regs.push((xreg(18).to_real_reg(), "x18".to_string()));
+ regs.push((fp_reg().to_real_reg(), "fp".to_string()));
+ regs.push((link_reg().to_real_reg(), "lr".to_string()));
+ regs.push((zero_reg().to_real_reg(), "xzr".to_string()));
+ regs.push((stack_reg().to_real_reg(), "sp".to_string()));
+
+ // FIXME JRS 2020Feb06: unfortunately this pushes the number of real regs
+ // to 65, which is potentially inconvenient from a compiler performance
+ // standpoint. We could possibly drop back to 64 by "losing" a vector
+ // register in future.
+
+ // Assert sanity: the indices in the register structs must match their
+ // actual indices in the array.
+ for (i, reg) in regs.iter().enumerate() {
+ assert_eq!(i, reg.0.get_index());
+ }
+
+ RealRegUniverse {
+ regs,
+ allocable,
+ allocable_by_class,
+ }
+}
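+// Illustrative sketch, not part of the upstream change: the numbering scheme in
+// practice. Default flags are assumed here, so the pinned register stays
+// allocatable and is named plain "x21".
+#[cfg(test)]
+mod universe_example {
+    use super::*;
+
+    #[test]
+    fn x0_follows_the_vector_block() {
+        let flags = settings::Flags::new(settings::builder());
+        let universe = create_reg_universe(&flags);
+        // v0..v31 occupy indices 0..=31, so the first X-reg, x0, lands at index 32,
+        // exactly as XREG_INDICES records.
+        assert_eq!(universe.regs[32].1, "x0");
+        assert_eq!(xreg(0).to_real_reg().get_index(), 32);
+        // With the pinned register left allocatable, x21 closes the allocatable prefix.
+        assert_eq!(universe.regs[universe.allocable - 1].1, "x21");
+    }
+}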
+
+/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show
+/// its name at the 32-bit size.
+pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: OperandSize) -> String {
+ let mut s = reg.show_rru(mb_rru);
+ if reg.get_class() != RegClass::I64 || !size.is32() {
+ // We can't do any better.
+ return s;
+ }
+
+ if reg.is_real() {
+ // Change (eg) "x42" into "w42" as appropriate
+ if reg.get_class() == RegClass::I64 && size.is32() && s.starts_with("x") {
+ s = "w".to_string() + &s[1..];
+ }
+ } else {
+ // Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role
+ if reg.get_class() == RegClass::I64 && size.is32() {
+ s.push('w');
+ }
+ }
+ s
+}
+
+/// Show a vector register used in a scalar context.
+pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: ScalarSize) -> String {
+ let mut s = reg.show_rru(mb_rru);
+ if reg.get_class() != RegClass::V128 {
+ // We can't do any better.
+ return s;
+ }
+
+ if reg.is_real() {
+ // Change (eg) "v0" into "d0".
+ if s.starts_with("v") {
+ let replacement = match size {
+ ScalarSize::Size8 => "b",
+ ScalarSize::Size16 => "h",
+ ScalarSize::Size32 => "s",
+ ScalarSize::Size64 => "d",
+ ScalarSize::Size128 => "q",
+ };
+ s.replace_range(0..1, replacement);
+ }
+ } else {
+ // Add a "d" suffix to RegClass::V128 vregs.
+ if reg.get_class() == RegClass::V128 {
+ s.push('d');
+ }
+ }
+ s
+}
+
+/// Show a vector register.
+pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: VectorSize) -> String {
+ assert_eq!(RegClass::V128, reg.get_class());
+ let mut s = reg.show_rru(mb_rru);
+
+ let suffix = match size {
+ VectorSize::Size8x8 => ".8b",
+ VectorSize::Size8x16 => ".16b",
+ VectorSize::Size16x4 => ".4h",
+ VectorSize::Size16x8 => ".8h",
+ VectorSize::Size32x2 => ".2s",
+ VectorSize::Size32x4 => ".4s",
+ VectorSize::Size64x2 => ".2d",
+ };
+
+ s.push_str(suffix);
+ s
+}
+
+/// Show an indexed vector element.
+pub fn show_vreg_element(
+ reg: Reg,
+ mb_rru: Option<&RealRegUniverse>,
+ idx: u8,
+ size: VectorSize,
+) -> String {
+ assert_eq!(RegClass::V128, reg.get_class());
+ let mut s = reg.show_rru(mb_rru);
+
+ let suffix = match size {
+ VectorSize::Size8x8 => "b",
+ VectorSize::Size8x16 => "b",
+ VectorSize::Size16x4 => "h",
+ VectorSize::Size16x8 => "h",
+ VectorSize::Size32x2 => "s",
+ VectorSize::Size32x4 => "s",
+ VectorSize::Size64x2 => "d",
+ };
+
+ s.push_str(&format!(".{}[{}]", suffix, idx));
+ s
+}
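+// Illustrative sketch, not part of the upstream change: how the `show_*` helpers
+// render sized register names. The universe built above (with default flags,
+// assumed here) supplies the base names.
+#[cfg(test)]
+mod show_reg_example {
+    use super::*;
+
+    #[test]
+    fn sized_register_names() {
+        let flags = settings::Flags::new(settings::builder());
+        let universe = create_reg_universe(&flags);
+        // A 32-bit view of x3 prints as "w3".
+        assert_eq!(show_ireg_sized(xreg(3), Some(&universe), OperandSize::Size32), "w3");
+        // v7 viewed as sixteen byte lanes prints as "v7.16b".
+        assert_eq!(show_vreg_vector(vreg(7), Some(&universe), VectorSize::Size8x16), "v7.16b");
+    }
+}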
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs
new file mode 100644
index 0000000000..698e094795
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs
@@ -0,0 +1,201 @@
+use super::*;
+use crate::isa::aarch64::inst::{args::PairAMode, imms::Imm12, regs, ALUOp, Inst};
+use crate::isa::unwind::input::{UnwindCode, UnwindInfo};
+use crate::machinst::UnwindInfoContext;
+use crate::result::CodegenResult;
+use alloc::vec::Vec;
+use regalloc::Reg;
+
+#[cfg(feature = "unwind")]
+pub(crate) mod systemv;
+
+pub struct AArch64UnwindInfo;
+
+impl UnwindInfoGenerator<Inst> for AArch64UnwindInfo {
+ fn create_unwind_info(
+ context: UnwindInfoContext<Inst>,
+ ) -> CodegenResult<Option<UnwindInfo<Reg>>> {
+ let word_size = 8u8;
+ let pair_size = word_size * 2;
+ let mut codes = Vec::new();
+
+ for i in context.prologue.clone() {
+ let i = i as usize;
+ let inst = &context.insts[i];
+ let offset = context.insts_layout[i];
+
+ match inst {
+ Inst::StoreP64 {
+ rt,
+ rt2,
+ mem: PairAMode::PreIndexed(rn, imm7),
+ ..
+ } if *rt == regs::fp_reg()
+ && *rt2 == regs::link_reg()
+ && *rn == regs::writable_stack_reg()
+ && imm7.value == -(pair_size as i16) =>
+ {
+ // stp fp (x29), lr (x30), [sp, #-16]!
+ codes.push((
+ offset,
+ UnwindCode::StackAlloc {
+ size: pair_size as u32,
+ },
+ ));
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *rt,
+ stack_offset: 0,
+ },
+ ));
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *rt2,
+ stack_offset: word_size as u32,
+ },
+ ));
+ }
+ Inst::StoreP64 {
+ rt,
+ rt2,
+ mem: PairAMode::PreIndexed(rn, imm7),
+ ..
+ } if rn.to_reg() == regs::stack_reg() && imm7.value % (pair_size as i16) == 0 => {
+ // stp r1, r2, [sp, #(i * #16)]
+ let stack_offset = imm7.value as u32;
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *rt,
+ stack_offset,
+ },
+ ));
+ if *rt2 != regs::zero_reg() {
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *rt2,
+ stack_offset: stack_offset + word_size as u32,
+ },
+ ));
+ }
+ }
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd,
+ rn,
+ imm12:
+ Imm12 {
+ bits: 0,
+ shift12: false,
+ },
+ } if *rd == regs::writable_fp_reg() && *rn == regs::stack_reg() => {
+ // mov fp (x29), sp.
+ codes.push((offset, UnwindCode::SetFramePointer { reg: rd.to_reg() }));
+ }
+ Inst::VirtualSPOffsetAdj { offset: adj } if offset > 0 => {
+ codes.push((offset, UnwindCode::StackAlloc { size: *adj as u32 }));
+ }
+ _ => {}
+ }
+ }
+
+ // TODO epilogues
+
+ let prologue_size = if context.prologue.is_empty() {
+ 0
+ } else {
+ context.insts_layout[context.prologue.end as usize - 1]
+ };
+
+ Ok(Some(UnwindInfo {
+ prologue_size,
+ prologue_unwind_codes: codes,
+ epilogues_unwind_codes: vec![],
+ function_size: context.len,
+ word_size,
+ initial_sp_offset: 0,
+ }))
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{ExternalName, Function, InstBuilder, Signature, StackSlotData, StackSlotKind};
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ fn test_simple_func() {
+ let isa = lookup(triple!("aarch64"))
+ .expect("expect aarch64 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::SystemV,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let result = context.mach_compile_result.unwrap();
+ let unwind_info = result.unwind_info.unwrap();
+
+ assert_eq!(
+ unwind_info,
+ UnwindInfo {
+ prologue_size: 12,
+ prologue_unwind_codes: vec![
+ (4, UnwindCode::StackAlloc { size: 16 }),
+ (
+ 4,
+ UnwindCode::SaveRegister {
+ reg: regs::fp_reg(),
+ stack_offset: 0
+ }
+ ),
+ (
+ 4,
+ UnwindCode::SaveRegister {
+ reg: regs::link_reg(),
+ stack_offset: 8
+ }
+ ),
+ (
+ 8,
+ UnwindCode::SetFramePointer {
+ reg: regs::fp_reg()
+ }
+ )
+ ],
+ epilogues_unwind_codes: vec![],
+ function_size: 24,
+ word_size: 8,
+ initial_sp_offset: 0,
+ }
+ );
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs
new file mode 100644
index 0000000000..b988314b1b
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs
@@ -0,0 +1,158 @@
+//! Unwind information for System V ABI (AArch64).
+
+use crate::isa::aarch64::inst::regs;
+use crate::isa::unwind::input;
+use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo};
+use crate::result::CodegenResult;
+use gimli::{write::CommonInformationEntry, Encoding, Format, Register};
+use regalloc::{Reg, RegClass};
+
+/// Creates a new aarch64 common information entry (CIE).
+pub fn create_cie() -> CommonInformationEntry {
+ use gimli::write::CallFrameInstruction;
+
+ let mut entry = CommonInformationEntry::new(
+ Encoding {
+ address_size: 8,
+ format: Format::Dwarf32,
+ version: 1,
+ },
+ 4, // Code alignment factor
+ -8, // Data alignment factor
+ Register(regs::link_reg().get_hw_encoding().into()),
+ );
+
+ // Every frame will start with the call frame address (CFA) at SP
+ let sp = Register(regs::stack_reg().get_hw_encoding().into());
+ entry.add_instruction(CallFrameInstruction::Cfa(sp, 0));
+
+ entry
+}
+
+/// Map Cranelift registers to their corresponding Gimli registers.
+pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> {
+ match reg.get_class() {
+ RegClass::I64 => Ok(Register(reg.get_hw_encoding().into())),
+ _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")),
+ }
+}
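+// Illustrative sketch, not part of the upstream change: for integer registers the
+// DWARF register number is simply the AArch64 hardware encoding, so x5 maps to 5.
+#[cfg(test)]
+mod map_reg_example {
+    use super::*;
+
+    #[test]
+    fn x5_maps_to_dwarf_register_5() {
+        assert!(matches!(map_reg(regs::xreg(5)), Ok(Register(5))));
+    }
+}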
+
+pub(crate) fn create_unwind_info(
+ unwind: input::UnwindInfo<Reg>,
+) -> CodegenResult<Option<UnwindInfo>> {
+ struct RegisterMapper;
+ impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
+ fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
+ Ok(map_reg(reg)?.0)
+ }
+ fn sp(&self) -> u16 {
+ regs::stack_reg().get_hw_encoding().into()
+ }
+ }
+ let map = RegisterMapper;
+ Ok(Some(UnwindInfo::build(unwind, &map)?))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{
+ types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData,
+ StackSlotKind,
+ };
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use gimli::write::Address;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ fn test_simple_func() {
+ let isa = lookup(triple!("aarch64"))
+ .expect("expect aarch64 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::SystemV,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(1234))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 24, lsda: None, instructions: [(4, CfaOffset(16)), (4, Offset(Register(29), -16)), (4, Offset(Register(30), -8)), (8, CfaRegister(Register(29)))] }");
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+
+ #[test]
+ fn test_multi_return_func() {
+ let isa = lookup(triple!("aarch64"))
+ .expect("expect aarch64 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(4321))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 40, lsda: None, instructions: [(4, CfaOffset(16)), (4, Offset(Register(29), -16)), (4, Offset(Register(30), -8)), (8, CfaRegister(Register(29)))] }");
+ }
+
+ fn create_multi_return_function(call_conv: CallConv) -> Function {
+ let mut sig = Signature::new(call_conv);
+ sig.params.push(AbiParam::new(types::I32));
+ let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+ let block0 = func.dfg.make_block();
+ let v0 = func.dfg.append_block_param(block0, types::I32);
+ let block1 = func.dfg.make_block();
+ let block2 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().brnz(v0, block2, &[]);
+ pos.ins().jump(block1, &[]);
+
+ pos.insert_block(block1);
+ pos.ins().return_(&[]);
+
+ pos.insert_block(block2);
+ pos.ins().return_(&[]);
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs
new file mode 100644
index 0000000000..17555c1bd2
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs
@@ -0,0 +1,1196 @@
+//! Lowering rules for AArch64.
+//!
+//! TODO: opportunities for better code generation:
+//!
+//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
+//! pre/post-index opportunities.
+//!
+//! - Floating-point immediates (FIMM instruction).
+
+use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::types::*;
+use crate::ir::Inst as IRInst;
+use crate::ir::{Opcode, Type};
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::CodegenResult;
+
+use crate::isa::aarch64::inst::*;
+use crate::isa::aarch64::AArch64Backend;
+
+use super::lower_inst;
+
+use crate::data_value::DataValue;
+use log::{debug, trace};
+use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
+
+//============================================================================
+// Result enum types.
+//
+// Lowering of a given value results in one of these enums, depending on the
+// modes in which we can accept the value.
+
+/// A lowering result: register, register-shift. An SSA value can always be
+/// lowered into one of these options; the register form is the fallback.
+#[derive(Clone, Debug)]
+enum ResultRS {
+ Reg(Reg),
+ RegShift(Reg, ShiftOpAndAmt),
+}
+
+/// A lowering result: register, register-shift, register-extend. An SSA value can always be
+/// lowered into one of these options; the register form is the fallback.
+#[derive(Clone, Debug)]
+enum ResultRSE {
+ Reg(Reg),
+ RegShift(Reg, ShiftOpAndAmt),
+ RegExtend(Reg, ExtendOp),
+}
+
+impl ResultRSE {
+ fn from_rs(rs: ResultRS) -> ResultRSE {
+ match rs {
+ ResultRS::Reg(r) => ResultRSE::Reg(r),
+ ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s),
+ }
+ }
+}
+
+/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form.
+/// An SSA value can always be lowered into one of these options; the register form is the
+/// fallback.
+#[derive(Clone, Debug)]
+pub(crate) enum ResultRSEImm12 {
+ Reg(Reg),
+ RegShift(Reg, ShiftOpAndAmt),
+ RegExtend(Reg, ExtendOp),
+ Imm12(Imm12),
+}
+
+impl ResultRSEImm12 {
+ fn from_rse(rse: ResultRSE) -> ResultRSEImm12 {
+ match rse {
+ ResultRSE::Reg(r) => ResultRSEImm12::Reg(r),
+ ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s),
+ ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e),
+ }
+ }
+}
+
+/// A lowering result: register, register-shift, or logical immediate form.
+/// An SSA value can always be lowered into one of these options; the register form is the
+/// fallback.
+#[derive(Clone, Debug)]
+pub(crate) enum ResultRSImmLogic {
+ Reg(Reg),
+ RegShift(Reg, ShiftOpAndAmt),
+ ImmLogic(ImmLogic),
+}
+
+impl ResultRSImmLogic {
+ fn from_rs(rse: ResultRS) -> ResultRSImmLogic {
+ match rse {
+ ResultRS::Reg(r) => ResultRSImmLogic::Reg(r),
+ ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s),
+ }
+ }
+}
+
+/// A lowering result: register or immediate shift amount (arg to a shift op).
+/// An SSA value can always be lowered into one of these options; the register form is the
+/// fallback.
+#[derive(Clone, Debug)]
+pub(crate) enum ResultRegImmShift {
+ Reg(Reg),
+ ImmShift(ImmShift),
+}
+
+//============================================================================
+// Lowering: convert instruction inputs to forms that we can use.
+
+/// Lower an instruction input to a 64-bit constant, if possible.
+pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> {
+ let input = ctx.get_input(input.insn, input.input);
+ input.constant
+}
+
+/// Lower an instruction input to a constant register-shift amount, if possible.
+pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+) -> Option<ShiftOpShiftImm> {
+ input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
+}
+
+pub(crate) fn const_param_to_u128<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ inst: IRInst,
+) -> Option<u128> {
+ match ctx.get_immediate(inst) {
+ Some(DataValue::V128(bytes)) => Some(u128::from_le_bytes(bytes)),
+ _ => None,
+ }
+}
+
+/// How to handle narrow values loaded into registers; see note on `narrow_mode`
+/// parameter to `put_input_in_*` below.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum NarrowValueMode {
+ None,
+ /// Zero-extend to 32 bits if original is < 32 bits.
+ ZeroExtend32,
+ /// Sign-extend to 32 bits if original is < 32 bits.
+ SignExtend32,
+ /// Zero-extend to 64 bits if original is < 64 bits.
+ ZeroExtend64,
+ /// Sign-extend to 64 bits if original is < 64 bits.
+ SignExtend64,
+}
+
+impl NarrowValueMode {
+ fn is_32bit(&self) -> bool {
+ match self {
+ NarrowValueMode::None => false,
+ NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true,
+ NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false,
+ }
+ }
+}
+
+/// Lower an instruction input to a reg.
+///
+/// The given register will be extended appropriately, according to
+/// `narrow_mode` and the input's type: to 32 bits for the `*Extend32` modes
+/// and to 64 bits for the `*Extend64` modes.
+pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> Reg {
+ debug!("put_input_in_reg: input {:?}", input);
+ let ty = ctx.input_ty(input.insn, input.input);
+ let from_bits = ty_bits(ty) as u8;
+ let inputs = ctx.get_input(input.insn, input.input);
+ let in_reg = if let Some(c) = inputs.constant {
+ // Generate constants fresh at each use to minimize long-range register pressure.
+ let masked = if from_bits < 64 {
+ c & ((1u64 << from_bits) - 1)
+ } else {
+ c
+ };
+ let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+ for inst in Inst::gen_constant(to_reg, masked, ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ })
+ .into_iter()
+ {
+ ctx.emit(inst);
+ }
+ to_reg.to_reg()
+ } else {
+ ctx.use_input_reg(inputs);
+ inputs.reg
+ };
+
+ match (narrow_mode, from_bits) {
+ (NarrowValueMode::None, _) => in_reg,
+ (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rn: in_reg,
+ signed: false,
+ from_bits,
+ to_bits: 32,
+ });
+ tmp.to_reg()
+ }
+ (NarrowValueMode::SignExtend32, n) if n < 32 => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rn: in_reg,
+ signed: true,
+ from_bits,
+ to_bits: 32,
+ });
+ tmp.to_reg()
+ }
+ (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
+
+ (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
+ if inputs.constant.is_some() {
+ // Constants are zero-extended to full 64-bit width on load already.
+ in_reg
+ } else {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rn: in_reg,
+ signed: false,
+ from_bits,
+ to_bits: 64,
+ });
+ tmp.to_reg()
+ }
+ }
+ (NarrowValueMode::SignExtend64, n) if n < 64 => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rn: in_reg,
+ signed: true,
+ from_bits,
+ to_bits: 64,
+ });
+ tmp.to_reg()
+ }
+ (_, 64) => in_reg,
+ (_, 128) => in_reg,
+
+ _ => panic!(
+ "Unsupported input width: input ty {} bits {} mode {:?}",
+ ty, from_bits, narrow_mode
+ ),
+ }
+}
+
+/// Lower an instruction input to a reg or reg/shift operand.
+///
+/// The `narrow_mode` flag indicates whether the consumer of this value needs
+/// the high bits clear. For many operations, such as an add/sub/mul or any
+/// bitwise logical operation, the low-bit results depend only on the low-bit
+/// inputs, so e.g. we can do an 8 bit add on 32 bit registers where the 8-bit
+/// value is stored in the low 8 bits of the register and the high 24 bits are
+/// undefined. If the op truly needs the high N bits clear (such as for a
+/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be
+/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting
+/// register will be provided the extended value.
+fn put_input_in_rs<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> ResultRS {
+ let inputs = ctx.get_input(input.insn, input.input);
+ if let Some((insn, 0)) = inputs.inst {
+ let op = ctx.data(insn).opcode();
+
+ if op == Opcode::Ishl {
+ let shiftee = InsnInput { insn, input: 0 };
+ let shift_amt = InsnInput { insn, input: 1 };
+
+ // Can we get the shift amount as an immediate?
+ if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) {
+ let shiftee_bits = ty_bits(ctx.input_ty(insn, 0));
+ if shiftee_bits <= std::u8::MAX as usize {
+ let shiftimm = shiftimm.mask(shiftee_bits as u8);
+ let reg = put_input_in_reg(ctx, shiftee, narrow_mode);
+ return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm));
+ }
+ }
+ }
+ }
+
+ ResultRS::Reg(put_input_in_reg(ctx, input, narrow_mode))
+}
+
+/// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
+/// This does not actually codegen the source instruction; it just uses the
+/// vreg into which the source instruction will generate its value.
+///
+/// See note on `put_input_in_rs` for a description of `narrow_mode`.
+fn put_input_in_rse<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> ResultRSE {
+ let inputs = ctx.get_input(input.insn, input.input);
+ if let Some((insn, 0)) = inputs.inst {
+ let op = ctx.data(insn).opcode();
+ let out_ty = ctx.output_ty(insn, 0);
+ let out_bits = ty_bits(out_ty);
+
+ // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator?
+ if op == Opcode::Uextend || op == Opcode::Sextend {
+ let sign_extend = op == Opcode::Sextend;
+ let inner_ty = ctx.input_ty(insn, 0);
+ let inner_bits = ty_bits(inner_ty);
+ assert!(inner_bits < out_bits);
+ if match (sign_extend, narrow_mode) {
+ // A single zero-extend or sign-extend is equal to itself.
+ (_, NarrowValueMode::None) => true,
+ // Two zero-extends or sign-extends in a row is equal to a single zero-extend or sign-extend.
+ (false, NarrowValueMode::ZeroExtend32) | (false, NarrowValueMode::ZeroExtend64) => {
+ true
+ }
+ (true, NarrowValueMode::SignExtend32) | (true, NarrowValueMode::SignExtend64) => {
+ true
+ }
+ // A zero-extend and a sign-extend in a row is not equal to a single zero-extend or sign-extend
+ (false, NarrowValueMode::SignExtend32) | (false, NarrowValueMode::SignExtend64) => {
+ false
+ }
+ (true, NarrowValueMode::ZeroExtend32) | (true, NarrowValueMode::ZeroExtend64) => {
+ false
+ }
+ } {
+ let extendop = match (sign_extend, inner_bits) {
+ (true, 8) => ExtendOp::SXTB,
+ (false, 8) => ExtendOp::UXTB,
+ (true, 16) => ExtendOp::SXTH,
+ (false, 16) => ExtendOp::UXTH,
+ (true, 32) => ExtendOp::SXTW,
+ (false, 32) => ExtendOp::UXTW,
+ _ => unreachable!(),
+ };
+ let reg =
+ put_input_in_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None);
+ return ResultRSE::RegExtend(reg, extendop);
+ }
+ }
+
+ // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend,
+ // then get the result into a register and return an Extend-mode operand on
+ // that register.
+ if narrow_mode != NarrowValueMode::None
+ && ((narrow_mode.is_32bit() && out_bits < 32)
+ || (!narrow_mode.is_32bit() && out_bits < 64))
+ {
+ let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+ let extendop = match (narrow_mode, out_bits) {
+ (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => {
+ ExtendOp::SXTB
+ }
+ (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => {
+ ExtendOp::UXTB
+ }
+ (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => {
+ ExtendOp::SXTB
+ }
+ (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => {
+ ExtendOp::UXTB
+ }
+ (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => {
+ ExtendOp::SXTH
+ }
+ (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => {
+ ExtendOp::UXTH
+ }
+ (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW,
+ (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW,
+ _ => unreachable!(),
+ };
+ return ResultRSE::RegExtend(reg, extendop);
+ }
+ }
+
+ ResultRSE::from_rs(put_input_in_rs(ctx, input, narrow_mode))
+}
+
+pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> ResultRSEImm12 {
+ if let Some(imm_value) = input_to_const(ctx, input) {
+ if let Some(i) = Imm12::maybe_from_u64(imm_value) {
+ return ResultRSEImm12::Imm12(i);
+ }
+ }
+
+ ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode))
+}
+
+/// Like `put_input_in_rse_imm12` above, except that it is allowed to negate the
+/// argument (assuming a two's-complement representation with the given bit
+/// width) if doing so allows the use of a 12-bit immediate. Used to flip `add`s
+/// with negative immediates to `sub`s (and vice versa).
+pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ twos_complement_bits: usize,
+ narrow_mode: NarrowValueMode,
+) -> (ResultRSEImm12, bool) {
+ assert!(twos_complement_bits <= 64);
+ if let Some(imm_value) = input_to_const(ctx, input) {
+ if let Some(i) = Imm12::maybe_from_u64(imm_value) {
+ return (ResultRSEImm12::Imm12(i), false);
+ }
+ let sign_extended =
+ ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
+ let inverted = sign_extended.wrapping_neg();
+ if let Some(i) = Imm12::maybe_from_u64(inverted as u64) {
+ return (ResultRSEImm12::Imm12(i), true);
+ }
+ }
+
+ (
+ ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)),
+ false,
+ )
+}
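+// Illustrative sketch, not part of the upstream change: the negation trick in
+// isolation. An add of the 32-bit constant 0xffff_ffff (i.e. -1) has no 12-bit
+// encoding, but negating the sign-extended value yields 1, which does fit, so
+// the add can be flipped to a sub.
+#[cfg(test)]
+mod imm12_negation_example {
+    use super::*;
+
+    #[test]
+    fn negated_constant_fits_in_imm12() {
+        let imm_value: u64 = 0xffff_ffff;
+        assert!(Imm12::maybe_from_u64(imm_value).is_none());
+        let twos_complement_bits = 32;
+        let sign_extended =
+            ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits);
+        let negated = sign_extended.wrapping_neg();
+        assert_eq!(negated, 1);
+        assert!(Imm12::maybe_from_u64(negated as u64).is_some());
+    }
+}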
+
+pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> ResultRSImmLogic {
+ if let Some(imm_value) = input_to_const(ctx, input) {
+ let ty = ctx.input_ty(input.insn, input.input);
+ let ty = if ty_bits(ty) < 32 { I32 } else { ty };
+ if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) {
+ return ResultRSImmLogic::ImmLogic(i);
+ }
+ }
+
+ ResultRSImmLogic::from_rs(put_input_in_rs(ctx, input, narrow_mode))
+}
+
+pub(crate) fn put_input_in_reg_immshift<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ shift_width_bits: usize,
+) -> ResultRegImmShift {
+ if let Some(imm_value) = input_to_const(ctx, input) {
+ let imm_value = imm_value & ((shift_width_bits - 1) as u64);
+ if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) {
+ return ResultRegImmShift::ImmShift(immshift);
+ }
+ }
+
+ ResultRegImmShift::Reg(put_input_in_reg(ctx, input, NarrowValueMode::None))
+}
+
+//============================================================================
+// ALU instruction constructors.
+
+pub(crate) fn alu_inst_imm12(op: ALUOp, rd: Writable<Reg>, rn: Reg, rm: ResultRSEImm12) -> Inst {
+ match rm {
+ ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 {
+ alu_op: op,
+ rd,
+ rn,
+ imm12,
+ },
+ ResultRSEImm12::Reg(rm) => Inst::AluRRR {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ },
+ ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ shiftop,
+ },
+ ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ extendop,
+ },
+ }
+}
+
+pub(crate) fn alu_inst_immlogic(
+ op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: ResultRSImmLogic,
+) -> Inst {
+ match rm {
+ ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic {
+ alu_op: op,
+ rd,
+ rn,
+ imml,
+ },
+ ResultRSImmLogic::Reg(rm) => Inst::AluRRR {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ },
+ ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ shiftop,
+ },
+ }
+}
+
+pub(crate) fn alu_inst_immshift(
+ op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: ResultRegImmShift,
+) -> Inst {
+ match rm {
+ ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift {
+ alu_op: op,
+ rd,
+ rn,
+ immshift,
+ },
+ ResultRegImmShift::Reg(rm) => Inst::AluRRR {
+ alu_op: op,
+ rd,
+ rn,
+ rm,
+ },
+ }
+}
+
+//============================================================================
+// Lowering: addressing mode support. Takes instruction directly, rather
+// than an `InsnInput`, to do more introspection.
+
+/// 32-bit addends that make up an address: an input, and an extension mode on that
+/// input.
+type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
+/// 64-bit addends that make up an address: just an input.
+type AddressAddend64List = SmallVec<[Reg; 4]>;
+
+/// Collect all addends that feed into an address computation, with extend-modes
+/// on each. Note that a load/store may have multiple address components (and
+/// the CLIF semantics are that these components are added to form the final
+/// address), but sometimes the CLIF that we receive still has arguments that
+/// refer to `iadd` instructions. We also want to handle uextend/sextend below
+/// the add(s).
+///
+/// We match any 64-bit add (and descend into its inputs), and we match any
+/// 32-to-64-bit sign or zero extension. The returned addend lists encode how to
+/// extend each input:
+///
+/// - an entry in the 64-bit list is used as-is; no extension is needed;
+/// - an entry in the 32-bit list carries an `ExtendOp` (`UXTW` for a
+///   zero-extension, `SXTW` for a sign-extension) that says how to widen the
+///   associated 32-bit input to 64 bits.
+///
+/// We do not descend further into the inputs of extensions (unless it is a constant),
+/// because supporting (e.g.) a 32-bit add that is later extended would require
+/// additional masking of high-order bits, which is too complex. So, in essence, we
+/// descend any number of adds from the roots, collecting all 64-bit address addends;
+/// then possibly support extensions at these leaves.
+fn collect_address_addends<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ roots: &[InsnInput],
+) -> (AddressAddend64List, AddressAddend32List, i64) {
+ let mut result32: AddressAddend32List = SmallVec::new();
+ let mut result64: AddressAddend64List = SmallVec::new();
+ let mut offset: i64 = 0;
+
+ let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();
+
+ while let Some(input) = workqueue.pop() {
+ debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
+ if let Some((op, insn)) = maybe_input_insn_multi(
+ ctx,
+ input,
+ &[
+ Opcode::Uextend,
+ Opcode::Sextend,
+ Opcode::Iadd,
+ Opcode::Iconst,
+ ],
+ ) {
+ match op {
+ Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
+ let extendop = if op == Opcode::Uextend {
+ ExtendOp::UXTW
+ } else {
+ ExtendOp::SXTW
+ };
+ let extendee_input = InsnInput { insn, input: 0 };
+ // If the input is a zero-extension of a constant, add the value to the known
+ // offset.
+ // Only do this for zero-extension, as generating a sign-extended
+ // constant may be more instructions than using the 'SXTW' addressing mode.
+ if let (Some(insn), ExtendOp::UXTW) = (
+ maybe_input_insn(ctx, extendee_input, Opcode::Iconst),
+ extendop,
+ ) {
+ let value = (ctx.get_constant(insn).unwrap() & 0xFFFF_FFFF_u64) as i64;
+ offset += value;
+ } else {
+ let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
+ result32.push((reg, extendop));
+ }
+ }
+ Opcode::Uextend | Opcode::Sextend => {
+ let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+ result64.push(reg);
+ }
+ Opcode::Iadd => {
+ for input in 0..ctx.num_inputs(insn) {
+ let addend = InsnInput { insn, input };
+ workqueue.push(addend);
+ }
+ }
+ Opcode::Iconst => {
+ let value: i64 = ctx.get_constant(insn).unwrap() as i64;
+ offset += value;
+ }
+ _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
+ }
+ } else {
+ let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
+ result64.push(reg);
+ }
+ }
+
+ (result64, result32, offset)
+}
+
+/// Lower the address of a load or store.
+pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ elem_ty: Type,
+ roots: &[InsnInput],
+ offset: i32,
+) -> AMode {
+ // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
+ // mul instructions (Load/StoreComplex don't include scale factors).
+
+ // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
+ // extends and addition ops. We update these as we consume address
+ // components, so they represent the remaining addends not yet handled.
+ let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+ let mut offset = args_offset + (offset as i64);
+
+ trace!(
+ "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
+ addends64,
+ addends32,
+ offset
+ );
+
+ // First, decide what the `AMode` will be. Take one extendee and one 64-bit
+ // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension,
+ // or some other combination as appropriate.
+ let memarg = if addends64.len() > 0 {
+ if addends32.len() > 0 {
+ let (reg32, extendop) = addends32.pop().unwrap();
+ let reg64 = addends64.pop().unwrap();
+ AMode::RegExtended(reg64, reg32, extendop)
+ } else if offset > 0 && offset < 0x1000 {
+ let reg64 = addends64.pop().unwrap();
+ let off = offset;
+ offset = 0;
+ AMode::RegOffset(reg64, off, elem_ty)
+ } else if addends64.len() >= 2 {
+ let reg1 = addends64.pop().unwrap();
+ let reg2 = addends64.pop().unwrap();
+ AMode::RegReg(reg1, reg2)
+ } else {
+ let reg1 = addends64.pop().unwrap();
+ AMode::reg(reg1)
+ }
+ } else
+ /* addends64.len() == 0 */
+ {
+ if addends32.len() > 0 {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+ let (reg1, extendop) = addends32.pop().unwrap();
+ let signed = match extendop {
+ ExtendOp::SXTW => true,
+ ExtendOp::UXTW => false,
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rn: reg1,
+ signed,
+ from_bits: 32,
+ to_bits: 64,
+ });
+ if let Some((reg2, extendop)) = addends32.pop() {
+ AMode::RegExtended(tmp.to_reg(), reg2, extendop)
+ } else {
+ AMode::reg(tmp.to_reg())
+ }
+ } else
+ /* addends32.len() == 0 */
+ {
+ let off_reg = ctx.alloc_tmp(RegClass::I64, I64);
+ lower_constant_u64(ctx, off_reg, offset as u64);
+ offset = 0;
+ AMode::reg(off_reg.to_reg())
+ }
+ };
+
+ // At this point, if we have any remaining components, we need to allocate a
+ // temp, replace one of the registers in the AMode with the temp, and emit
+ // instructions to add together the remaining components. Return immediately
+ // if this is *not* the case.
+ if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
+ return memarg;
+ }
+
+ // Allocate the temp and shoehorn it into the AMode.
+ let addr = ctx.alloc_tmp(RegClass::I64, I64);
+ let (reg, memarg) = match memarg {
+ AMode::RegExtended(r1, r2, extendop) => {
+ (r1, AMode::RegExtended(addr.to_reg(), r2, extendop))
+ }
+ AMode::RegOffset(r, off, ty) => (r, AMode::RegOffset(addr.to_reg(), off, ty)),
+ AMode::RegReg(r1, r2) => (r2, AMode::RegReg(addr.to_reg(), r1)),
+ AMode::UnsignedOffset(r, imm) => (r, AMode::UnsignedOffset(addr.to_reg(), imm)),
+ _ => unreachable!(),
+ };
+
+ // If there is any offset, load that first into `addr`, and add the `reg`
+ // that we kicked out of the `AMode`; otherwise, start with that reg.
+ if offset != 0 {
+ // If we can fit offset or -offset in an imm12, use an add-imm
+ // to combine the reg and offset. Otherwise, load value first then add.
+ if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::Add64,
+ rd: addr,
+ rn: reg,
+ imm12,
+ });
+ } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub64,
+ rd: addr,
+ rn: reg,
+ imm12,
+ });
+ } else {
+ lower_constant_u64(ctx, addr, offset as u64);
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Add64,
+ rd: addr,
+ rn: addr.to_reg(),
+ rm: reg,
+ });
+ }
+ } else {
+ ctx.emit(Inst::gen_move(addr, reg, I64));
+ }
+
+ // Now handle reg64 and reg32-extended components.
+ for reg in addends64 {
+ // If the register is the stack reg, we must move it to another reg
+ // before adding it.
+ let reg = if reg == stack_reg() {
+ let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+ ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
+ tmp.to_reg()
+ } else {
+ reg
+ };
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Add64,
+ rd: addr,
+ rn: addr.to_reg(),
+ rm: reg,
+ });
+ }
+ for (reg, extendop) in addends32 {
+ assert!(reg != stack_reg());
+ ctx.emit(Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd: addr,
+ rn: addr.to_reg(),
+ rm: reg,
+ extendop,
+ });
+ }
+
+ memarg
+}
+
+pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ value: u64,
+) {
+ for inst in Inst::load_constant(rd, value) {
+ ctx.emit(inst);
+ }
+}
+
+pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ value: f32,
+) {
+ let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+ for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
+ ctx.emit(inst);
+ }
+}
+
+pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ value: f64,
+) {
+ let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+ for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
+ ctx.emit(inst);
+ }
+}
+
+pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ value: u128,
+) {
+ if value == 0 {
+ // Fast-track a common case. The general case, viz. calling `Inst::load_fp_constant128`,
+ // is potentially expensive.
+ ctx.emit(Inst::VecDupImm {
+ rd,
+ imm: ASIMDMovModImm::zero(),
+ invert: false,
+ size: VectorSize::Size8x16,
+ });
+ } else {
+ let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+ for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+ ctx.emit(inst);
+ }
+ }
+}
+
+pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ value: u64,
+ size: VectorSize,
+) {
+ let (value, narrow_size) = match size.lane_size() {
+ ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
+ ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
+ ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
+ ScalarSize::Size64 => (value, ScalarSize::Size32),
+ _ => unreachable!(),
+ };
+ let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
+ Some((value, lane_size)) => (
+ value,
+ VectorSize::from_lane_size(lane_size, size.is_128bits()),
+ ),
+ None => (value, size),
+ };
+ let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+ for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
+ ctx.emit(inst);
+ }
+}
+
+pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
+ match cc {
+ IntCC::Equal => Cond::Eq,
+ IntCC::NotEqual => Cond::Ne,
+ IntCC::SignedGreaterThanOrEqual => Cond::Ge,
+ IntCC::SignedGreaterThan => Cond::Gt,
+ IntCC::SignedLessThanOrEqual => Cond::Le,
+ IntCC::SignedLessThan => Cond::Lt,
+ IntCC::UnsignedGreaterThanOrEqual => Cond::Hs,
+ IntCC::UnsignedGreaterThan => Cond::Hi,
+ IntCC::UnsignedLessThanOrEqual => Cond::Ls,
+ IntCC::UnsignedLessThan => Cond::Lo,
+ IntCC::Overflow => Cond::Vs,
+ IntCC::NotOverflow => Cond::Vc,
+ }
+}
+
+pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond {
+ // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs.
+ // The FCMP instruction sets the NZCV flags as follows:
+ // - PSTATE.NZCV = 0011 on UN (unordered),
+ //                 0110 on EQ,
+ //                 1000 on LT,
+ //                 0010 on GT.
+ match cc {
+ // EQ | LT | GT. Vc => V clear.
+ FloatCC::Ordered => Cond::Vc,
+ // UN. Vs => V set.
+ FloatCC::Unordered => Cond::Vs,
+ // EQ. Eq => Z set.
+ FloatCC::Equal => Cond::Eq,
+ // UN | LT | GT. Ne => Z clear.
+ FloatCC::NotEqual => Cond::Ne,
+ // LT | GT.
+ FloatCC::OrderedNotEqual => unimplemented!(),
+ // UN | EQ
+ FloatCC::UnorderedOrEqual => unimplemented!(),
+ // LT. Mi => N set.
+ FloatCC::LessThan => Cond::Mi,
+ // LT | EQ. Ls => C clear or Z set.
+ FloatCC::LessThanOrEqual => Cond::Ls,
+ // GT. Gt => Z clear, N = V.
+ FloatCC::GreaterThan => Cond::Gt,
+ // GT | EQ. Ge => N = V.
+ FloatCC::GreaterThanOrEqual => Cond::Ge,
+ // UN | LT
+ FloatCC::UnorderedOrLessThan => unimplemented!(),
+ // UN | LT | EQ
+ FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(),
+ // UN | GT
+ FloatCC::UnorderedOrGreaterThan => unimplemented!(),
+ // UN | GT | EQ
+ FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(),
+ }
+}
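+// Illustrative sketch, not part of the upstream change: two of the mappings
+// above spelled out against the NZCV table.
+#[cfg(test)]
+mod fp_condcode_example {
+    use super::*;
+
+    #[test]
+    fn ordered_and_less_than() {
+        // "Ordered" holds on EQ | LT | GT, i.e. whenever V is clear.
+        assert!(matches!(lower_fp_condcode(FloatCC::Ordered), Cond::Vc));
+        // "LessThan" corresponds to the 1000 row: N set.
+        assert!(matches!(lower_fp_condcode(FloatCC::LessThan), Cond::Mi));
+    }
+}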
+
+pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ rd: Writable<Reg>,
+ mut rn: Reg,
+ mut rm: Reg,
+ ty: Type,
+ cond: Cond,
+) -> CodegenResult<()> {
+ let is_float = match ty {
+ F32X4 | F64X2 => true,
+ _ => false,
+ };
+ let size = VectorSize::from_ty(ty);
+ // 'Less than' operations are implemented by swapping
+ // the order of operands and using the 'greater than'
+ // instructions.
+ // 'Not equal' is implemented with 'equal' and inverting
+ // the result.
+ let (alu_op, swap) = match (is_float, cond) {
+ (false, Cond::Eq) => (VecALUOp::Cmeq, false),
+ (false, Cond::Ne) => (VecALUOp::Cmeq, false),
+ (false, Cond::Ge) => (VecALUOp::Cmge, false),
+ (false, Cond::Gt) => (VecALUOp::Cmgt, false),
+ (false, Cond::Le) => (VecALUOp::Cmge, true),
+ (false, Cond::Lt) => (VecALUOp::Cmgt, true),
+ (false, Cond::Hs) => (VecALUOp::Cmhs, false),
+ (false, Cond::Hi) => (VecALUOp::Cmhi, false),
+ (false, Cond::Ls) => (VecALUOp::Cmhs, true),
+ (false, Cond::Lo) => (VecALUOp::Cmhi, true),
+ (true, Cond::Eq) => (VecALUOp::Fcmeq, false),
+ (true, Cond::Ne) => (VecALUOp::Fcmeq, false),
+ (true, Cond::Mi) => (VecALUOp::Fcmgt, true),
+ (true, Cond::Ls) => (VecALUOp::Fcmge, true),
+ (true, Cond::Ge) => (VecALUOp::Fcmge, false),
+ (true, Cond::Gt) => (VecALUOp::Fcmgt, false),
+ _ => unreachable!(),
+ };
+
+ if swap {
+ std::mem::swap(&mut rn, &mut rm);
+ }
+
+ ctx.emit(Inst::VecRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ size,
+ });
+
+ if cond == Cond::Ne {
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Not,
+ rd,
+ rn: rd.to_reg(),
+ size,
+ });
+ }
+
+ Ok(())
+}
+
+/// Determines whether this condcode interprets inputs as signed or unsigned. See the
+/// documentation for the `icmp` instruction in cranelift-codegen/meta/src/shared/instructions.rs
+/// for further insights into this.
+pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
+ match cc {
+ IntCC::Equal
+ | IntCC::UnsignedGreaterThanOrEqual
+ | IntCC::UnsignedGreaterThan
+ | IntCC::UnsignedLessThanOrEqual
+ | IntCC::UnsignedLessThan
+ | IntCC::NotEqual => false,
+ IntCC::SignedGreaterThanOrEqual
+ | IntCC::SignedGreaterThan
+ | IntCC::SignedLessThanOrEqual
+ | IntCC::SignedLessThan
+ | IntCC::Overflow
+ | IntCC::NotOverflow => true,
+ }
+}
+
+//=============================================================================
+// Helpers for instruction lowering.
+
+pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T {
+ let bits = ty_bits(ty);
+ if bits <= 32 {
+ op32
+ } else if bits == 64 {
+ op64
+ } else {
+ panic!("choose_32_64 on > 64 bits!")
+ }
+}
+
+/// Checks for an instance of `op` feeding the given input.
+pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
+ c: &mut C,
+ input: InsnInput,
+ op: Opcode,
+) -> Option<IRInst> {
+ let inputs = c.get_input(input.insn, input.input);
+ debug!(
+ "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}",
+ input, inputs, op
+ );
+ if let Some((src_inst, _)) = inputs.inst {
+ let data = c.data(src_inst);
+ debug!(" -> input inst {:?}", data);
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ }
+ None
+}
+
+/// Checks for an instance of any one of `ops` feeding the given input.
+pub(crate) fn maybe_input_insn_multi<C: LowerCtx<I = Inst>>(
+ c: &mut C,
+ input: InsnInput,
+ ops: &[Opcode],
+) -> Option<(Opcode, IRInst)> {
+ for &op in ops {
+ if let Some(inst) = maybe_input_insn(c, input, op) {
+ return Some((op, inst));
+ }
+ }
+ None
+}
+
+/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g.,
+/// Bint or a bitcast).
+///
+/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it
+/// a bit more generic.
+pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
+ c: &mut C,
+ input: InsnInput,
+ op: Opcode,
+ conv: Opcode,
+) -> Option<IRInst> {
+ let inputs = c.get_input(input.insn, input.input);
+ if let Some((src_inst, _)) = inputs.inst {
+ let data = c.data(src_inst);
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ if data.opcode() == conv {
+ let inputs = c.get_input(src_inst, 0);
+ if let Some((src_inst, _)) = inputs.inst {
+ let data = c.data(src_inst);
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ }
+ }
+ }
+ None
+}
+
+pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+ is_signed: bool,
+) {
+ debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn);
+ let ty = ctx.input_ty(insn, 0);
+ let bits = ty_bits(ty);
+ let narrow_mode = match (bits <= 32, is_signed) {
+ (true, true) => NarrowValueMode::SignExtend32,
+ (true, false) => NarrowValueMode::ZeroExtend32,
+ (false, true) => NarrowValueMode::SignExtend64,
+ (false, false) => NarrowValueMode::ZeroExtend64,
+ };
+ let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+ let ty = ctx.input_ty(insn, 0);
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
+ debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm);
+ let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
+ let rd = writable_zero_reg();
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+}
+
+pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
+ let ty = ctx.input_ty(insn, 0);
+ let bits = ty_bits(ty);
+ let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ match bits {
+ 32 => {
+ ctx.emit(Inst::FpuCmp32 { rn, rm });
+ }
+ 64 => {
+ ctx.emit(Inst::FpuCmp64 { rn, rm });
+ }
+ _ => panic!("Unknown float size"),
+ }
+}
+
+/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0
+/// / -1 (all-ones) result as expected for bool operations.
+pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+ rd: Writable<Reg>,
+) {
+ // A boolean is 0 / -1; if output width is > 1, negate.
+ if ty_bits(ctx.output_ty(insn, 0)) > 1 {
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Sub64,
+ rd,
+ rn: zero_reg(),
+ rm: rd.to_reg(),
+ });
+ }
+}
+
+//=============================================================================
+// Lowering-backend trait implementation.
+
+impl LowerBackend for AArch64Backend {
+ type MInst = Inst;
+
+ fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
+ lower_inst::lower_insn_to_regs(ctx, ir_inst)
+ }
+
+ fn lower_branch_group<C: LowerCtx<I = Inst>>(
+ &self,
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+ ) -> CodegenResult<()> {
+ lower_inst::lower_branch(ctx, branches, targets, fallthrough)
+ }
+
+ fn maybe_pinned_reg(&self) -> Option<Reg> {
+ Some(xreg(PINNED_REG))
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs
new file mode 100644
index 0000000000..faa89d3b98
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs
@@ -0,0 +1,3409 @@
+//! Lower a single Cranelift instruction into vcode.
+
+use crate::binemit::CodeOffset;
+use crate::ir::condcodes::FloatCC;
+use crate::ir::types::*;
+use crate::ir::Inst as IRInst;
+use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::{CodegenError, CodegenResult};
+
+use crate::isa::aarch64::abi::*;
+use crate::isa::aarch64::inst::*;
+
+use regalloc::{RegClass, Writable};
+
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use core::convert::TryFrom;
+use smallvec::SmallVec;
+
+use super::lower::*;
+
+/// This is target-word-size dependent, and it excludes booleans and reftypes.
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
+ match ty {
+ I8 | I16 | I32 | I64 => true,
+ _ => false,
+ }
+}
+
+/// Actually codegen an instruction's results into registers.
+pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+) -> CodegenResult<()> {
+ let op = ctx.data(insn).opcode();
+ let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
+ .map(|i| InsnInput { insn, input: i })
+ .collect();
+ let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
+ .map(|i| InsnOutput { insn, output: i })
+ .collect();
+ let ty = if outputs.len() > 0 {
+ Some(ctx.output_ty(insn, 0))
+ } else {
+ None
+ };
+
+ match op {
+ Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
+ let value = ctx.get_constant(insn).unwrap();
+ // Sign extend constant if necessary
+ let value = match ty.unwrap() {
+ I8 => (((value as i64) << 56) >> 56) as u64,
+ I16 => (((value as i64) << 48) >> 48) as u64,
+ I32 => (((value as i64) << 32) >> 32) as u64,
+ I64 | R64 => value,
+ ty if ty.is_bool() => value,
+ ty => unreachable!("Unknown type for const: {}", ty),
+ };
+ let rd = get_output_reg(ctx, outputs[0]);
+ lower_constant_u64(ctx, rd, value);
+ }
+ Opcode::F32const => {
+ let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
+ let rd = get_output_reg(ctx, outputs[0]);
+ lower_constant_f32(ctx, rd, value);
+ }
+ Opcode::F64const => {
+ let value = f64::from_bits(ctx.get_constant(insn).unwrap());
+ let rd = get_output_reg(ctx, outputs[0]);
+ lower_constant_f64(ctx, rd, value);
+ }
+ Opcode::Iadd => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let mul_insn =
+ if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) {
+ Some((mul_insn, 0))
+ } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
+ Some((mul_insn, 1))
+ } else {
+ None
+ };
+ // If possible combine mul + add into madd.
+ if let Some((insn, addend_idx)) = mul_insn {
+ let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
+ let rn_input = InsnInput { insn, input: 0 };
+ let rm_input = InsnInput { insn, input: 1 };
+
+ let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
+ let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);
+
+ ctx.emit(Inst::AluRRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ra,
+ });
+ } else {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+ ctx,
+ inputs[1],
+ ty_bits(ty),
+ NarrowValueMode::None,
+ );
+ let alu_op = if !negated {
+ choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+ } else {
+ choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+ };
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+ }
+ } else {
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op: VecALUOp::Add,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+ Opcode::Isub => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
+ ctx,
+ inputs[1],
+ ty_bits(ty),
+ NarrowValueMode::None,
+ );
+ let alu_op = if !negated {
+ choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
+ } else {
+ choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
+ };
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+ } else {
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ ctx.emit(Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op: VecALUOp::Sub,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+ Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
+ // We use the scalar SIMD & FP saturating additions and subtractions
+ // (SQADD / UQADD / SQSUB / UQSUB), which require scalar FP registers.
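+ // The non-vector sequence below is: move both operands into FP/SIMD registers, apply
+ // the saturating op there, then move the 64-bit result back to a general-purpose register.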
+ let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
+ let ty = ty.unwrap();
+ let rd = get_output_reg(ctx, outputs[0]);
+ if !ty.is_vector() {
+ let narrow_mode = if is_signed {
+ NarrowValueMode::SignExtend64
+ } else {
+ NarrowValueMode::ZeroExtend64
+ };
+ let fpu_op = match op {
+ Opcode::UaddSat => FPUOp2::Uqadd64,
+ Opcode::SaddSat => FPUOp2::Sqadd64,
+ Opcode::UsubSat => FPUOp2::Uqsub64,
+ Opcode::SsubSat => FPUOp2::Sqsub64,
+ _ => unreachable!(),
+ };
+ let va = ctx.alloc_tmp(RegClass::V128, I128);
+ let vb = ctx.alloc_tmp(RegClass::V128, I128);
+ let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
+ ctx.emit(Inst::MovToFpu {
+ rd: va,
+ rn: ra,
+ size: ScalarSize::Size64,
+ });
+ ctx.emit(Inst::MovToFpu {
+ rd: vb,
+ rn: rb,
+ size: ScalarSize::Size64,
+ });
+ ctx.emit(Inst::FpuRRR {
+ fpu_op,
+ rd: va,
+ rn: va.to_reg(),
+ rm: vb.to_reg(),
+ });
+ ctx.emit(Inst::MovFromVec {
+ rd,
+ rn: va.to_reg(),
+ idx: 0,
+ size: VectorSize::Size64x2,
+ });
+ } else {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ let alu_op = match op {
+ Opcode::UaddSat => VecALUOp::Uqadd,
+ Opcode::SaddSat => VecALUOp::Sqadd,
+ Opcode::UsubSat => VecALUOp::Uqsub,
+ Opcode::SsubSat => VecALUOp::Sqsub,
+ _ => unreachable!(),
+ };
+
+ ctx.emit(Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Ineg => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let rn = zero_reg();
+ let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
+ let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+ } else {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Neg,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Imul => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
+ ctx.emit(Inst::AluRRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ra: zero_reg(),
+ });
+ } else {
+ if ty == I64X2 {
+ let tmp1 = ctx.alloc_tmp(RegClass::V128, I64X2);
+ let tmp2 = ctx.alloc_tmp(RegClass::V128, I64X2);
+
+ // This I64X2 multiplication is performed with several 32-bit
+ // operations.
+
+ // 64-bit numbers x and y, can be represented as:
+ // x = a + 2^32(b)
+ // y = c + 2^32(d)
+
+ // A 64-bit multiplication is:
+ // x * y = ac + 2^32(ad + bc) + 2^64(bd)
+ // note: the `2^64(bd)` term can be ignored; it lies entirely above bit 63
+ // and so cannot affect the 64-bit result.
+
+ // This sequence implements a I64X2 multiply, where the registers
+ // `rn` and `rm` are split up into 32-bit components:
+ // rn = |d|c|b|a|
+ // rm = |h|g|f|e|
+ //
+ // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
+ //
+ // The sequence is:
+ // rev64 rd.4s, rm.4s
+ // mul rd.4s, rd.4s, rn.4s
+ // xtn tmp1.2s, rn.2d
+ // addp rd.4s, rd.4s, rd.4s
+ // xtn tmp2.2s, rm.2d
+ // shll rd.2d, rd.2s, #32
+ // umlal rd.2d, tmp2.2s, tmp1.2s
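+ //
+ // tmp1 and tmp2 hold the low 32-bit halves of each lane; the final umlal multiplies
+ // them (giving ae and cg) and accumulates into the shifted high-half sums already in rd.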
+
+ // Reverse the 32-bit elements in the 64-bit words.
+ // rd = |g|h|e|f|
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Rev64,
+ rd,
+ rn: rm,
+ size: VectorSize::Size32x4,
+ });
+
+ // Calculate the high half components.
+ // rd = |dg|ch|be|af|
+ //
+ // Note that this 32-bit multiply of the high half
+ // discards the bits that would overflow, same as
+ // if 64-bit operations were used. Also the Shll
+ // below would shift out the overflow bits anyway.
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Mul,
+ rd,
+ rn: rd.to_reg(),
+ rm: rn,
+ size: VectorSize::Size32x4,
+ });
+
+ // Extract the low half components of rn.
+ // tmp1 = |c|a|
+ ctx.emit(Inst::VecMiscNarrow {
+ op: VecMiscNarrowOp::Xtn,
+ rd: tmp1,
+ rn,
+ size: VectorSize::Size32x2,
+ high_half: false,
+ });
+
+ // Sum the respective high half components.
+ // rd = |dg+ch|be+af||dg+ch|be+af|
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Addp,
+ rd: rd,
+ rn: rd.to_reg(),
+ rm: rd.to_reg(),
+ size: VectorSize::Size32x4,
+ });
+
+ // Extract the low half components of rm.
+ // tmp2 = |g|e|
+ ctx.emit(Inst::VecMiscNarrow {
+ op: VecMiscNarrowOp::Xtn,
+ rd: tmp2,
+ rn: rm,
+ size: VectorSize::Size32x2,
+ high_half: false,
+ });
+
+ // Shift the high half components, into the high half.
+ // rd = |dg+ch << 32|be+af << 32|
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Shll,
+ rd,
+ rn: rd.to_reg(),
+ size: VectorSize::Size32x2,
+ });
+
+ // Multiply the low components together, and accumulate with the high
+ // half.
+ // rd = |rd[1] + cg|rd[0] + ae|
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Umlal,
+ rd,
+ rn: tmp2.to_reg(),
+ rm: tmp1.to_reg(),
+ size: VectorSize::Size32x2,
+ });
+ } else {
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Mul,
+ rd,
+ rn,
+ rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+ }
+
+ Opcode::Umulhi | Opcode::Smulhi => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let is_signed = op == Opcode::Smulhi;
+ let input_ty = ctx.input_ty(insn, 0);
+ assert!(ctx.input_ty(insn, 1) == input_ty);
+ assert!(ctx.output_ty(insn, 0) == input_ty);
+
+ match input_ty {
+ I64 => {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let alu_op = if is_signed {
+ ALUOp::SMulH
+ } else {
+ ALUOp::UMulH
+ };
+ ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
+ }
+ I32 | I16 | I8 => {
+ let narrow_mode = if is_signed {
+ NarrowValueMode::SignExtend64
+ } else {
+ NarrowValueMode::ZeroExtend64
+ };
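+ // Extend both operands to 64 bits, form the full product with a 64-bit madd
+ // (zero addend), then shift right by the operand width so the high half ends
+ // up in the low bits of the result.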
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
+ let ra = zero_reg();
+ ctx.emit(Inst::AluRRRR {
+ alu_op: ALUOp3::MAdd64,
+ rd,
+ rn,
+ rm,
+ ra,
+ });
+ let shift_op = if is_signed {
+ ALUOp::Asr64
+ } else {
+ ALUOp::Lsr64
+ };
+ let shift_amt = match input_ty {
+ I32 => 32,
+ I16 => 16,
+ I8 => 8,
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: shift_op,
+ rd,
+ rn: rd.to_reg(),
+ immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
+ });
+ }
+ _ => {
+ panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
+ }
+ }
+ }
+
+ Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
+ let is_signed = match op {
+ Opcode::Udiv | Opcode::Urem => false,
+ Opcode::Sdiv | Opcode::Srem => true,
+ _ => unreachable!(),
+ };
+ let is_rem = match op {
+ Opcode::Udiv | Opcode::Sdiv => false,
+ Opcode::Urem | Opcode::Srem => true,
+ _ => unreachable!(),
+ };
+ let narrow_mode = if is_signed {
+ NarrowValueMode::SignExtend64
+ } else {
+ NarrowValueMode::ZeroExtend64
+ };
+ // TODO: Add SDiv32 to implement 32-bit directly, rather
+ // than extending the input.
+ let div_op = if is_signed {
+ ALUOp::SDiv64
+ } else {
+ ALUOp::UDiv64
+ };
+
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
+ // The div instruction does not trap on divide by zero or signed overflow
+ // so checks are inserted below.
+ //
+ // div rd, rn, rm
+ ctx.emit(Inst::AluRRR {
+ alu_op: div_op,
+ rd,
+ rn,
+ rm,
+ });
+
+ if is_rem {
+ // Remainder (rn % rm) is implemented as:
+ //
+ // tmp = rn / rm
+ // rd = rn - (tmp*rm)
+ //
+ // use 'rd' for tmp and you have:
+ //
+ // div rd, rn, rm ; rd = rn / rm
+ // cbnz rm, #8 ; branch over trap
+ // udf ; divide by zero
+ // msub rd, rd, rm, rn ; rd = rn - rd * rm
+
+ // Check for divide by 0.
+ let trap_code = TrapCode::IntegerDivisionByZero;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Zero(rm),
+ });
+
+ ctx.emit(Inst::AluRRRR {
+ alu_op: ALUOp3::MSub64,
+ rd: rd,
+ rn: rd.to_reg(),
+ rm: rm,
+ ra: rn,
+ });
+ } else {
+ if div_op == ALUOp::SDiv64 {
+ // cbnz rm, #8
+ // udf ; divide by zero
+ // cmn rm, 1
+ // ccmp rn, 1, #nzcv, eq
+ // b.vc #8
+ // udf ; signed overflow
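+ //
+ // The ccmp compares rn with 1 only when the cmn found rm == -1 (otherwise it loads an
+ // all-clear NZCV); V is then set exactly when rn is the minimum value, so the
+ // conditional trap fires only for min_value / -1.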
+
+ // Check for divide by 0.
+ let trap_code = TrapCode::IntegerDivisionByZero;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Zero(rm),
+ });
+
+ // Check for signed overflow. The only case is min_value / -1.
+ let ty = ty.unwrap();
+ // The following checks must be done in 32-bit or 64-bit, depending
+ // on the input type, even though the initial div instruction is
+ // currently always done in 64-bit.
+ let size = OperandSize::from_ty(ty);
+ // Check RHS is -1.
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
+ rd: writable_zero_reg(),
+ rn: rm,
+ imm12: Imm12::maybe_from_u64(1).unwrap(),
+ });
+ // Check LHS is min_value, by subtracting 1 and branching if
+ // there is overflow.
+ ctx.emit(Inst::CCmpImm {
+ size,
+ rn,
+ imm: UImm5::maybe_from_u8(1).unwrap(),
+ nzcv: NZCV::new(false, false, false, false),
+ cond: Cond::Eq,
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(Cond::Vs),
+ });
+ } else {
+ // cbnz rm, #8
+ // udf ; divide by zero
+
+ // Check for divide by 0.
+ let trap_code = TrapCode::IntegerDivisionByZero;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Zero(rm),
+ });
+ }
+ }
+ }
+
+ Opcode::Uextend | Opcode::Sextend => {
+ let output_ty = ty.unwrap();
+ let input_ty = ctx.input_ty(insn, 0);
+ let from_bits = ty_bits(input_ty) as u8;
+ let to_bits = ty_bits(output_ty) as u8;
+ let to_bits = std::cmp::max(32, to_bits);
+ assert!(from_bits <= to_bits);
+ if from_bits < to_bits {
+ let signed = op == Opcode::Sextend;
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
+ let idx =
+ if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ let input = InsnInput {
+ insn: extract_insn,
+ input: 0,
+ };
+ let rn = put_input_in_reg(ctx, input, NarrowValueMode::None);
+ let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));
+
+ if signed {
+ let scalar_size = OperandSize::from_ty(output_ty);
+
+ ctx.emit(Inst::MovFromVecSigned {
+ rd,
+ rn,
+ idx,
+ size,
+ scalar_size,
+ });
+ } else {
+ ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
+ }
+ } else {
+ // If we reach this point, we weren't able to incorporate the extend as
+ // a register-mode on another instruction, so we have a 'None'
+ // narrow-value/extend mode here, and we emit the explicit instruction.
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::Extend {
+ rd,
+ rn,
+ signed,
+ from_bits,
+ to_bits,
+ });
+ }
+ }
+ }
+
+ Opcode::Bnot => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
+ let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
+ // NOT rd, rm ==> ORR_NOT rd, zero, rm
+ ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
+ } else {
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Not,
+ rd,
+ rn: rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor
+ | Opcode::BandNot
+ | Opcode::BorNot
+ | Opcode::BxorNot => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
+ let alu_op = match op {
+ Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
+ Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
+ Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
+ Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
+ Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
+ Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
+ _ => unreachable!(),
+ };
+ ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
+ } else {
+ let alu_op = match op {
+ Opcode::Band => VecALUOp::And,
+ Opcode::BandNot => VecALUOp::Bic,
+ Opcode::Bor => VecALUOp::Orr,
+ Opcode::Bxor => VecALUOp::Eor,
+ _ => unreachable!(),
+ };
+
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ ctx.emit(Inst::VecRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
+ let ty = ty.unwrap();
+ let rd = get_output_reg(ctx, outputs[0]);
+ if !ty.is_vector() {
+ let size = OperandSize::from_bits(ty_bits(ty));
+ let narrow_mode = match (op, size) {
+ (Opcode::Ishl, _) => NarrowValueMode::None,
+ (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
+ (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
+ (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
+ (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
+ _ => unreachable!(),
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
+ let alu_op = match op {
+ Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
+ Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
+ Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
+ _ => unreachable!(),
+ };
+ ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
+ } else {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let size = VectorSize::from_ty(ty);
+ let (alu_op, is_right_shift) = match op {
+ Opcode::Ishl => (VecALUOp::Sshl, false),
+ Opcode::Ushr => (VecALUOp::Ushl, true),
+ Opcode::Sshr => (VecALUOp::Sshl, true),
+ _ => unreachable!(),
+ };
+
+ let rm = if is_right_shift {
+ // Right shifts are implemented with a negative left shift.
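+ // sshl/ushl shift right when the per-lane shift amount is negative, so we
+ // negate the requested amount before duplicating it across the lanes.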
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+ let rn = zero_reg();
+ ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp, rn, rm));
+ tmp.to_reg()
+ } else {
+ put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
+ };
+
+ ctx.emit(Inst::VecDup { rd, rn: rm, size });
+
+ ctx.emit(Inst::VecRRR {
+ alu_op,
+ rd,
+ rn,
+ rm: rd.to_reg(),
+ size,
+ });
+ }
+ }
+
+ Opcode::Rotr | Opcode::Rotl => {
+ // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
+ // effectively a right rotation of N - K places, where N is the integer's bit size. We
+ // implement left rotations with this trick.
+ //
+ // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
+ //
+ // For a < 32-bit rotate-right, we synthesize this as:
+ //
+ // rotr rd, rn, rm
+ //
+ // =>
+ //
+ // zero-extend rn, <32-or-64>
+ // and tmp_masked_rm, rm, <bitwidth - 1>
+ // sub tmp1, tmp_masked_rm, <bitwidth>
+ // sub tmp1, zero, tmp1 ; neg
+ // lsr tmp2, rn, tmp_masked_rm
+ // lsl rd, rn, tmp1
+ // orr rd, rd, tmp2
+ //
+ // For a constant amount, we can instead do:
+ //
+ // zero-extend rn, <32-or-64>
+ // lsr tmp2, rn, #<shiftimm>
+ // lsl rd, rn, <bitwidth - shiftimm>
+ // orr rd, rd, tmp2
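+ //
+ // For example, an 8-bit rotl by 3 is lowered as an 8-bit rotr by 5.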
+
+ let is_rotl = op == Opcode::Rotl;
+
+ let ty = ty.unwrap();
+ let ty_bits_size = ty_bits(ty) as u8;
+
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(
+ ctx,
+ inputs[0],
+ if ty_bits_size <= 32 {
+ NarrowValueMode::ZeroExtend32
+ } else {
+ NarrowValueMode::ZeroExtend64
+ },
+ );
+ let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
+
+ if ty_bits_size == 32 || ty_bits_size == 64 {
+ let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
+ match rm {
+ ResultRegImmShift::ImmShift(mut immshift) => {
+ if is_rotl {
+ immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
+ }
+ immshift.imm &= ty_bits_size - 1;
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op,
+ rd,
+ rn,
+ immshift,
+ });
+ }
+
+ ResultRegImmShift::Reg(rm) => {
+ let rm = if is_rotl {
+ // Really ty_bits_size - rm (the rotate amount), but the upper bits of the result are
+ // ignored (because of the implicit masking done by the instruction),
+ // so this is equivalent to negating the input.
+ let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+ let tmp = ctx.alloc_tmp(RegClass::I64, ty);
+ ctx.emit(Inst::AluRRR {
+ alu_op,
+ rd: tmp,
+ rn: zero_reg(),
+ rm,
+ });
+ tmp.to_reg()
+ } else {
+ rm
+ };
+ ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
+ }
+ }
+ } else {
+ debug_assert!(ty_bits_size < 32);
+
+ match rm {
+ ResultRegImmShift::Reg(reg) => {
+ let reg = if is_rotl {
+ // Really ty_bits_size - reg (the rotate amount), but the upper bits of the result are
+ // ignored (because of the implicit masking done by the instruction),
+ // so this is equivalent to negating the input.
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Sub32,
+ rd: tmp,
+ rn: zero_reg(),
+ rm: reg,
+ });
+ tmp.to_reg()
+ } else {
+ reg
+ };
+
+ // Explicitly mask the rotation count.
+ let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And32,
+ rd: tmp_masked_rm,
+ rn: reg,
+ imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
+ });
+ let tmp_masked_rm = tmp_masked_rm.to_reg();
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub32,
+ rd: tmp1,
+ rn: tmp_masked_rm,
+ imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
+ });
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Sub32,
+ rd: tmp1,
+ rn: zero_reg(),
+ rm: tmp1.to_reg(),
+ });
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Lsr32,
+ rd: tmp2,
+ rn,
+ rm: tmp_masked_rm,
+ });
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Lsl32,
+ rd,
+ rn,
+ rm: tmp1.to_reg(),
+ });
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Orr32,
+ rd,
+ rn: rd.to_reg(),
+ rm: tmp2.to_reg(),
+ });
+ }
+
+ ResultRegImmShift::ImmShift(mut immshift) => {
+ if is_rotl {
+ immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
+ }
+ immshift.imm &= ty_bits_size - 1;
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr32,
+ rd: tmp1,
+ rn,
+ immshift: immshift.clone(),
+ });
+
+ let amount = immshift.value() & (ty_bits_size - 1);
+ let opp_shift =
+ ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl32,
+ rd,
+ rn,
+ immshift: opp_shift,
+ });
+
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Orr32,
+ rd,
+ rn: rd.to_reg(),
+ rm: tmp1.to_reg(),
+ });
+ }
+ }
+ }
+ }
+
+ Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let needs_zext = match op {
+ Opcode::Bitrev | Opcode::Ctz => false,
+ Opcode::Clz | Opcode::Cls => true,
+ _ => unreachable!(),
+ };
+ let ty = ty.unwrap();
+ let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
+ NarrowValueMode::ZeroExtend64
+ } else if needs_zext {
+ NarrowValueMode::ZeroExtend32
+ } else {
+ NarrowValueMode::None
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let op_ty = match ty {
+ I8 | I16 | I32 => I32,
+ I64 => I64,
+ _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
+ };
+ let bitop = match op {
+ Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
+ Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::BitRR { rd, rn, op: bitop });
+
+ // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
+ // to a clz, and bitrev as the main operation.
+ if op == Opcode::Bitrev || op == Opcode::Ctz {
+ // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
+ // the reversed result in the highest n bits, so we need to shift them down into
+ // place.
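+ // E.g. reversing an I8 with the 32-bit rbit leaves the result in bits 31..24,
+ // so we shift right by 24.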
+ let right_shift = match ty {
+ I8 => Some(24),
+ I16 => Some(16),
+ I32 => None,
+ I64 => None,
+ _ => panic!("Unsupported type for Bitrev"),
+ };
+ if let Some(s) = right_shift {
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr32,
+ rd,
+ rn: rd.to_reg(),
+ immshift: ImmShift::maybe_from_u64(s).unwrap(),
+ });
+ }
+ }
+
+ if op == Opcode::Ctz {
+ ctx.emit(Inst::BitRR {
+ op: BitOp::from((Opcode::Clz, op_ty)),
+ rd,
+ rn: rd.to_reg(),
+ });
+ }
+ }
+
+ Opcode::Popcnt => {
+ // Lower popcount using the following algorithm:
+ //
+ // x -= (x >> 1) & 0x5555555555555555
+ // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
+ // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
+ // x += x << 8
+ // x += x << 16
+ // x += x << 32
+ // x >> 56
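+ //
+ // This is the classic SWAR popcount: after the first three steps each byte holds its
+ // own bit count, the chained shifted adds accumulate all byte counts into the top
+ // byte, and the final shift by 56 extracts it.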
+ let ty = ty.unwrap();
+ let rd = get_output_reg(ctx, outputs[0]);
+ // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
+ // and fix the sequence below to work properly for this.
+ let narrow_mode = NarrowValueMode::ZeroExtend64;
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+ // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
+ // the rest of the code is identical to the 64-bit version.
+ // lsr [wx]d, [wx]n, #1
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
+ rd: rd,
+ rn: rn,
+ immshift: ImmShift::maybe_from_u64(1).unwrap(),
+ });
+
+ // and xd, xd, #0x5555555555555555
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd: rd,
+ rn: rd.to_reg(),
+ imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(),
+ });
+
+ // sub xd, xn, xd
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Sub64,
+ rd: rd,
+ rn: rn,
+ rm: rd.to_reg(),
+ });
+
+ // and xt, xd, #0x3333333333333333
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd: tmp,
+ rn: rd.to_reg(),
+ imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
+ });
+
+ // lsr xd, xd, #2
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr64,
+ rd: rd,
+ rn: rd.to_reg(),
+ immshift: ImmShift::maybe_from_u64(2).unwrap(),
+ });
+
+ // and xd, xd, #0x3333333333333333
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd: rd,
+ rn: rd.to_reg(),
+ imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
+ });
+
+ // add xt, xd, xt
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: rd.to_reg(),
+ rm: tmp.to_reg(),
+ });
+
+ // add xt, xt, xt, LSR #4
+ ctx.emit(Inst::AluRRRShift {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ rm: tmp.to_reg(),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSR,
+ ShiftOpShiftImm::maybe_from_shift(4).unwrap(),
+ ),
+ });
+
+ // and xt, xt, #0x0f0f0f0f0f0f0f0f
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(),
+ });
+
+ // add xt, xt, xt, LSL #8
+ ctx.emit(Inst::AluRRRShift {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ rm: tmp.to_reg(),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(8).unwrap(),
+ ),
+ });
+
+ // add xt, xt, xt, LSL #16
+ ctx.emit(Inst::AluRRRShift {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ rm: tmp.to_reg(),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(16).unwrap(),
+ ),
+ });
+
+ // add xt, xt, xt, LSL #32
+ ctx.emit(Inst::AluRRRShift {
+ alu_op: ALUOp::Add64,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ rm: tmp.to_reg(),
+ shiftop: ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(32).unwrap(),
+ ),
+ });
+
+ // lsr xd, xt, #56
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsr64,
+ rd: rd,
+ rn: tmp.to_reg(),
+ immshift: ImmShift::maybe_from_u64(56).unwrap(),
+ });
+ }
+
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32
+ | Opcode::LoadComplex
+ | Opcode::Uload8Complex
+ | Opcode::Sload8Complex
+ | Opcode::Uload16Complex
+ | Opcode::Sload16Complex
+ | Opcode::Uload32Complex
+ | Opcode::Sload32Complex
+ | Opcode::Sload8x8
+ | Opcode::Uload8x8
+ | Opcode::Sload16x4
+ | Opcode::Uload16x4
+ | Opcode::Sload32x2
+ | Opcode::Uload32x2 => {
+ let off = ctx.data(insn).load_store_offset().unwrap();
+ let elem_ty = match op {
+ Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
+ I8
+ }
+ Opcode::Sload16
+ | Opcode::Uload16
+ | Opcode::Sload16Complex
+ | Opcode::Uload16Complex => I16,
+ Opcode::Sload32
+ | Opcode::Uload32
+ | Opcode::Sload32Complex
+ | Opcode::Uload32Complex => I32,
+ Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
+ Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8,
+ Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4,
+ Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2,
+ _ => unreachable!(),
+ };
+ let sign_extend = match op {
+ Opcode::Sload8
+ | Opcode::Sload8Complex
+ | Opcode::Sload16
+ | Opcode::Sload16Complex
+ | Opcode::Sload32
+ | Opcode::Sload32Complex => true,
+ _ => false,
+ };
+ let is_float = ty_has_float_or_vec_representation(elem_ty);
+
+ let mem = lower_address(ctx, elem_ty, &inputs[..], off);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let flags = ctx
+ .memflags(insn)
+ .expect("Load instruction should have memflags");
+
+ ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
+ (1, _, _) => Inst::ULoad8 { rd, mem, flags },
+ (8, false, _) => Inst::ULoad8 { rd, mem, flags },
+ (8, true, _) => Inst::SLoad8 { rd, mem, flags },
+ (16, false, _) => Inst::ULoad16 { rd, mem, flags },
+ (16, true, _) => Inst::SLoad16 { rd, mem, flags },
+ (32, false, false) => Inst::ULoad32 { rd, mem, flags },
+ (32, true, false) => Inst::SLoad32 { rd, mem, flags },
+ (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
+ (64, _, false) => Inst::ULoad64 { rd, mem, flags },
+ // Note that we treat some of the vector loads as scalar floating-point loads,
+ // which is correct in a little endian environment.
+ (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
+ (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
+ _ => panic!("Unsupported size in load"),
+ });
+
+ let vec_extend = match op {
+ Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
+ Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
+ Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
+ Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
+ Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
+ Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
+ _ => None,
+ };
+
+ if let Some(t) = vec_extend {
+ ctx.emit(Inst::VecExtend {
+ t,
+ rd,
+ rn: rd.to_reg(),
+ high_half: false,
+ });
+ }
+ }
+
+ Opcode::LoadSplat => {
+ let off = ctx.data(insn).load_store_offset().unwrap();
+ let ty = ty.unwrap();
+ let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let size = VectorSize::from_ty(ty);
+ let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+ ctx.emit(Inst::LoadAddr { rd: tmp, mem });
+ ctx.emit(Inst::VecLoadReplicate {
+ rd,
+ rn: tmp.to_reg(),
+ size,
+ });
+ }
+
+ Opcode::Store
+ | Opcode::Istore8
+ | Opcode::Istore16
+ | Opcode::Istore32
+ | Opcode::StoreComplex
+ | Opcode::Istore8Complex
+ | Opcode::Istore16Complex
+ | Opcode::Istore32Complex => {
+ let off = ctx.data(insn).load_store_offset().unwrap();
+ let elem_ty = match op {
+ Opcode::Istore8 | Opcode::Istore8Complex => I8,
+ Opcode::Istore16 | Opcode::Istore16Complex => I16,
+ Opcode::Istore32 | Opcode::Istore32Complex => I32,
+ Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
+ _ => unreachable!(),
+ };
+ let is_float = ty_has_float_or_vec_representation(elem_ty);
+ let flags = ctx
+ .memflags(insn)
+ .expect("Store instruction should have memflags");
+
+ let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
+ let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+
+ ctx.emit(match (ty_bits(elem_ty), is_float) {
+ (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
+ (16, _) => Inst::Store16 { rd, mem, flags },
+ (32, false) => Inst::Store32 { rd, mem, flags },
+ (32, true) => Inst::FpuStore32 { rd, mem, flags },
+ (64, false) => Inst::Store64 { rd, mem, flags },
+ (64, true) => Inst::FpuStore64 { rd, mem, flags },
+ (128, _) => Inst::FpuStore128 { rd, mem, flags },
+ _ => panic!("Unsupported size in store"),
+ });
+ }
+
+ Opcode::StackAddr => {
+ let (stack_slot, offset) = match *ctx.data(insn) {
+ InstructionData::StackLoad {
+ opcode: Opcode::StackAddr,
+ stack_slot,
+ offset,
+ } => (stack_slot, offset),
+ _ => unreachable!(),
+ };
+ let rd = get_output_reg(ctx, outputs[0]);
+ let offset: i32 = offset.into();
+ let inst = ctx
+ .abi()
+ .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
+ ctx.emit(inst);
+ }
+
+ Opcode::AtomicRmw => {
+ let r_dst = get_output_reg(ctx, outputs[0]);
+ let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+ // Make sure that both args are in virtual regs, since in effect
+ // we have to do a parallel copy to get them safely to the AtomicRMW input
+ // regs, and that's not guaranteed safe if either is in a real reg.
+ r_addr = ctx.ensure_in_vreg(r_addr, I64);
+ r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
+ // Move the args to the preordained AtomicRMW input regs
+ ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
+ ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
+ // Now the AtomicRMW insn itself
+ let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
+ ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
+ // And finally, copy the preordained AtomicRMW output reg to its destination.
+ ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
+ // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+ }
+
+ Opcode::AtomicCas => {
+ // This is very similar to, but not identical to, the AtomicRmw case. Note
+ // that the AtomicCAS sequence does its own masking, so we don't need to worry
+ // about zero-extending narrow (I8/I16/I32) values here.
+ let r_dst = get_output_reg(ctx, outputs[0]);
+ let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+ // Make sure that all three args are in virtual regs. See corresponding comment
+ // for `Opcode::AtomicRmw` above.
+ r_addr = ctx.ensure_in_vreg(r_addr, I64);
+ r_expected = ctx.ensure_in_vreg(r_expected, I64);
+ r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
+ // Move the args to the preordained AtomicCAS input regs
+ ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(xreg(26)),
+ r_expected,
+ I64,
+ ));
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(xreg(28)),
+ r_replacement,
+ I64,
+ ));
+ // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop
+ ctx.emit(Inst::AtomicCAS { ty: ty_access });
+ // And finally, copy the preordained AtomicCAS output reg to its destination.
+ ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
+ // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+ }
+
+ Opcode::AtomicLoad => {
+ let r_data = get_output_reg(ctx, outputs[0]);
+ let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+ ctx.emit(Inst::AtomicLoad {
+ ty: ty_access,
+ r_data,
+ r_addr,
+ });
+ }
+
+ Opcode::AtomicStore => {
+ let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty_access = ctx.input_ty(insn, 0);
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+ ctx.emit(Inst::AtomicStore {
+ ty: ty_access,
+ r_data,
+ r_addr,
+ });
+ }
+
+ Opcode::Fence => {
+ ctx.emit(Inst::Fence {});
+ }
+
+ Opcode::StackLoad | Opcode::StackStore => {
+ panic!("Direct stack memory access not supported; should not be used by Wasm");
+ }
+
+ Opcode::HeapAddr => {
+ panic!("heap_addr should have been removed by legalization!");
+ }
+
+ Opcode::TableAddr => {
+ panic!("table_addr should have been removed by legalization!");
+ }
+
+ Opcode::ConstAddr => unimplemented!(),
+
+ Opcode::Nop => {
+ // Nothing.
+ }
+
+ Opcode::Select => {
+ let flag_input = inputs[0];
+ let cond = if let Some(icmp_insn) =
+ maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
+ {
+ let condcode = ctx.data(icmp_insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+ lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
+ cond
+ } else if let Some(fcmp_insn) =
+ maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
+ {
+ let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+ lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
+ cond
+ } else {
+ let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
+ (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
+ } else {
+ (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
+ };
+
+ let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ // cmp rcond, #0
+ ctx.emit(Inst::AluRRR {
+ alu_op: cmp_op,
+ rd: writable_zero_reg(),
+ rn: rcond,
+ rm: zero_reg(),
+ });
+ Cond::Ne
+ };
+
+ // csel.cond rd, rn, rm
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let ty = ctx.output_ty(insn, 0);
+ let bits = ty_bits(ty);
+ let is_float = ty_has_float_or_vec_representation(ty);
+ if is_float && bits == 32 {
+ ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
+ } else if is_float && bits == 64 {
+ ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
+ } else if is_float && bits == 128 {
+ ctx.emit(Inst::VecCSel { cond, rd, rn, rm });
+ } else {
+ ctx.emit(Inst::CSel { cond, rd, rn, rm });
+ }
+ }
+
+ Opcode::Selectif | Opcode::SelectifSpectreGuard => {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+ // Verification ensures that the input is always a
+ // single-def ifcmp.
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+ lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
+
+ // csel.COND rd, rn, rm
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let ty = ctx.output_ty(insn, 0);
+ let bits = ty_bits(ty);
+ let is_float = ty_has_float_or_vec_representation(ty);
+ if is_float && bits == 32 {
+ ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
+ } else if is_float && bits == 64 {
+ ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
+ } else {
+ ctx.emit(Inst::CSel { cond, rd, rn, rm });
+ }
+ }
+
+ Opcode::Bitselect | Opcode::Vselect => {
+ let ty = ty.unwrap();
+ if !ty.is_vector() {
+ debug_assert_ne!(Opcode::Vselect, op);
+ let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ // AND rTmp, rn, rcond
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::And64,
+ rd: tmp,
+ rn,
+ rm: rcond,
+ });
+ // BIC rd, rm, rcond
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::AndNot64,
+ rd,
+ rn: rm,
+ rm: rcond,
+ });
+ // ORR rd, rd, rTmp
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Orr64,
+ rd,
+ rn: rd.to_reg(),
+ rm: tmp.to_reg(),
+ });
+ } else {
+ let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(rd, rcond, ty));
+
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Bsl,
+ rd,
+ rn,
+ rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Trueif => {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+ // Verification ensures that the input is always a
+ // single-def ifcmp.
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+ lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::CSet { rd, cond });
+ normalize_bool_result(ctx, insn, rd);
+ }
+
+ Opcode::Trueff => {
+ let condcode = ctx.data(insn).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+ let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
+ lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::CSet { rd, cond });
+ normalize_bool_result(ctx, insn, rd);
+ }
+
+ Opcode::IsNull | Opcode::IsInvalid => {
+ // Null references are represented by the constant value 0; invalid references are
+ // represented by the constant value -1. See `define_reftypes()` in
+ // `meta/src/isa/x86/encodings.rs` to confirm.
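+ // Either way the Z flag is set exactly when the input matches the sentinel
+ // (subs rn, #0 for null; adds rn, #1 for -1), and cset eq materializes the result.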
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ctx.input_ty(insn, 0);
+ let (alu_op, const_value) = match op {
+ Opcode::IsNull => {
+ // cmp rn, #0
+ (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
+ }
+ Opcode::IsInvalid => {
+ // cmn rn, #1
+ (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
+ }
+ _ => unreachable!(),
+ };
+ let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
+ ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
+ ctx.emit(Inst::CSet { rd, cond: Cond::Eq });
+ normalize_bool_result(ctx, insn, rd);
+ }
+
+ Opcode::Copy => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ctx.input_ty(insn, 0);
+ ctx.emit(Inst::gen_move(rd, rn, ty));
+ }
+
+ Opcode::Breduce | Opcode::Ireduce => {
+ // Smaller integers/booleans are stored with high-order bits
+ // undefined, so we can simply do a copy.
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ ctx.emit(Inst::gen_move(rd, rn, ty));
+ }
+
+ Opcode::Bextend | Opcode::Bmask => {
+ // Bextend and Bmask both simply sign-extend. This works for:
+ // - Bextend, because booleans are stored as 0 / -1, so we
+ // sign-extend the -1 to a -1 in the wider width.
+ // - Bmask, because the resulting integer mask value must be
+ // all-ones (-1) if the argument is true.
+ //
+ // For a sign-extension from a 1-bit value (Case 1 below), we need
+ // to do things a bit specially, because the ISA does not have a
+ // 1-to-N-bit sign extension instruction. For 8-bit or wider
+ // sources (Case 2 below), we do a sign extension normally.
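+ // (Case 1 masks to the LSB first because only the LSB of a 1-bit boolean is
+ // relied upon here; negating the resulting 0 or 1 then yields 0 or the
+ // all-ones mask.)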
+
+ let from_ty = ctx.input_ty(insn, 0);
+ let to_ty = ctx.output_ty(insn, 0);
+ let from_bits = ty_bits(from_ty);
+ let to_bits = ty_bits(to_ty);
+
+ assert!(
+ from_bits <= 64 && to_bits <= 64,
+ "Vector Bextend not supported yet"
+ );
+ assert!(from_bits <= to_bits);
+
+ if from_bits == to_bits {
+ // Nothing.
+ } else if from_bits == 1 {
+ assert!(to_bits >= 8);
+ // Case 1: 1-bit to N-bit extension: AND the LSB of source into
+ // dest, generating a value of 0 or 1, then negate to get
+ // 0x000... or 0xfff...
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ // AND Rdest, Rsource, #1
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op: ALUOp::And64,
+ rd,
+ rn,
+ imml: ImmLogic::maybe_from_u64(1, I64).unwrap(),
+ });
+ // SUB Rdest, XZR, Rdest (i.e., NEG Rdest)
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Sub64,
+ rd,
+ rn: zero_reg(),
+ rm: rd.to_reg(),
+ });
+ } else {
+ // Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A
+ // `true` (all ones, or `-1`) will be extended to -1 with the
+ // larger width.
+ assert!(from_bits >= 8);
+ let narrow_mode = if to_bits == 64 {
+ NarrowValueMode::SignExtend64
+ } else {
+ assert!(to_bits <= 32);
+ NarrowValueMode::SignExtend32
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(rd, rn, to_ty));
+ }
+ }
+
+ Opcode::Bint => {
+ // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND with #1 to
+ // keep only the LSB, giving a 0 / 1-valued integer result.
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let output_bits = ty_bits(ctx.output_ty(insn, 0));
+
+ let (imm_ty, alu_op) = if output_bits > 32 {
+ (I64, ALUOp::And64)
+ } else {
+ (I32, ALUOp::And32)
+ };
+ ctx.emit(Inst::AluRRImmLogic {
+ alu_op,
+ rd,
+ rn,
+ imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
+ });
+ }
+
+ Opcode::Bitcast => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ity = ctx.input_ty(insn, 0);
+ let oty = ctx.output_ty(insn, 0);
+ let ity_vec_reg = ty_has_float_or_vec_representation(ity);
+ let oty_vec_reg = ty_has_float_or_vec_representation(oty);
+ match (ity_vec_reg, oty_vec_reg) {
+ (true, true) => {
+ let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
+ NarrowValueMode::ZeroExtend32
+ } else {
+ NarrowValueMode::ZeroExtend64
+ };
+ let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ ctx.emit(Inst::gen_move(rd, rm, oty));
+ }
+ (false, false) => {
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::gen_move(rd, rm, oty));
+ }
+ (false, true) => {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
+ ctx.emit(Inst::MovToFpu {
+ rd,
+ rn,
+ size: ScalarSize::Size64,
+ });
+ }
+ (true, false) => {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::MovFromVec {
+ rd,
+ rn,
+ idx: 0,
+ size: VectorSize::Size64x2,
+ });
+ }
+ }
+ }
+
+ Opcode::FallthroughReturn | Opcode::Return => {
+ for (i, input) in inputs.iter().enumerate() {
+ // N.B.: according to the AArch64 ABI, the top bits of a register
+ // (above the bits for the value's type) are undefined, so we
+ // need not extend the return values.
+ let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
+ let retval_reg = ctx.retval(i);
+ let ty = ctx.input_ty(insn, i);
+ ctx.emit(Inst::gen_move(retval_reg, reg, ty));
+ }
+ // N.B.: the Ret itself is generated by the ABI.
+ }
+
+ Opcode::Ifcmp | Opcode::Ffcmp => {
+ // An Ifcmp/Ffcmp is never an isel root: it is only ever lowered as part of its use by a
+ // brif/brff or trueif/trueff instruction. This holds as long as the IR uses the
+ // Ifcmp/Ffcmp from the same block or a dominating block; in other words, the flags
+ // cannot pass through a BB param (phi). The flags pass of the verifier ensures this.
+ panic!("Should never reach ifcmp as isel root!");
+ }
+
+ Opcode::Icmp => {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ let bits = ty_bits(ty);
+ let narrow_mode = match (bits <= 32, is_signed) {
+ (true, true) => NarrowValueMode::SignExtend32,
+ (true, false) => NarrowValueMode::ZeroExtend32,
+ (false, true) => NarrowValueMode::SignExtend64,
+ (false, false) => NarrowValueMode::ZeroExtend64,
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+
+ if !ty.is_vector() {
+ let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
+ let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
+ ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
+ ctx.emit(Inst::CSet { cond, rd });
+ normalize_bool_result(ctx, insn, rd);
+ } else {
+ let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
+ lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
+ }
+ }
+
+ Opcode::Fcmp => {
+ let condcode = ctx.data(insn).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+ let ty = ctx.input_ty(insn, 0);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ if !ty.is_vector() {
+ match ty_bits(ty) {
+ 32 => {
+ ctx.emit(Inst::FpuCmp32 { rn, rm });
+ }
+ 64 => {
+ ctx.emit(Inst::FpuCmp64 { rn, rm });
+ }
+ _ => panic!("Bad float size"),
+ }
+ ctx.emit(Inst::CSet { cond, rd });
+ normalize_bool_result(ctx, insn, rd);
+ } else {
+ lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
+ }
+ }
+
+ Opcode::JumpTableEntry | Opcode::JumpTableBase => {
+ panic!("Should not appear: we handle BrTable directly");
+ }
+
+ Opcode::Debugtrap => {
+ ctx.emit(Inst::Brk);
+ }
+
+ Opcode::Trap | Opcode::ResumableTrap => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+ ctx.emit_safepoint(Inst::Udf { trap_code });
+ }
+
+ Opcode::Trapif | Opcode::Trapff => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+
+ let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ // The flags must not have been clobbered by any other
+ // instruction between the iadd_ifcout and this instruction, as
+ // verified by the CLIF validator; so we can simply use the
+ // flags here.
+ cond
+ } else if op == Opcode::Trapif {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+
+ // Verification ensures that the input is always a single-def ifcmp.
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+ lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
+ cond
+ } else {
+ let condcode = ctx.data(insn).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+
+ // Verification ensures that the input is always a
+ // single-def ffcmp.
+ let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
+ lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
+ cond
+ };
+
+ ctx.emit_safepoint(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(cond),
+ });
+ }
+
+ Opcode::Safepoint => {
+ panic!("safepoint instructions not used by new backend's safepoints!");
+ }
+
+ Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
+ panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
+ }
+
+ Opcode::FuncAddr => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let (extname, _) = ctx.call_target(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ rd,
+ name: Box::new(extname),
+ offset: 0,
+ });
+ }
+
+ Opcode::GlobalValue => {
+ panic!("global_value should have been removed by legalization!");
+ }
+
+ Opcode::SymbolValue => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ rd,
+ name: Box::new(extname),
+ offset,
+ });
+ }
+
+ Opcode::Call | Opcode::CallIndirect => {
+ let caller_conv = ctx.abi().call_conv();
+ let (mut abi, inputs) = match op {
+ Opcode::Call => {
+ let (extname, dist) = ctx.call_target(insn).unwrap();
+ let extname = extname.clone();
+ let sig = ctx.call_sig(insn).unwrap();
+ assert!(inputs.len() == sig.params.len());
+ assert!(outputs.len() == sig.returns.len());
+ (
+ AArch64ABICaller::from_func(sig, &extname, dist, caller_conv)?,
+ &inputs[..],
+ )
+ }
+ Opcode::CallIndirect => {
+ let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
+ let sig = ctx.call_sig(insn).unwrap();
+ assert!(inputs.len() - 1 == sig.params.len());
+ assert!(outputs.len() == sig.returns.len());
+ (
+ AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv)?,
+ &inputs[1..],
+ )
+ }
+ _ => unreachable!(),
+ };
+
+ abi.emit_stack_pre_adjust(ctx);
+ assert!(inputs.len() == abi.num_args());
+ for (i, input) in inputs.iter().enumerate() {
+ let arg_reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = get_output_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ abi.emit_stack_post_adjust(ctx);
+ }
+
+ Opcode::GetPinnedReg => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::mov(rd, xreg(PINNED_REG)));
+ }
+
+ Opcode::SetPinnedReg => {
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm));
+ }
+
+ Opcode::Spill
+ | Opcode::Fill
+ | Opcode::FillNop
+ | Opcode::Regmove
+ | Opcode::CopySpecial
+ | Opcode::CopyToSsa
+ | Opcode::CopyNop
+ | Opcode::AdjustSpDown
+ | Opcode::AdjustSpUpImm
+ | Opcode::AdjustSpDownImm
+ | Opcode::IfcmpSp
+ | Opcode::Regspill
+ | Opcode::Regfill => {
+ panic!("Unused opcode should not be encountered.");
+ }
+
+ Opcode::Jump
+ | Opcode::Fallthrough
+ | Opcode::Brz
+ | Opcode::Brnz
+ | Opcode::BrIcmp
+ | Opcode::Brif
+ | Opcode::Brff
+ | Opcode::IndirectJumpTableBr
+ | Opcode::BrTable => {
+ panic!("Branch opcode reached non-branch lowering logic!");
+ }
+
+ Opcode::Vconst => {
+ let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
+ let rd = get_output_reg(ctx, outputs[0]);
+ lower_constant_f128(ctx, rd, value);
+ }
+
+ Opcode::RawBitcast => {
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ ctx.emit(Inst::gen_move(rd, rm, ty));
+ }
+
+ Opcode::Extractlane => {
+ if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
+ let idx = *imm;
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
+ let ty = ty.unwrap();
+
+ if ty_has_int_representation(ty) {
+ ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
+ // Plain moves are faster on some processors.
+ } else if idx == 0 {
+ ctx.emit(Inst::gen_move(rd, rn, ty));
+ } else {
+ ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
+ }
+ } else {
+ unreachable!();
+ }
+ }
+
+ Opcode::Insertlane => {
+ let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ let input_ty = ctx.input_ty(insn, 1);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ let size = VectorSize::from_ty(ty);
+
+ ctx.emit(Inst::gen_move(rd, rm, ty));
+
+ if ty_has_int_representation(input_ty) {
+ ctx.emit(Inst::MovToVec { rd, rn, idx, size });
+ } else {
+ ctx.emit(Inst::VecMovElement {
+ rd,
+ rn,
+ dest_idx: idx,
+ src_idx: 0,
+ size,
+ });
+ }
+ }
+
+ Opcode::Splat => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let size = VectorSize::from_ty(ty.unwrap());
+
+ if let Some((_, insn)) = maybe_input_insn_multi(
+ ctx,
+ inputs[0],
+ &[
+ Opcode::Bconst,
+ Opcode::F32const,
+ Opcode::F64const,
+ Opcode::Iconst,
+ ],
+ ) {
+ lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+ } else if let Some(insn) =
+ maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
+ {
+ lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+ } else if let Some(insn) =
+ maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
+ {
+ lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+ } else {
+ let input_ty = ctx.input_ty(insn, 0);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let inst = if ty_has_int_representation(input_ty) {
+ Inst::VecDup { rd, rn, size }
+ } else {
+ Inst::VecDupFromFpu { rd, rn, size }
+ };
+
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::ScalarToVector => {
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let input_ty = ctx.input_ty(insn, 0);
+ if (input_ty == I32 && ty.unwrap() == I32X4)
+ || (input_ty == I64 && ty.unwrap() == I64X2)
+ {
+ ctx.emit(Inst::MovToFpu {
+ rd,
+ rn,
+ size: ScalarSize::from_ty(input_ty),
+ });
+ } else {
+ return Err(CodegenError::Unsupported(format!(
+ "ScalarToVector: unsupported types {:?} -> {:?}",
+ input_ty, ty
+ )));
+ }
+ }
+
+ Opcode::VanyTrue | Opcode::VallTrue => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap());
+
+ // This operation is implemented by using umaxp or uminv to
+ // create a scalar value, which is then compared against zero.
+ //
+ // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
+ // mov xm, vn.d[0]
+ // cmp xm, #0
+ // cset xm, ne
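+ //
+ // For VanyTrue, umaxp folds the lanes pairwise, so the low 64 bits of the
+ // result are non-zero iff any input lane is non-zero; for VallTrue, uminv
+ // takes the minimum across all lanes, which is non-zero iff every lane is
+ // non-zero.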
+
+ let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
+
+ if op == Opcode::VanyTrue {
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Umaxp,
+ rd: tmp,
+ rn: rm,
+ rm: rm,
+ size,
+ });
+ } else {
+ ctx.emit(Inst::VecLanes {
+ op: VecLanesOp::Uminv,
+ rd: tmp,
+ rn: rm,
+ size,
+ });
+ };
+
+ ctx.emit(Inst::MovFromVec {
+ rd,
+ rn: tmp.to_reg(),
+ idx: 0,
+ size: VectorSize::Size64x2,
+ });
+
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::SubS64,
+ rd: writable_zero_reg(),
+ rn: rd.to_reg(),
+ imm12: Imm12::zero(),
+ });
+
+ ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
+ normalize_bool_result(ctx, insn, rd);
+ }
+
+ Opcode::VhighBits => {
+ let dst_r = get_output_reg(ctx, outputs[0]);
+ let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ctx.input_ty(insn, 0);
+ // All three sequences use one integer temporary and two vector temporaries. The
+ // shift is done early so as to give the register allocator the possibility of using
+ // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
+ // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
+ // derivation of these sequences. Alternative sequences are discussed in
+ // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
+ // used here.
+ // Also, FIXME: when https://github.com/bytecodealliance/wasmtime/pull/2310 is
+ // merged, use `lower_splat_constant` instead to generate the constants.
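+ //
+ // In outline: each lane is arithmetic-shifted right so that it becomes all
+ // ones or all zeros, an AND with a per-lane power-of-two mask leaves each
+ // lane holding only its own bit, and a horizontal add (addv) then collapses
+ // the lanes into the final bitmask in `dst_r`. The I8X16 case additionally
+ // interleaves the two 8-byte halves (ext/zip1) so that the low and high
+ // bytes of the 16-bit mask end up in the right places.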
+ let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64);
+ let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16);
+ let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16);
+ match ty {
+ I8X16 => {
+ // sshr tmp_v1.16b, src_v.16b, #7
+ // mov tmp_r0, #0x0201
+ // movk tmp_r0, #0x0804, lsl 16
+ // movk tmp_r0, #0x2010, lsl 32
+ // movk tmp_r0, #0x8040, lsl 48
+ // dup tmp_v0.2d, tmp_r0
+ // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
+ // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
+ // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
+ // addv tmp_v0h, tmp_v0.8h
+ // mov dst_r, tmp_v0.h[0]
+ ctx.emit(Inst::VecShiftImm {
+ op: VecShiftImmOp::Sshr,
+ rd: tmp_v1,
+ rn: src_v,
+ size: VectorSize::Size8x16,
+ imm: 7,
+ });
+ lower_constant_u64(ctx, tmp_r0, 0x8040201008040201u64);
+ ctx.emit(Inst::VecDup {
+ rd: tmp_v0,
+ rn: tmp_r0.to_reg(),
+ size: VectorSize::Size64x2,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::And,
+ rd: tmp_v1,
+ rn: tmp_v1.to_reg(),
+ rm: tmp_v0.to_reg(),
+ size: VectorSize::Size8x16,
+ });
+ ctx.emit(Inst::VecExtract {
+ rd: tmp_v0,
+ rn: tmp_v1.to_reg(),
+ rm: tmp_v1.to_reg(),
+ imm4: 8,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Zip1,
+ rd: tmp_v0,
+ rn: tmp_v1.to_reg(),
+ rm: tmp_v0.to_reg(),
+ size: VectorSize::Size8x16,
+ });
+ ctx.emit(Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: tmp_v0,
+ rn: tmp_v0.to_reg(),
+ size: VectorSize::Size16x8,
+ });
+ ctx.emit(Inst::MovFromVec {
+ rd: dst_r,
+ rn: tmp_v0.to_reg(),
+ idx: 0,
+ size: VectorSize::Size16x8,
+ });
+ }
+ I16X8 => {
+ // sshr tmp_v1.8h, src_v.8h, #15
+ // mov tmp_r0, #0x1
+ // movk tmp_r0, #0x2, lsl 16
+ // movk tmp_r0, #0x4, lsl 32
+ // movk tmp_r0, #0x8, lsl 48
+ // dup tmp_v0.2d, tmp_r0
+ // shl tmp_r0, tmp_r0, #4
+ // mov tmp_v0.d[1], tmp_r0
+ // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
+ // addv tmp_v0h, tmp_v0.8h
+ // mov dst_r, tmp_v0.h[0]
+ ctx.emit(Inst::VecShiftImm {
+ op: VecShiftImmOp::Sshr,
+ rd: tmp_v1,
+ rn: src_v,
+ size: VectorSize::Size16x8,
+ imm: 15,
+ });
+ lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
+ ctx.emit(Inst::VecDup {
+ rd: tmp_v0,
+ rn: tmp_r0.to_reg(),
+ size: VectorSize::Size64x2,
+ });
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl64,
+ rd: tmp_r0,
+ rn: tmp_r0.to_reg(),
+ immshift: ImmShift { imm: 4 },
+ });
+ ctx.emit(Inst::MovToVec {
+ rd: tmp_v0,
+ rn: tmp_r0.to_reg(),
+ idx: 1,
+ size: VectorSize::Size64x2,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::And,
+ rd: tmp_v0,
+ rn: tmp_v1.to_reg(),
+ rm: tmp_v0.to_reg(),
+ size: VectorSize::Size8x16,
+ });
+ ctx.emit(Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: tmp_v0,
+ rn: tmp_v0.to_reg(),
+ size: VectorSize::Size16x8,
+ });
+ ctx.emit(Inst::MovFromVec {
+ rd: dst_r,
+ rn: tmp_v0.to_reg(),
+ idx: 0,
+ size: VectorSize::Size16x8,
+ });
+ }
+ I32X4 => {
+ // sshr tmp_v1.4s, src_v.4s, #31
+ // mov tmp_r0, #0x1
+ // movk tmp_r0, #0x2, lsl 32
+ // dup tmp_v0.2d, tmp_r0
+ // shl tmp_r0, tmp_r0, #2
+ // mov tmp_v0.d[1], tmp_r0
+ // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
+ // addv tmp_v0s, tmp_v0.4s
+ // mov dst_r, tmp_v0.s[0]
+ ctx.emit(Inst::VecShiftImm {
+ op: VecShiftImmOp::Sshr,
+ rd: tmp_v1,
+ rn: src_v,
+ size: VectorSize::Size32x4,
+ imm: 31,
+ });
+ lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
+ ctx.emit(Inst::VecDup {
+ rd: tmp_v0,
+ rn: tmp_r0.to_reg(),
+ size: VectorSize::Size64x2,
+ });
+ ctx.emit(Inst::AluRRImmShift {
+ alu_op: ALUOp::Lsl64,
+ rd: tmp_r0,
+ rn: tmp_r0.to_reg(),
+ immshift: ImmShift { imm: 2 },
+ });
+ ctx.emit(Inst::MovToVec {
+ rd: tmp_v0,
+ rn: tmp_r0.to_reg(),
+ idx: 1,
+ size: VectorSize::Size64x2,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::And,
+ rd: tmp_v0,
+ rn: tmp_v1.to_reg(),
+ rm: tmp_v0.to_reg(),
+ size: VectorSize::Size8x16,
+ });
+ ctx.emit(Inst::VecLanes {
+ op: VecLanesOp::Addv,
+ rd: tmp_v0,
+ rn: tmp_v0.to_reg(),
+ size: VectorSize::Size32x4,
+ });
+ ctx.emit(Inst::MovFromVec {
+ rd: dst_r,
+ rn: tmp_v0.to_reg(),
+ idx: 0,
+ size: VectorSize::Size32x4,
+ });
+ }
+ _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
+ }
+ }
+
+ Opcode::Shuffle => {
+ let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ // Two-register table vector lookups (TBL) require consecutive table registers;
+ // we satisfy this constraint by hardcoding the use of v29 and v30.
+ let temp = writable_vreg(29);
+ let temp2 = writable_vreg(30);
+ let input_ty = ctx.input_ty(insn, 0);
+ assert_eq!(input_ty, ctx.input_ty(insn, 1));
+ // Make sure that both inputs are in virtual registers, since it is
+ // not guaranteed that we can get them safely to the temporaries if
+ // either is in a real register.
+ let rn = ctx.ensure_in_vreg(rn, input_ty);
+ let rn2 = ctx.ensure_in_vreg(rn2, input_ty);
+
+ lower_constant_f128(ctx, rd, mask);
+ ctx.emit(Inst::gen_move(temp, rn, input_ty));
+ ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
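+ // rd holds the shuffle mask (loaded above) and serves as the index operand
+ // of the two-register TBL; v29/v30 hold the table entries.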
+ ctx.emit(Inst::VecTbl2 {
+ rd,
+ rn: temp.to_reg(),
+ rn2: temp2.to_reg(),
+ rm: rd.to_reg(),
+ is_extension: false,
+ });
+ }
+
+ Opcode::Swizzle => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+
+ ctx.emit(Inst::VecTbl {
+ rd,
+ rn,
+ rm,
+ is_extension: false,
+ });
+ }
+
+ Opcode::Vsplit
+ | Opcode::Vconcat
+ | Opcode::Uload8x8Complex
+ | Opcode::Sload8x8Complex
+ | Opcode::Uload16x4Complex
+ | Opcode::Sload16x4Complex
+ | Opcode::Uload32x2Complex
+ | Opcode::Sload32x2Complex => {
+ // TODO
+ panic!("Vector ops not implemented.");
+ }
+
+ Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."),
+
+ Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
+ let alu_op = match op {
+ Opcode::Umin => VecALUOp::Umin,
+ Opcode::Imin => VecALUOp::Smin,
+ Opcode::Umax => VecALUOp::Umax,
+ Opcode::Imax => VecALUOp::Smax,
+ _ => unreachable!(),
+ };
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::VecRRR {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+
+ Opcode::WideningPairwiseDotProductS => {
+ let r_y = get_output_reg(ctx, outputs[0]);
+ let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ if ty == I32X4 {
+ let tmp = ctx.alloc_tmp(RegClass::V128, I8X16);
+ // The args have type I16X8.
+ // "y = i32x4.dot_i16x8_s(a, b)"
+ // => smull tmp, a, b
+ // smull2 y, a, b
+ // addp y, tmp, y
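+ //
+ // smull/smull2 form the widened 32-bit products of the low and high halves
+ // of the inputs; addp then adds adjacent products, so each output lane is
+ // a[2i]*b[2i] + a[2i+1]*b[2i+1].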
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Smull,
+ rd: tmp,
+ rn: r_a,
+ rm: r_b,
+ size: VectorSize::Size16x8,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Smull2,
+ rd: r_y,
+ rn: r_a,
+ rm: r_b,
+ size: VectorSize::Size16x8,
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Addp,
+ rd: r_y,
+ rn: tmp.to_reg(),
+ rm: r_y.to_reg(),
+ size: VectorSize::Size32x4,
+ });
+ } else {
+ return Err(CodegenError::Unsupported(format!(
+ "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
+ ty
+ )));
+ }
+ }
+
+ Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
+ let ty = ty.unwrap();
+ let bits = ty_bits(ty);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ if !ty.is_vector() {
+ let fpu_op = match (op, bits) {
+ (Opcode::Fadd, 32) => FPUOp2::Add32,
+ (Opcode::Fadd, 64) => FPUOp2::Add64,
+ (Opcode::Fsub, 32) => FPUOp2::Sub32,
+ (Opcode::Fsub, 64) => FPUOp2::Sub64,
+ (Opcode::Fmul, 32) => FPUOp2::Mul32,
+ (Opcode::Fmul, 64) => FPUOp2::Mul64,
+ (Opcode::Fdiv, 32) => FPUOp2::Div32,
+ (Opcode::Fdiv, 64) => FPUOp2::Div64,
+ (Opcode::Fmin, 32) => FPUOp2::Min32,
+ (Opcode::Fmin, 64) => FPUOp2::Min64,
+ (Opcode::Fmax, 32) => FPUOp2::Max32,
+ (Opcode::Fmax, 64) => FPUOp2::Max64,
+ _ => panic!("Unknown op/bits combination"),
+ };
+ ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
+ } else {
+ let alu_op = match op {
+ Opcode::Fadd => VecALUOp::Fadd,
+ Opcode::Fsub => VecALUOp::Fsub,
+ Opcode::Fdiv => VecALUOp::Fdiv,
+ Opcode::Fmax => VecALUOp::Fmax,
+ Opcode::Fmin => VecALUOp::Fmin,
+ Opcode::Fmul => VecALUOp::Fmul,
+ _ => unreachable!(),
+ };
+
+ ctx.emit(Inst::VecRRR {
+ rd,
+ rn,
+ rm,
+ alu_op,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::FminPseudo | Opcode::FmaxPseudo => {
+ let ty = ctx.input_ty(insn, 0);
+ if ty == F32X4 || ty == F64X2 {
+ // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
+ // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
+ let r_dst = get_output_reg(ctx, outputs[0]);
+ let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ // Since we're going to write the output register `r_dst` anyway, we might as
+ // well first use it to hold the comparison result. This has the slightly unusual
+ // effect that we modify the output register in the first instruction (`fcmgt`)
+ // but read both the inputs again in the second instruction (`bsl`), which means
+ // that the output register can't be either of the input registers. Regalloc
+ // should handle this correctly, nevertheless.
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Fcmgt,
+ rd: r_dst,
+ rn: if op == Opcode::FminPseudo { r_a } else { r_b },
+ rm: if op == Opcode::FminPseudo { r_b } else { r_a },
+ size: if ty == F32X4 {
+ VectorSize::Size32x4
+ } else {
+ VectorSize::Size64x2
+ },
+ });
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Bsl,
+ rd: r_dst,
+ rn: r_b,
+ rm: r_a,
+ size: VectorSize::Size8x16,
+ });
+ } else {
+ panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
+ }
+ }
+
+ Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
+ let ty = ty.unwrap();
+ let bits = ty_bits(ty);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ if !ty.is_vector() {
+ let fpu_op = match (op, bits) {
+ (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
+ (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
+ (Opcode::Fneg, 32) => FPUOp1::Neg32,
+ (Opcode::Fneg, 64) => FPUOp1::Neg64,
+ (Opcode::Fabs, 32) => FPUOp1::Abs32,
+ (Opcode::Fabs, 64) => FPUOp1::Abs64,
+ (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
+ (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
+ (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
+ (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
+ _ => panic!("Unknown op/bits combination"),
+ };
+ ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
+ } else {
+ let op = match op {
+ Opcode::Fabs => VecMisc2::Fabs,
+ Opcode::Fneg => VecMisc2::Fneg,
+ Opcode::Sqrt => VecMisc2::Fsqrt,
+ _ => unimplemented!(),
+ };
+
+ ctx.emit(Inst::VecMisc {
+ op,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ }
+
+ Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
+ let ty = ctx.output_ty(insn, 0);
+ if !ty.is_vector() {
+ let bits = ty_bits(ty);
+ let op = match (op, bits) {
+ (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
+ (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
+ (Opcode::Floor, 32) => FpuRoundMode::Minus32,
+ (Opcode::Floor, 64) => FpuRoundMode::Minus64,
+ (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
+ (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
+ (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
+ (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
+ _ => panic!("Unknown op/bits combination (scalar)"),
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::FpuRound { op, rd, rn });
+ } else {
+ let (op, size) = match (op, ty) {
+ (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
+ (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
+ (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
+ (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
+ (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
+ (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
+ (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
+ (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
+ _ => panic!("Unknown op/ty combination (vector){:?}", ty),
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::VecMisc { op, rd, rn, size });
+ }
+ }
+
+ Opcode::Fma => {
+ let bits = ty_bits(ctx.output_ty(insn, 0));
+ let fpu_op = match bits {
+ 32 => FPUOp3::MAdd32,
+ 64 => FPUOp3::MAdd64,
+ _ => panic!("Unknown op size"),
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::FpuRRRR {
+ fpu_op,
+ rn,
+ rm,
+ ra,
+ rd,
+ });
+ }
+
+ Opcode::Fcopysign => {
+ // Copy the sign bit from inputs[1] onto inputs[0]. This is a scalar Fcopysign;
+ // it uses scalar NEON operations for 64-bit values and vector operations (2S)
+ // for 32-bit values. The sequence is:
+ //
+ // mov vd, vn
+ // ushr vtmp, vm, #63 / #31
+ // sli vd, vtmp, #63 / #31
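+ //
+ // ushr moves the sign bit of rm down to bit 0 of the temporary; sli shifts
+ // it back up to the top bit and inserts it into rd, leaving the remaining
+ // bits (copied from rn) untouched.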
+
+ let ty = ctx.output_ty(insn, 0);
+ let bits = ty_bits(ty) as u8;
+ assert!(bits == 32 || bits == 64);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let tmp = ctx.alloc_tmp(RegClass::V128, F64);
+
+ // Copy LHS to rd.
+ ctx.emit(Inst::FpuMove64 { rd, rn });
+
+ // Copy the sign bit to the lowest bit in tmp.
+ let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+ ctx.emit(Inst::FpuRRI {
+ fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
+ rd: tmp,
+ rn: rm,
+ });
+
+ // Insert the bit from tmp into the sign bit of rd.
+ let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+ ctx.emit(Inst::FpuRRI {
+ fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
+ rd,
+ rn: tmp.to_reg(),
+ });
+ }
+
+ Opcode::FcvtToUint | Opcode::FcvtToSint => {
+ let in_bits = ty_bits(ctx.input_ty(insn, 0));
+ let out_bits = ty_bits(ctx.output_ty(insn, 0));
+ let signed = op == Opcode::FcvtToSint;
+ let op = match (signed, in_bits, out_bits) {
+ (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
+ (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
+ (false, 32, 64) => FpuToIntOp::F32ToU64,
+ (true, 32, 64) => FpuToIntOp::F32ToI64,
+ (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
+ (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
+ (false, 64, 64) => FpuToIntOp::F64ToU64,
+ (true, 64, 64) => FpuToIntOp::F64ToI64,
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ // Ordering matters here: per wasm semantics, the NaN check must be performed
+ // before the in-bounds (overflow) checks.
+
+ // Check that the input is not a NaN.
+ if in_bits == 32 {
+ ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
+ } else {
+ ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
+ }
+ let trap_code = TrapCode::BadConversionToInteger;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
+ });
+
+ let tmp = ctx.alloc_tmp(RegClass::V128, I128);
+
+ // Check that the input is in range, with "truncate towards zero" semantics. This means
+ // we allow values that are slightly out of range:
+ // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
+ // can be represented), and strictly less than INT_MAX+1 (when this can be
+ // represented).
+ // - for unsigned conversions, we allow values strictly greater than -1, and strictly
+ // less than UINT_MAX+1 (when this can be represented).
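+ //
+ // Where INT_MIN - 1 is not exactly representable in the source float type,
+ // the lower bound becomes INT_MIN itself and the comparison >= rather than
+ // > (see the per-case comments below).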
+
+ if in_bits == 32 {
+ // From float32.
+ let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
+ (true, 8) => (
+ i8::min_value() as f32 - 1.,
+ FloatCC::GreaterThan,
+ i8::max_value() as f32 + 1.,
+ ),
+ (true, 16) => (
+ i16::min_value() as f32 - 1.,
+ FloatCC::GreaterThan,
+ i16::max_value() as f32 + 1.,
+ ),
+ (true, 32) => (
+ i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
+ FloatCC::GreaterThanOrEqual,
+ i32::max_value() as f32 + 1.,
+ ),
+ (true, 64) => (
+ i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
+ FloatCC::GreaterThanOrEqual,
+ i64::max_value() as f32 + 1.,
+ ),
+ (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
+ (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
+ (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
+ (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ // >= low_bound
+ lower_constant_f32(ctx, tmp, low_bound);
+ ctx.emit(Inst::FpuCmp32 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
+ });
+
+ // <= high_bound
+ lower_constant_f32(ctx, tmp, high_bound);
+ ctx.emit(Inst::FpuCmp32 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
+ });
+ } else {
+ // From float64.
+ let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
+ (true, 8) => (
+ i8::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i8::max_value() as f64 + 1.,
+ ),
+ (true, 16) => (
+ i16::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i16::max_value() as f64 + 1.,
+ ),
+ (true, 32) => (
+ i32::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i32::max_value() as f64 + 1.,
+ ),
+ (true, 64) => (
+ i64::min_value() as f64, // I64_MIN - 1 isn't precisely representable as an f64.
+ FloatCC::GreaterThanOrEqual,
+ i64::max_value() as f64 + 1.,
+ ),
+ (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
+ (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
+ (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
+ (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ // >= low_bound
+ lower_constant_f64(ctx, tmp, low_bound);
+ ctx.emit(Inst::FpuCmp64 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
+ });
+
+ // <= high_bound
+ lower_constant_f64(ctx, tmp, high_bound);
+ ctx.emit(Inst::FpuCmp64 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
+ });
+ };
+
+ // Do the conversion.
+ ctx.emit(Inst::FpuToInt { op, rd, rn });
+ }
+
+ Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
+ let ty = ty.unwrap();
+ let signed = op == Opcode::FcvtFromSint;
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ if ty.is_vector() {
+ let op = if signed {
+ VecMisc2::Scvtf
+ } else {
+ VecMisc2::Ucvtf
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+
+ ctx.emit(Inst::VecMisc {
+ op,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ });
+ } else {
+ let in_bits = ty_bits(ctx.input_ty(insn, 0));
+ let out_bits = ty_bits(ty);
+ let op = match (signed, in_bits, out_bits) {
+ (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
+ (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
+ (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
+ (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
+ (false, 64, 32) => IntToFpuOp::U64ToF32,
+ (true, 64, 32) => IntToFpuOp::I64ToF32,
+ (false, 64, 64) => IntToFpuOp::U64ToF64,
+ (true, 64, 64) => IntToFpuOp::I64ToF64,
+ _ => panic!("Unknown input/output-bits combination"),
+ };
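+ // 8- and 16-bit inputs must first be extended in a GPR, since the
+ // conversion forms chosen above operate on 32- or 64-bit registers.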
+ let narrow_mode = match (signed, in_bits) {
+ (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
+ (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
+ (false, 64) => NarrowValueMode::ZeroExtend64,
+ (true, 64) => NarrowValueMode::SignExtend64,
+ _ => panic!("Unknown input size"),
+ };
+ let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+ ctx.emit(Inst::IntToFpu { op, rd, rn });
+ }
+ }
+
+ Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
+ let ty = ty.unwrap();
+ let out_signed = op == Opcode::FcvtToSintSat;
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ if ty.is_vector() {
+ let op = if out_signed {
+ VecMisc2::Fcvtzs
+ } else {
+ VecMisc2::Fcvtzu
+ };
+
+ ctx.emit(Inst::VecMisc {
+ op,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ });
+ } else {
+ let in_ty = ctx.input_ty(insn, 0);
+ let in_bits = ty_bits(in_ty);
+ let out_bits = ty_bits(ty);
+ // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
+ // FMIN Vtmp2, Vin, Vtmp1
+ // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
+ // FMAX Vtmp2, Vtmp2, Vtmp1
+ // (if signed) FIMM Vtmp1, 0
+ // FCMP Vin, Vin
+ // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
+ // convert Rout, Vtmp2
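+ //
+ // In effect: clamp the input to [min, max] with fmin/fmax, force NaN inputs
+ // to zero via the fcsel, and then convert; the clamped value is always in
+ // range, so the final conversion yields the saturated result.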
+
+ assert!(in_bits == 32 || in_bits == 64);
+ assert!(out_bits == 32 || out_bits == 64);
+
+ let min: f64 = match (out_bits, out_signed) {
+ (32, true) => std::i32::MIN as f64,
+ (32, false) => 0.0,
+ (64, true) => std::i64::MIN as f64,
+ (64, false) => 0.0,
+ _ => unreachable!(),
+ };
+
+ let max = match (out_bits, out_signed) {
+ (32, true) => std::i32::MAX as f64,
+ (32, false) => std::u32::MAX as f64,
+ (64, true) => std::i64::MAX as f64,
+ (64, false) => std::u64::MAX as f64,
+ _ => unreachable!(),
+ };
+
+ let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
+ let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
+
+ if in_bits == 32 {
+ lower_constant_f32(ctx, rtmp1, max as f32);
+ } else {
+ lower_constant_f64(ctx, rtmp1, max);
+ }
+ ctx.emit(Inst::FpuRRR {
+ fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
+ rd: rtmp2,
+ rn: rn,
+ rm: rtmp1.to_reg(),
+ });
+ if in_bits == 32 {
+ lower_constant_f32(ctx, rtmp1, min as f32);
+ } else {
+ lower_constant_f64(ctx, rtmp1, min);
+ }
+ ctx.emit(Inst::FpuRRR {
+ fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
+ rd: rtmp2,
+ rn: rtmp2.to_reg(),
+ rm: rtmp1.to_reg(),
+ });
+ if out_signed {
+ if in_bits == 32 {
+ lower_constant_f32(ctx, rtmp1, 0.0);
+ } else {
+ lower_constant_f64(ctx, rtmp1, 0.0);
+ }
+ }
+ if in_bits == 32 {
+ ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn });
+ ctx.emit(Inst::FpuCSel32 {
+ rd: rtmp2,
+ rn: rtmp1.to_reg(),
+ rm: rtmp2.to_reg(),
+ cond: Cond::Ne,
+ });
+ } else {
+ ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn });
+ ctx.emit(Inst::FpuCSel64 {
+ rd: rtmp2,
+ rn: rtmp1.to_reg(),
+ rm: rtmp2.to_reg(),
+ cond: Cond::Ne,
+ });
+ }
+
+ let cvt = match (in_bits, out_bits, out_signed) {
+ (32, 32, false) => FpuToIntOp::F32ToU32,
+ (32, 32, true) => FpuToIntOp::F32ToI32,
+ (32, 64, false) => FpuToIntOp::F32ToU64,
+ (32, 64, true) => FpuToIntOp::F32ToI64,
+ (64, 32, false) => FpuToIntOp::F64ToU32,
+ (64, 32, true) => FpuToIntOp::F64ToI32,
+ (64, 64, false) => FpuToIntOp::F64ToU64,
+ (64, 64, true) => FpuToIntOp::F64ToI64,
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::FpuToInt {
+ op: cvt,
+ rd,
+ rn: rtmp2.to_reg(),
+ });
+ }
+ }
+
+ Opcode::IaddIfcout => {
+ // This is a two-output instruction that is needed for the
+ // legalizer's explicit heap-check sequence, among other possible
+ // uses. Its second output is a flags output, only ever meant to
+ // check for overflow using the
+ // `backend.unsigned_add_overflow_condition()` condition.
+ //
+ // Note that the CLIF validation will ensure that no flag-setting
+ // operation comes between this IaddIfcout and its use (e.g., a
+ // Trapif). Thus, we can rely on implicit communication through the
+ // processor flags rather than explicitly generating flags into a
+ // register. We simply use the variant of the add instruction that
+ // sets flags (`adds`) here.
+
+ // Ensure that the second output isn't directly called for: it
+ // should only be used by a flags-consuming op, which will directly
+ // understand this instruction and merge the comparison.
+ assert!(!ctx.is_reg_needed(insn, ctx.get_output(insn, 1).to_reg()));
+
+ // Now handle the iadd as above, except use an AddS opcode that sets
+ // flags.
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+ }
+
+ Opcode::IaddImm
+ | Opcode::ImulImm
+ | Opcode::UdivImm
+ | Opcode::SdivImm
+ | Opcode::UremImm
+ | Opcode::SremImm
+ | Opcode::IrsubImm
+ | Opcode::IaddCin
+ | Opcode::IaddIfcin
+ | Opcode::IaddCout
+ | Opcode::IaddCarry
+ | Opcode::IaddIfcarry
+ | Opcode::IsubBin
+ | Opcode::IsubIfbin
+ | Opcode::IsubBout
+ | Opcode::IsubIfbout
+ | Opcode::IsubBorrow
+ | Opcode::IsubIfborrow
+ | Opcode::BandImm
+ | Opcode::BorImm
+ | Opcode::BxorImm
+ | Opcode::RotlImm
+ | Opcode::RotrImm
+ | Opcode::IshlImm
+ | Opcode::UshrImm
+ | Opcode::SshrImm
+ | Opcode::IcmpImm
+ | Opcode::IfcmpImm => {
+ panic!("ALU+imm and ALU+carry ops should not appear here!");
+ }
+
+ #[cfg(feature = "x86")]
+ Opcode::X86Udivmodx
+ | Opcode::X86Sdivmodx
+ | Opcode::X86Umulx
+ | Opcode::X86Smulx
+ | Opcode::X86Cvtt2si
+ | Opcode::X86Fmin
+ | Opcode::X86Fmax
+ | Opcode::X86Push
+ | Opcode::X86Pop
+ | Opcode::X86Bsr
+ | Opcode::X86Bsf
+ | Opcode::X86Pblendw
+ | Opcode::X86Pshufd
+ | Opcode::X86Pshufb
+ | Opcode::X86Pextr
+ | Opcode::X86Pinsr
+ | Opcode::X86Insertps
+ | Opcode::X86Movsd
+ | Opcode::X86Movlhps
+ | Opcode::X86Palignr
+ | Opcode::X86Psll
+ | Opcode::X86Psrl
+ | Opcode::X86Psra
+ | Opcode::X86Ptest
+ | Opcode::X86Pmaxs
+ | Opcode::X86Pmaxu
+ | Opcode::X86Pmins
+ | Opcode::X86Pminu
+ | Opcode::X86Pmullq
+ | Opcode::X86Pmuludq
+ | Opcode::X86Punpckh
+ | Opcode::X86Punpckl
+ | Opcode::X86Vcvtudq2ps
+ | Opcode::X86ElfTlsGetAddr
+ | Opcode::X86MachoTlsGetAddr => {
+ panic!("x86-specific opcode in supposedly arch-neutral IR!");
+ }
+
+ Opcode::DummySargT => unreachable!(),
+
+ Opcode::Iabs => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::VecMisc {
+ op: VecMisc2::Abs,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+ Opcode::AvgRound => {
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::VecRRR {
+ alu_op: VecALUOp::Urhadd,
+ rd,
+ rn,
+ rm,
+ size: VectorSize::from_ty(ty),
+ });
+ }
+
+ Opcode::Snarrow | Opcode::Unarrow => {
+ let op = if op == Opcode::Snarrow {
+ VecMiscNarrowOp::Sqxtn
+ } else {
+ VecMiscNarrowOp::Sqxtun
+ };
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let ty = ty.unwrap();
+
+ ctx.emit(Inst::VecMiscNarrow {
+ op,
+ rd,
+ rn,
+ size: VectorSize::from_ty(ty),
+ high_half: false,
+ });
+ ctx.emit(Inst::VecMiscNarrow {
+ op,
+ rd,
+ rn: rn2,
+ size: VectorSize::from_ty(ty),
+ high_half: true,
+ });
+ }
+
+ Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
+ let lane_type = ty.unwrap().lane_type();
+ let rd = get_output_reg(ctx, outputs[0]);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let (t, high_half) = match (lane_type, op) {
+ (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
+ (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
+ (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
+ (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
+ (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
+ (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
+ (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
+ (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
+ _ => {
+ return Err(CodegenError::Unsupported(format!(
+ "Unsupported SIMD vector lane type: {:?}",
+ lane_type
+ )));
+ }
+ };
+
+ ctx.emit(Inst::VecExtend {
+ t,
+ rd,
+ rn,
+ high_half,
+ });
+ }
+
+ Opcode::TlsValue => unimplemented!("tls_value"),
+ }
+
+ Ok(())
+}
+
+pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+) -> CodegenResult<()> {
+ // A block should end with at most two branches. The first may be a
+ // conditional branch; a conditional branch can be followed only by an
+ // unconditional branch or fallthrough. Otherwise, if only one branch,
+ // it may be an unconditional branch, a fallthrough, a return, or a
+ // trap. These conditions are verified by `is_ebb_basic()` during the
+ // verifier pass.
+ assert!(branches.len() <= 2);
+
+ if branches.len() == 2 {
+ // Must be a conditional branch followed by an unconditional branch.
+ let op0 = ctx.data(branches[0]).opcode();
+ let op1 = ctx.data(branches[1]).opcode();
+
+ assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
+ let taken = BranchTarget::Label(targets[0]);
+ let not_taken = match op1 {
+ Opcode::Jump => BranchTarget::Label(targets[1]),
+ Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
+ _ => unreachable!(), // assert above.
+ };
+
+ match op0 {
+ Opcode::Brz | Opcode::Brnz => {
+ let flag_input = InsnInput {
+ insn: branches[0],
+ input: 0,
+ };
+ if let Some(icmp_insn) =
+ maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
+ {
+ let condcode = ctx.data(icmp_insn).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
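+ // Brz branches when the value is zero, i.e. when the comparison that
+ // produced it was false, so the condition must be inverted in that case.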
+ let negated = op0 == Opcode::Brz;
+ let cond = if negated { cond.invert() } else { cond };
+
+ lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind: CondBrKind::Cond(cond),
+ });
+ } else if let Some(fcmp_insn) =
+ maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
+ {
+ let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+ let negated = op0 == Opcode::Brz;
+ let cond = if negated { cond.invert() } else { cond };
+
+ lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind: CondBrKind::Cond(cond),
+ });
+ } else {
+ let rt = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ NarrowValueMode::ZeroExtend64,
+ );
+ let kind = match op0 {
+ Opcode::Brz => CondBrKind::Zero(rt),
+ Opcode::Brnz => CondBrKind::NotZero(rt),
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ }
+ }
+ Opcode::BrIcmp => {
+ let condcode = ctx.data(branches[0]).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let kind = CondBrKind::Cond(cond);
+
+ let is_signed = condcode_is_signed(condcode);
+ let ty = ctx.input_ty(branches[0], 0);
+ let bits = ty_bits(ty);
+ let narrow_mode = match (bits <= 32, is_signed) {
+ (true, true) => NarrowValueMode::SignExtend32,
+ (true, false) => NarrowValueMode::ZeroExtend32,
+ (false, true) => NarrowValueMode::SignExtend64,
+ (false, false) => NarrowValueMode::ZeroExtend64,
+ };
+ let rn = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ narrow_mode,
+ );
+ let rm = put_input_in_rse_imm12(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 1,
+ },
+ narrow_mode,
+ );
+
+ let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
+ let rd = writable_zero_reg();
+ ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ }
+
+ Opcode::Brif => {
+ let condcode = ctx.data(branches[0]).cond_code().unwrap();
+ let cond = lower_condcode(condcode);
+ let kind = CondBrKind::Cond(cond);
+
+ let is_signed = condcode_is_signed(condcode);
+ let flag_input = InsnInput {
+ insn: branches[0],
+ input: 0,
+ };
+ if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
+ lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ } else {
+ // If the ifcmp result is actually placed in a
+ // register, we need to move it back into the flags.
+ let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
+ ctx.emit(Inst::MovToNZCV { rn });
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ }
+ }
+
+ Opcode::Brff => {
+ let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
+ let cond = lower_fp_condcode(condcode);
+ let kind = CondBrKind::Cond(cond);
+ let flag_input = InsnInput {
+ insn: branches[0],
+ input: 0,
+ };
+ if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
+ lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ } else {
+ // If the ffcmp result is actually placed in a
+ // register, we need to move it back into the flags.
+ let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
+ ctx.emit(Inst::MovToNZCV { rn });
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ kind,
+ });
+ }
+ }
+
+ _ => unimplemented!(),
+ }
+ } else {
+ // Must be an unconditional branch or an indirect branch.
+ let op = ctx.data(branches[0]).opcode();
+ match op {
+ Opcode::Jump | Opcode::Fallthrough => {
+ assert!(branches.len() == 1);
+ // In the Fallthrough case, the machine-independent driver
+ // fills in `targets[0]` with our fallthrough block, so this
+ // is valid for both Jump and Fallthrough.
+ ctx.emit(Inst::Jump {
+ dest: BranchTarget::Label(targets[0]),
+ });
+ }
+
+ Opcode::BrTable => {
+ // Expand `br_table index, default, JT` to:
+ //
+ // emit_island // this forces an island at this point
+ // // if the jumptable would push us past
+ // // the deadline
+ // subs idx, #jt_size
+ // b.hs default
+ // adr vTmp1, PC+16
+ // ldr vTmp2, [vTmp1, idx, lsl #2]
+ // add vTmp2, vTmp2, vTmp1
+ // br vTmp2
+ // [jumptable offsets relative to JT base]
+ let jt_size = targets.len() - 1;
+ assert!(jt_size <= std::u32::MAX as usize);
+
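+ // Reserve enough space for the whole compound jump-table sequence plus one
+ // 32-bit offset per entry, so that no constant-pool island can be emitted
+ // in the middle of it.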
+ ctx.emit(Inst::EmitIsland {
+ needed_space: 4 * (6 + jt_size) as CodeOffset,
+ });
+
+ let ridx = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ NarrowValueMode::ZeroExtend32,
+ );
+
+ let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+ let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
+
+ // Bounds-check, leaving condition codes for JTSequence's
+ // branch to default target below.
+ if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::SubS32,
+ rd: writable_zero_reg(),
+ rn: ridx,
+ imm12,
+ });
+ } else {
+ lower_constant_u64(ctx, rtmp1, jt_size as u64);
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::SubS32,
+ rd: writable_zero_reg(),
+ rn: ridx,
+ rm: rtmp1.to_reg(),
+ });
+ }
+
+ // Emit the compound instruction that does:
+ //
+ // b.hs default
+ // adr rA, jt
+ // ldrsw rB, [rA, rIndex, UXTW 2]
+ // add rA, rA, rB
+ // br rA
+ // [jt entries]
+ //
+ // This must be *one* instruction in the vcode because
+ // we cannot allow regalloc to insert any spills/fills
+ // in the middle of the sequence; otherwise, the ADR's
+ // PC-rel offset to the jumptable would be incorrect.
+ // (The alternative is to introduce a relocation pass
+ // for inlined jumptables, which is much worse, IMHO.)
+
+ let jt_targets: Vec<BranchTarget> = targets
+ .iter()
+ .skip(1)
+ .map(|bix| BranchTarget::Label(*bix))
+ .collect();
+ let default_target = BranchTarget::Label(targets[0]);
+ let targets_for_term: Vec<MachLabel> = targets.to_vec();
+ ctx.emit(Inst::JTSequence {
+ ridx,
+ rtmp1,
+ rtmp2,
+ info: Box::new(JTSequenceInfo {
+ targets: jt_targets,
+ default_target,
+ targets_for_term,
+ }),
+ });
+ }
+
+ _ => panic!("Unknown branch type!"),
+ }
+ }
+
+ Ok(())
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs
new file mode 100644
index 0000000000..c3c56632d3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs
@@ -0,0 +1,274 @@
+//! ARM 64-bit Instruction Set Architecture.
+
+use crate::ir::condcodes::IntCC;
+use crate::ir::Function;
+use crate::isa::Builder as IsaBuilder;
+use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
+use crate::result::CodegenResult;
+use crate::settings;
+
+use alloc::boxed::Box;
+
+use regalloc::{PrettyPrint, RealRegUniverse};
+use target_lexicon::{Aarch64Architecture, Architecture, Triple};
+
+// New backend:
+mod abi;
+pub(crate) mod inst;
+mod lower;
+mod lower_inst;
+
+use inst::create_reg_universe;
+
+use self::inst::EmitInfo;
+
+/// An AArch64 backend.
+pub struct AArch64Backend {
+ triple: Triple,
+ flags: settings::Flags,
+ reg_universe: RealRegUniverse,
+}
+
+impl AArch64Backend {
+ /// Create a new AArch64 backend with the given (shared) flags.
+ pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend {
+ let reg_universe = create_reg_universe(&flags);
+ AArch64Backend {
+ triple,
+ flags,
+ reg_universe,
+ }
+ }
+
+ /// This performs lowering to VCode, register-allocates the code, computes block layout and
+ /// finalizes branches. The result is ready for binary emission.
+ fn compile_vcode(
+ &self,
+ func: &Function,
+ flags: settings::Flags,
+ ) -> CodegenResult<VCode<inst::Inst>> {
+ let emit_info = EmitInfo::new(flags.clone());
+ let abi = Box::new(abi::AArch64ABICallee::new(func, flags)?);
+ compile::compile::<AArch64Backend>(func, self, abi, emit_info)
+ }
+}
+
+impl MachBackend for AArch64Backend {
+ fn compile_function(
+ &self,
+ func: &Function,
+ want_disasm: bool,
+ ) -> CodegenResult<MachCompileResult> {
+ let flags = self.flags();
+ let vcode = self.compile_vcode(func, flags.clone())?;
+
+ let buffer = vcode.emit();
+ let frame_size = vcode.frame_size();
+ let unwind_info = vcode.unwind_info()?;
+
+ let disasm = if want_disasm {
+ Some(vcode.show_rru(Some(&create_reg_universe(flags))))
+ } else {
+ None
+ };
+
+ let buffer = buffer.finish();
+
+ Ok(MachCompileResult {
+ buffer,
+ frame_size,
+ disasm,
+ unwind_info,
+ })
+ }
+
+ fn name(&self) -> &'static str {
+ "aarch64"
+ }
+
+ fn triple(&self) -> Triple {
+ self.triple.clone()
+ }
+
+ fn flags(&self) -> &settings::Flags {
+ &self.flags
+ }
+
+ fn reg_universe(&self) -> &RealRegUniverse {
+ &self.reg_universe
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> IntCC {
+ // Unsigned `>=`; this corresponds to the carry flag set on aarch64, which happens on
+ // overflow of an add.
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> IntCC {
+ // unsigned `<`; this corresponds to the carry flag cleared on aarch64, which happens on
+ // underflow of a subtract (aarch64 follows a carry-cleared-on-borrow convention, the
+ // opposite of x86).
+ IntCC::UnsignedLessThan
+ }
+
+ #[cfg(feature = "unwind")]
+ fn emit_unwind_info(
+ &self,
+ result: &MachCompileResult,
+ kind: crate::machinst::UnwindInfoKind,
+ ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+ use crate::isa::unwind::UnwindInfo;
+ use crate::machinst::UnwindInfoKind;
+ Ok(match (result.unwind_info.as_ref(), kind) {
+ (Some(info), UnwindInfoKind::SystemV) => {
+ inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV)
+ }
+ (Some(_info), UnwindInfoKind::Windows) => {
+ // TODO: support Windows unwind info on AArch64
+ None
+ }
+ _ => None,
+ })
+ }
+
+ #[cfg(feature = "unwind")]
+ fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+ Some(inst::unwind::systemv::create_cie())
+ }
+}
+
+/// Create a new `isa::Builder`.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+ assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64));
+ IsaBuilder {
+ triple,
+ setup: settings::builder(),
+ constructor: |triple, shared_flags, _| {
+ let backend = AArch64Backend::new_with_flags(triple, shared_flags);
+ Box::new(TargetIsaAdapter::new(backend))
+ },
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::types::*;
+ use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+ use crate::isa::CallConv;
+ use crate::settings;
+ use crate::settings::Configurable;
+ use core::str::FromStr;
+ use target_lexicon::Triple;
+
+ #[test]
+ fn test_compile_function() {
+ let name = ExternalName::testcase("test0");
+ let mut sig = Signature::new(CallConv::SystemV);
+ sig.params.push(AbiParam::new(I32));
+ sig.returns.push(AbiParam::new(I32));
+ let mut func = Function::with_name_signature(name, sig);
+
+ let bb0 = func.dfg.make_block();
+ let arg0 = func.dfg.append_block_param(bb0, I32);
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(bb0);
+ let v0 = pos.ins().iconst(I32, 0x1234);
+ let v1 = pos.ins().iadd(arg0, v0);
+ pos.ins().return_(&[v1]);
+
+ let mut shared_flags = settings::builder();
+ shared_flags.set("opt_level", "none").unwrap();
+ let backend = AArch64Backend::new_with_flags(
+ Triple::from_str("aarch64").unwrap(),
+ settings::Flags::new(shared_flags),
+ );
+ let buffer = backend.compile_function(&mut func, false).unwrap().buffer;
+ let code = &buffer.data[..];
+
+ // stp x29, x30, [sp, #-16]!
+ // mov x29, sp
+ // mov x1, #0x1234
+ // add w0, w0, w1
+ // mov sp, x29
+ // ldp x29, x30, [sp], #16
+ // ret
+ let golden = vec![
+ 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0x81, 0x46, 0x82, 0xd2, 0x00, 0x00,
+ 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6,
+ ];
+
+ assert_eq!(code, &golden[..]);
+ }
+
+ #[test]
+ fn test_branch_lowering() {
+ let name = ExternalName::testcase("test0");
+ let mut sig = Signature::new(CallConv::SystemV);
+ sig.params.push(AbiParam::new(I32));
+ sig.returns.push(AbiParam::new(I32));
+ let mut func = Function::with_name_signature(name, sig);
+
+ let bb0 = func.dfg.make_block();
+ let arg0 = func.dfg.append_block_param(bb0, I32);
+ let bb1 = func.dfg.make_block();
+ let bb2 = func.dfg.make_block();
+ let bb3 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(bb0);
+ let v0 = pos.ins().iconst(I32, 0x1234);
+ let v1 = pos.ins().iadd(arg0, v0);
+ pos.ins().brnz(v1, bb1, &[]);
+ pos.ins().jump(bb2, &[]);
+ pos.insert_block(bb1);
+ pos.ins().brnz(v1, bb2, &[]);
+ pos.ins().jump(bb3, &[]);
+ pos.insert_block(bb2);
+ let v2 = pos.ins().iadd(v1, v0);
+ pos.ins().brnz(v2, bb2, &[]);
+ pos.ins().jump(bb1, &[]);
+ pos.insert_block(bb3);
+ let v3 = pos.ins().isub(v1, v0);
+ pos.ins().return_(&[v3]);
+
+ let mut shared_flags = settings::builder();
+ shared_flags.set("opt_level", "none").unwrap();
+ let backend = AArch64Backend::new_with_flags(
+ Triple::from_str("aarch64").unwrap(),
+ settings::Flags::new(shared_flags),
+ );
+ let result = backend
+ .compile_function(&mut func, /* want_disasm = */ false)
+ .unwrap();
+ let code = &result.buffer.data[..];
+
+ // stp x29, x30, [sp, #-16]!
+ // mov x29, sp
+ // mov x1, #0x1234 // #4660
+ // add w0, w0, w1
+ // mov w1, w0
+ // cbnz x1, 0x28
+ // mov x1, #0x1234 // #4660
+ // add w1, w0, w1
+ // mov w1, w1
+ // cbnz x1, 0x18
+ // mov w1, w0
+ // cbnz x1, 0x18
+ // mov x1, #0x1234 // #4660
+ // sub w0, w0, w1
+ // mov sp, x29
+ // ldp x29, x30, [sp], #16
+ // ret
+ let golden = vec![
+ 253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161,
+ 0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3,
+ 0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123,
+ 193, 168, 192, 3, 95, 214,
+ ];
+
+ assert_eq!(code, &golden[..]);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs
new file mode 100644
index 0000000000..edf1792e52
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs
@@ -0,0 +1,471 @@
+//! Implementation of the 32-bit ARM ABI.
+
+use crate::ir;
+use crate::ir::types::*;
+use crate::isa;
+use crate::isa::arm32::inst::*;
+use crate::machinst::*;
+use crate::settings;
+use crate::{CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use regalloc::{RealReg, Reg, RegClass, Set, Writable};
+use smallvec::SmallVec;
+
+/// Support for the ARM ABI from the callee side (within a function body).
+pub(crate) type Arm32ABICallee = ABICalleeImpl<Arm32MachineDeps>;
+
+/// Support for the ARM ABI from the caller side (at a callsite).
+pub(crate) type Arm32ABICaller = ABICallerImpl<Arm32MachineDeps>;
+
+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
+/// ARM-specific ABI behavior. This struct just serves as an implementation
+/// point for the trait; it is never actually instantiated.
+pub(crate) struct Arm32MachineDeps;
+
+impl Into<AMode> for StackAMode {
+ fn into(self) -> AMode {
+ match self {
+ StackAMode::FPOffset(off, ty) => AMode::FPOffset(off, ty),
+ StackAMode::NominalSPOffset(off, ty) => AMode::NominalSPOffset(off, ty),
+ StackAMode::SPOffset(off, ty) => AMode::SPOffset(off, ty),
+ }
+ }
+}
+
+impl ABIMachineSpec for Arm32MachineDeps {
+ type I = Inst;
+
+ fn word_bits() -> u32 {
+ 32
+ }
+
+ /// Return required stack alignment in bytes.
+ fn stack_align(_call_conv: isa::CallConv) -> u32 {
+ 8
+ }
+
+ fn compute_arg_locs(
+ _call_conv: isa::CallConv,
+ params: &[ir::AbiParam],
+ args_or_rets: ArgsOrRets,
+ add_ret_area_ptr: bool,
+ ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+ let mut next_rreg = 0;
+ let mut next_stack: u64 = 0;
+ let mut ret = vec![];
+ let mut stack_args = vec![];
+
+ let max_reg_val = 4; // r0-r3
+
+ for i in 0..params.len() {
+ let param = params[i];
+
+ // Validate "purpose".
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext
+ | &ir::ArgumentPurpose::Normal
+ | &ir::ArgumentPurpose::StackLimit
+ | &ir::ArgumentPurpose::SignatureId => {}
+ _ => panic!(
+ "Unsupported argument purpose {:?} in signature: {:?}",
+ param.purpose, params
+ ),
+ }
+ assert!(param.value_type.bits() <= 32);
+
+ if next_rreg < max_reg_val {
+ let reg = rreg(next_rreg);
+
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ next_rreg += 1;
+ } else {
+ // Arguments are stored on the stack in reverse order.
+ // https://static.docs.arm.com/ihi0042/g/aapcs32.pdf
+
+ // Stack offset is not known yet. Store param info for later.
+ stack_args.push((param.value_type, param.extension, param.purpose));
+ next_stack += 4;
+ }
+ }
+
+ let extra_arg = if add_ret_area_ptr {
+ debug_assert!(args_or_rets == ArgsOrRets::Args);
+ if next_rreg < max_reg_val {
+ ret.push(ABIArg::Reg(
+ rreg(next_rreg).to_real_reg(),
+ I32,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ } else {
+ stack_args.push((
+ I32,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ next_stack += 4;
+ }
+ Some(ret.len() - 1)
+ } else {
+ None
+ };
+
+ // Now we can assign proper stack offsets to params.
+ let max_stack = next_stack;
+ for (ty, ext, purpose) in stack_args.into_iter().rev() {
+ next_stack -= 4;
+ ret.push(ABIArg::Stack(
+ (max_stack - next_stack) as i64,
+ ty,
+ ext,
+ purpose,
+ ));
+ }
+ assert_eq!(next_stack, 0);
+
+ next_stack = (next_stack + 7) & !7;
+
+ // To avoid overflow issues, limit the arg/return size to something
+ // reasonable -- here, 128 MB.
+ if next_stack > STACK_ARG_RET_SIZE_LIMIT {
+ return Err(CodegenError::ImplLimitExceeded);
+ }
+
+ Ok((ret, next_stack as i64, extra_arg))
+ }
+
+ fn fp_to_arg_offset(_call_conv: isa::CallConv, _flags: &settings::Flags) -> i64 {
+ 8 // frame pointer and link register
+ }
+
+ fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Inst {
+ Inst::gen_load(into_reg, mem.into(), ty)
+ }
+
+ fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst {
+ Inst::gen_store(from_reg, mem.into(), ty)
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
+ Inst::gen_move(to_reg, from_reg, ty)
+ }
+
+ fn gen_extend(
+ to_reg: Writable<Reg>,
+ from_reg: Reg,
+ is_signed: bool,
+ from_bits: u8,
+ to_bits: u8,
+ ) -> Inst {
+ assert!(to_bits == 32);
+ assert!(from_bits < 32);
+ Inst::Extend {
+ rd: to_reg,
+ rm: from_reg,
+ signed: is_signed,
+ from_bits,
+ }
+ }
+
+ fn gen_ret() -> Inst {
+ Inst::Ret
+ }
+
+ fn gen_epilogue_placeholder() -> Inst {
+ Inst::EpiloguePlaceholder
+ }
+
+ fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> {
+ let mut insts = SmallVec::new();
+
+ if let Some(imm12) = UImm12::maybe_from_i64(imm as i64) {
+ insts.push(Inst::AluRRImm12 {
+ alu_op: ALUOp::Add,
+ rd: into_reg,
+ rn: from_reg,
+ imm12,
+ });
+ } else {
+ let scratch2 = writable_tmp2_reg();
+ insts.extend(Inst::load_constant(scratch2, imm));
+ insts.push(Inst::AluRRRShift {
+ alu_op: ALUOp::Add,
+ rd: into_reg,
+ rn: from_reg,
+ rm: scratch2.to_reg(),
+ shift: None,
+ });
+ }
+ insts
+ }
+
+ fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> {
+ let mut insts = SmallVec::new();
+ insts.push(Inst::Cmp {
+ rn: sp_reg(),
+ rm: limit_reg,
+ });
+ insts.push(Inst::TrapIf {
+ trap_info: ir::TrapCode::StackOverflow,
+ // Here `Lo` == "less than" when interpreting the two
+ // operands as unsigned integers.
+ cond: Cond::Lo,
+ });
+ insts
+ }
+
+ fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst {
+ let mem = mem.into();
+ Inst::LoadAddr { rd: into_reg, mem }
+ }
+
+ fn get_stacklimit_reg() -> Reg {
+ ip_reg()
+ }
+
+ fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Inst {
+ let mem = AMode::RegOffset(base, offset as i64);
+ Inst::gen_load(into_reg, mem, ty)
+ }
+
+ fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst {
+ let mem = AMode::RegOffset(base, offset as i64);
+ Inst::gen_store(from_reg, mem, ty)
+ }
+
+ fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> {
+ let mut ret = SmallVec::new();
+
+ if amount == 0 {
+ return ret;
+ }
+ let (amount, is_sub) = if amount > 0 {
+ (amount, false)
+ } else {
+ (-amount, true)
+ };
+
+ let alu_op = if is_sub { ALUOp::Sub } else { ALUOp::Add };
+
+ if let Some(imm12) = UImm12::maybe_from_i64(amount as i64) {
+ ret.push(Inst::AluRRImm12 {
+ alu_op,
+ rd: writable_sp_reg(),
+ rn: sp_reg(),
+ imm12,
+ });
+ } else {
+ let tmp = writable_ip_reg();
+ ret.extend(Inst::load_constant(tmp, amount as u32));
+ ret.push(Inst::AluRRRShift {
+ alu_op,
+ rd: writable_sp_reg(),
+ rn: sp_reg(),
+ rm: tmp.to_reg(),
+ shift: None,
+ });
+ }
+ ret
+ }
+
+ fn gen_nominal_sp_adj(offset: i32) -> Inst {
+ let offset = i64::from(offset);
+ Inst::VirtualSPOffsetAdj { offset }
+ }
+
+ fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> {
+ let mut ret = SmallVec::new();
+ let reg_list = vec![fp_reg(), lr_reg()];
+ ret.push(Inst::Push { reg_list });
+ ret.push(Inst::Mov {
+ rd: writable_fp_reg(),
+ rm: sp_reg(),
+ });
+ ret
+ }
+
+ fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> {
+ let mut ret = SmallVec::new();
+ ret.push(Inst::Mov {
+ rd: writable_sp_reg(),
+ rm: fp_reg(),
+ });
+ let reg_list = vec![writable_fp_reg(), writable_lr_reg()];
+ ret.push(Inst::Pop { reg_list });
+ ret
+ }
+
+ /// Returns stack bytes used as well as instructions. Does not adjust
+ /// nominal SP offset; caller will do that.
+ fn gen_clobber_save(
+ _call_conv: isa::CallConv,
+ _flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> (u64, SmallVec<[Inst; 16]>) {
+ let mut insts = SmallVec::new();
+ if fixed_frame_storage_size > 0 {
+ insts.extend(Self::gen_sp_reg_adjust(-(fixed_frame_storage_size as i32)).into_iter());
+ }
+ let clobbered_vec = get_callee_saves(clobbers);
+ let mut clobbered_vec: Vec<_> = clobbered_vec
+ .into_iter()
+ .map(|r| r.to_reg().to_reg())
+ .collect();
+ if clobbered_vec.len() % 2 == 1 {
+ // For alignment purposes.
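+ // (An even number of 4-byte registers keeps the stack 8-byte
+ // aligned; see `stack_align` above.)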
+ clobbered_vec.push(ip_reg());
+ }
+ let stack_used = clobbered_vec.len() * 4;
+ if !clobbered_vec.is_empty() {
+ insts.push(Inst::Push {
+ reg_list: clobbered_vec,
+ });
+ }
+
+ (stack_used as u64, insts)
+ }
+
+ fn gen_clobber_restore(
+ _call_conv: isa::CallConv,
+ _flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ _fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> SmallVec<[Inst; 16]> {
+ let mut insts = SmallVec::new();
+ let clobbered_vec = get_callee_saves(clobbers);
+ let mut clobbered_vec: Vec<_> = clobbered_vec
+ .into_iter()
+ .map(|r| Writable::from_reg(r.to_reg().to_reg()))
+ .collect();
+ if clobbered_vec.len() % 2 == 1 {
+ clobbered_vec.push(writable_ip_reg());
+ }
+ if !clobbered_vec.is_empty() {
+ insts.push(Inst::Pop {
+ reg_list: clobbered_vec,
+ });
+ }
+ insts
+ }
+
+ fn gen_call(
+ dest: &CallDest,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: ir::Opcode,
+ tmp: Writable<Reg>,
+ _callee_conv: isa::CallConv,
+ _caller_conv: isa::CallConv,
+ ) -> SmallVec<[(InstIsSafepoint, Inst); 2]> {
+ let mut insts = SmallVec::new();
+ match &dest {
+ &CallDest::ExtName(ref name, RelocDistance::Near) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::Call {
+ info: Box::new(CallInfo {
+ dest: name.clone(),
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ )),
+ &CallDest::ExtName(ref name, RelocDistance::Far) => {
+ insts.push((
+ InstIsSafepoint::No,
+ Inst::LoadExtName {
+ rt: tmp,
+ name: Box::new(name.clone()),
+ offset: 0,
+ },
+ ));
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: tmp.to_reg(),
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ ));
+ }
+ &CallDest::Reg(reg) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: *reg,
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ )),
+ }
+
+ insts
+ }
+
+ fn get_number_of_spillslots_for_value(rc: RegClass, _ty: Type) -> u32 {
+ match rc {
+ RegClass::I32 => 1,
+ _ => panic!("Unexpected register class!"),
+ }
+ }
+
+ fn get_virtual_sp_offset_from_state(s: &EmitState) -> i64 {
+ s.virtual_sp_offset
+ }
+
+ fn get_nominal_sp_to_fp(s: &EmitState) -> i64 {
+ s.nominal_sp_to_fp
+ }
+
+ fn get_regs_clobbered_by_call(_: isa::CallConv) -> Vec<Writable<Reg>> {
+ let mut caller_saved = Vec::new();
+ for i in 0..15 {
+ let r = writable_rreg(i);
+ if is_reg_clobbered_by_call(r.to_reg().to_real_reg()) {
+ caller_saved.push(r);
+ }
+ }
+ caller_saved
+ }
+}
+
+fn is_callee_save(r: RealReg) -> bool {
+ let enc = r.get_hw_encoding();
+ 4 <= enc && enc <= 10
+}
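+// Hardware encodings 4..=10 correspond to r4-r10 here; together with fp and lr
+// (saved separately in the prologue), this roughly matches the AAPCS
+// callee-saved register set.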
+
+fn get_callee_saves(regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
+ let mut ret = Vec::new();
+ for &reg in regs.iter() {
+ if is_callee_save(reg.to_reg()) {
+ ret.push(reg);
+ }
+ }
+
+ // Sort registers for deterministic code output.
+ ret.sort_by_key(|r| r.to_reg().get_index());
+ ret
+}
+
+fn is_reg_clobbered_by_call(r: RealReg) -> bool {
+ let enc = r.get_hw_encoding();
+ enc <= 3
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs
new file mode 100644
index 0000000000..2c1b8e97d6
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs
@@ -0,0 +1,335 @@
+//! 32-bit ARM ISA definitions: instruction arguments.
+
+use crate::isa::arm32::inst::*;
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg};
+
+use std::string::String;
+
+/// A shift operator for a register or immediate.
+#[derive(Clone, Copy, Debug)]
+#[repr(u8)]
+pub enum ShiftOp {
+ LSL = 0b00,
+ LSR = 0b01,
+ ASR = 0b10,
+ ROR = 0b11,
+}
+
+impl ShiftOp {
+ /// Get the encoding of this shift op.
+ pub fn bits(self) -> u8 {
+ self as u8
+ }
+}
+
+/// A shift operator amount.
+#[derive(Clone, Copy, Debug)]
+pub struct ShiftOpShiftImm(u8);
+
+impl ShiftOpShiftImm {
+ /// Maximum shift for shifted-register operands.
+ pub const MAX_SHIFT: u32 = 31;
+
+ /// Create a new shiftop shift amount, if possible.
+ pub fn maybe_from_shift(shift: u32) -> Option<ShiftOpShiftImm> {
+ if shift <= Self::MAX_SHIFT {
+ Some(ShiftOpShiftImm(shift as u8))
+ } else {
+ None
+ }
+ }
+
+ /// Return the shift amount.
+ pub fn value(self) -> u8 {
+ self.0
+ }
+}
+
+/// A shift operator with an amount, guaranteed to be within range.
+#[derive(Clone, Debug)]
+pub struct ShiftOpAndAmt {
+ op: ShiftOp,
+ shift: ShiftOpShiftImm,
+}
+
+impl ShiftOpAndAmt {
+ pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt {
+ ShiftOpAndAmt { op, shift }
+ }
+
+ /// Get the shift op.
+ pub fn op(&self) -> ShiftOp {
+ self.op
+ }
+
+ /// Get the shift amount.
+ pub fn amt(&self) -> ShiftOpShiftImm {
+ self.shift
+ }
+}
+
+/// An unsigned 8-bit immediate.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm8 {
+ /// The value.
+ value: u8,
+}
+
+impl UImm8 {
+ pub fn maybe_from_i64(value: i64) -> Option<UImm8> {
+ if 0 <= value && value < (1 << 8) {
+ Some(UImm8 { value: value as u8 })
+ } else {
+ None
+ }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ u32::from(self.value)
+ }
+}
+
+/// An unsigned 12-bit immediate.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm12 {
+ /// The value.
+ value: u16,
+}
+
+impl UImm12 {
+ pub fn maybe_from_i64(value: i64) -> Option<UImm12> {
+ if 0 <= value && value < (1 << 12) {
+ Some(UImm12 {
+ value: value as u16,
+ })
+ } else {
+ None
+ }
+ }
+
+ /// Bits for encoding.
+ pub fn bits(&self) -> u32 {
+ u32::from(self.value)
+ }
+}
+
+/// An addressing mode specified for a load/store operation.
+#[derive(Clone, Debug)]
+pub enum AMode {
+ // Real addressing modes
+ /// Register plus register offset, which can be shifted left by imm2.
+ RegReg(Reg, Reg, u8),
+
+ /// Unsigned 12-bit immediate offset from reg.
+ RegOffset12(Reg, UImm12),
+
+ /// Immediate offset from program counter aligned to 4.
+ /// Cannot be used by store instructions.
+ PCRel(i32),
+
+ // Virtual addressing modes that are lowered at emission time:
+ /// Immediate offset from reg.
+ RegOffset(Reg, i64),
+
+ /// Signed immediate offset from stack pointer.
+ SPOffset(i64, Type),
+
+ /// Offset from the frame pointer.
+ FPOffset(i64, Type),
+
+ /// Signed immediate offset from "nominal stack pointer".
+ NominalSPOffset(i64, Type),
+}
+
+impl AMode {
+ /// Memory reference using the sum of two registers as an address.
+ pub fn reg_plus_reg(reg1: Reg, reg2: Reg, shift_amt: u8) -> AMode {
+ assert!(shift_amt <= 3);
+ AMode::RegReg(reg1, reg2, shift_amt)
+ }
+
+ /// Memory reference using the sum of a register and an immediate offset
+ /// as an address.
+ pub fn reg_plus_imm(reg: Reg, offset: i64) -> AMode {
+ AMode::RegOffset(reg, offset)
+ }
+}
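+// Example (illustrative): `AMode::reg_plus_imm(r1, off)` stays a virtual
+// `RegOffset` until emission, where `mem_finalize` (emit.rs) turns it into
+// `RegOffset12` if `off` fits in 12 bits, or loads `off` into `ip` and uses
+// `RegReg` otherwise.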
+
+/// Condition for conditional branches.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(u8)]
+pub enum Cond {
+ Eq = 0,
+ Ne = 1,
+ Hs = 2,
+ Lo = 3,
+ Mi = 4,
+ Pl = 5,
+ Vs = 6,
+ Vc = 7,
+ Hi = 8,
+ Ls = 9,
+ Ge = 10,
+ Lt = 11,
+ Gt = 12,
+ Le = 13,
+ Al = 14,
+}
+
+impl Cond {
+ /// Return the inverted condition.
+ pub fn invert(self) -> Cond {
+ match self {
+ Cond::Eq => Cond::Ne,
+ Cond::Ne => Cond::Eq,
+
+ Cond::Hs => Cond::Lo,
+ Cond::Lo => Cond::Hs,
+
+ Cond::Mi => Cond::Pl,
+ Cond::Pl => Cond::Mi,
+
+ Cond::Vs => Cond::Vc,
+ Cond::Vc => Cond::Vs,
+
+ Cond::Hi => Cond::Ls,
+ Cond::Ls => Cond::Hi,
+
+ Cond::Ge => Cond::Lt,
+ Cond::Lt => Cond::Ge,
+
+ Cond::Gt => Cond::Le,
+ Cond::Le => Cond::Gt,
+
+ Cond::Al => panic!("Cannot invert {:?} condition", self),
+ }
+ }
+
+ /// Return the machine encoding of this condition.
+ pub fn bits(self) -> u16 {
+ self as u16
+ }
+}
+
+/// A branch target. Either unresolved (basic-block index) or resolved (offset
+/// from end of current instruction).
+#[derive(Clone, Copy, Debug)]
+pub enum BranchTarget {
+ /// An unresolved reference to a Label.
+ Label(MachLabel),
+ /// A fixed PC offset.
+ ResolvedOffset(i32),
+}
+
+impl BranchTarget {
+ /// Return the target's label, if it is a label-based target.
+ pub fn as_label(self) -> Option<MachLabel> {
+ match self {
+ BranchTarget::Label(l) => Some(l),
+ _ => None,
+ }
+ }
+
+ // Ready for embedding in instruction.
+ fn as_offset(self, inst_16_bit: bool) -> i32 {
+ match self {
+ BranchTarget::ResolvedOffset(off) => {
+ if inst_16_bit {
+ // pc is equal to end of the current inst + 2.
+ (off - 2) >> 1
+ } else {
+ // pc points to end of the current inst.
+ off >> 1
+ }
+ }
+ _ => 0,
+ }
+ }
+
+ /// For 32-bit unconditional jump.
+ pub fn as_off24(self) -> u32 {
+ let off = self.as_offset(false);
+ assert!(off < (1 << 24));
+ assert!(off >= -(1 << 24));
+ (off as u32) & ((1 << 24) - 1)
+ }
+
+ /// For 32-bit conditional jump.
+ pub fn as_off20(self) -> u32 {
+ let off = self.as_offset(false);
+ assert!(off < (1 << 20));
+ assert!(off >= -(1 << 20));
+ (off as u32) & ((1 << 20) - 1)
+ }
+}
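+// Illustrative example: `BranchTarget::ResolvedOffset(4).as_off24()` encodes as
+// `2`, since offsets are counted in halfwords from the end of the (32-bit)
+// branch instruction.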
+
+impl PrettyPrint for ShiftOpAndAmt {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ let op = match self.op() {
+ ShiftOp::LSL => "lsl",
+ ShiftOp::LSR => "lsr",
+ ShiftOp::ASR => "asr",
+ ShiftOp::ROR => "ror",
+ };
+ format!("{} #{}", op, self.amt().value())
+ }
+}
+
+impl PrettyPrint for UImm8 {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for UImm12 {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ format!("#{}", self.value)
+ }
+}
+
+impl PrettyPrint for AMode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &AMode::RegReg(rn, rm, imm2) => {
+ let shift = if imm2 != 0 {
+ format!(", lsl #{}", imm2)
+ } else {
+ "".to_string()
+ };
+ format!(
+ "[{}, {}{}]",
+ rn.show_rru(mb_rru),
+ rm.show_rru(mb_rru),
+ shift
+ )
+ }
+ &AMode::RegOffset12(rn, off) => {
+ format!("[{}, {}]", rn.show_rru(mb_rru), off.show_rru(mb_rru))
+ }
+ &AMode::PCRel(off) => format!("[pc, #{}]", off),
+ &AMode::RegOffset(..)
+ | &AMode::SPOffset(..)
+ | &AMode::FPOffset(..)
+ | &AMode::NominalSPOffset(..) => panic!("unexpected mem mode"),
+ }
+ }
+}
+
+impl PrettyPrint for Cond {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ let mut s = format!("{:?}", self);
+ s.make_ascii_lowercase();
+ s
+ }
+}
+
+impl PrettyPrint for BranchTarget {
+ fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ &BranchTarget::Label(label) => format!("label{:?}", label.get()),
+ &BranchTarget::ResolvedOffset(off) => format!("{}", off),
+ }
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs
new file mode 100644
index 0000000000..5e4a412e96
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs
@@ -0,0 +1,829 @@
+//! 32-bit ARM ISA: binary code emission.
+
+use crate::binemit::{Reloc, StackMap};
+use crate::ir::SourceLoc;
+use crate::isa::arm32::inst::*;
+
+use core::convert::TryFrom;
+use log::debug;
+
+/// Memory addressing mode finalization: convert "special" modes (e.g.,
+/// nominal stack offset) into real addressing modes, possibly by
+/// emitting some helper instructions that come immediately before the use
+/// of this amode.
+pub fn mem_finalize(mem: &AMode, state: &EmitState) -> (SmallVec<[Inst; 4]>, AMode) {
+ match mem {
+ &AMode::RegOffset(_, off)
+ | &AMode::SPOffset(off, _)
+ | &AMode::FPOffset(off, _)
+ | &AMode::NominalSPOffset(off, _) => {
+ let basereg = match mem {
+ &AMode::RegOffset(reg, _) => reg,
+ &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => sp_reg(),
+ &AMode::FPOffset(..) => fp_reg(),
+ _ => unreachable!(),
+ };
+ let adj = match mem {
+ &AMode::NominalSPOffset(..) => {
+ debug!(
+ "mem_finalize: nominal SP offset {} + adj {} -> {}",
+ off,
+ state.virtual_sp_offset,
+ off + state.virtual_sp_offset
+ );
+ state.virtual_sp_offset
+ }
+ _ => 0,
+ };
+ let off = off + adj;
+
+ assert!(-(1 << 31) <= off && off <= (1 << 32));
+
+ if let Some(off) = UImm12::maybe_from_i64(off) {
+ let mem = AMode::RegOffset12(basereg, off);
+ (smallvec![], mem)
+ } else {
+ let tmp = writable_ip_reg();
+ let const_insts = Inst::load_constant(tmp, off as u32);
+ let mem = AMode::reg_plus_reg(basereg, tmp.to_reg(), 0);
+ (const_insts, mem)
+ }
+ }
+ // Just assert immediate is valid here.
+ _ => (smallvec![], mem.clone()),
+ }
+}
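+// Illustrative example: with `state.virtual_sp_offset == 16`, finalizing
+// `AMode::NominalSPOffset(8, I32)` yields no helper instructions and the real
+// mode `RegOffset12(sp, 24)`; an offset that does not fit in 12 bits instead
+// loads the constant into `ip` and returns `RegReg(base, ip, 0)`.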
+
+//=============================================================================
+// Instructions and subcomponents: emission
+
+fn machreg_to_gpr(m: Reg) -> u16 {
+ assert_eq!(m.get_class(), RegClass::I32);
+ u16::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
+}
+
+fn machreg_to_gpr_lo(m: Reg) -> u16 {
+ let gpr_lo = machreg_to_gpr(m);
+ assert!(gpr_lo < 8);
+ gpr_lo
+}
+
+fn machreg_is_lo(m: Reg) -> bool {
+ machreg_to_gpr(m) < 8
+}
+
+fn enc_16_rr(bits_15_6: u16, rd: Reg, rm: Reg) -> u16 {
+ (bits_15_6 << 6) | machreg_to_gpr_lo(rd) | (machreg_to_gpr_lo(rm) << 3)
+}
+
+fn enc_16_rr_any(bits_15_8: u16, rd: Reg, rm: Reg) -> u16 {
+ let rd = machreg_to_gpr(rd);
+ (bits_15_8 << 8) | (rd & 0x7) | ((rd >> 3) << 7) | (machreg_to_gpr(rm) << 3)
+}
+
+fn enc_16_mov(rd: Writable<Reg>, rm: Reg) -> u16 {
+ enc_16_rr_any(0b01000110, rd.to_reg(), rm)
+}
+
+fn enc_16_it(cond: Cond, insts: &Vec<CondInst>) -> u16 {
+ let cond = cond.bits();
+ let mut mask: u16 = 0;
+ for inst in insts.iter().skip(1) {
+ if inst.then {
+ mask |= cond & 0x1;
+ } else {
+ mask |= (cond & 0x1) ^ 0x1;
+ }
+ mask <<= 1;
+ }
+ mask |= 0x1;
+ mask <<= 4 - insts.len();
+ 0b1011_1111_0000_0000 | (cond << 4) | mask
+}
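+// Illustrative mask derivation: for an ITE block (two instructions, the second
+// on the "else" path) with `cond == Cond::Eq` (bits 0b0000), the loop adds
+// `!cond[0] == 1`, the trailing `1` marks the end of the block, and the final
+// shift left-aligns everything, giving `mask == 0b1100` as in the Thumb IT
+// encoding.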
+
+fn enc_32_regs(
+ mut inst: u32,
+ reg_0: Option<Reg>,
+ reg_8: Option<Reg>,
+ reg_12: Option<Reg>,
+ reg_16: Option<Reg>,
+) -> u32 {
+ if let Some(reg_0) = reg_0 {
+ inst |= u32::from(machreg_to_gpr(reg_0));
+ }
+ if let Some(reg_8) = reg_8 {
+ inst |= u32::from(machreg_to_gpr(reg_8)) << 8;
+ }
+ if let Some(reg_12) = reg_12 {
+ inst |= u32::from(machreg_to_gpr(reg_12)) << 12;
+ }
+ if let Some(reg_16) = reg_16 {
+ inst |= u32::from(machreg_to_gpr(reg_16)) << 16;
+ }
+ inst
+}
+
+fn enc_32_reg_shift(inst: u32, shift: &Option<ShiftOpAndAmt>) -> u32 {
+ match shift {
+ Some(shift) => {
+ let op = u32::from(shift.op().bits());
+ let amt = u32::from(shift.amt().value());
+ let imm2 = amt & 0x3;
+ let imm3 = (amt >> 2) & 0x7;
+
+ inst | (op << 4) | (imm2 << 6) | (imm3 << 12)
+ }
+ None => inst,
+ }
+}
+
+fn enc_32_r_imm16(bits_31_20: u32, rd: Reg, imm16: u16) -> u32 {
+ let imm16 = u32::from(imm16);
+ let imm8 = imm16 & 0xff;
+ let imm3 = (imm16 >> 8) & 0x7;
+ let i = (imm16 >> 11) & 0x1;
+ let imm4 = (imm16 >> 12) & 0xf;
+
+ let inst = ((bits_31_20 << 20) & !(1 << 26)) | imm8 | (imm3 << 12) | (imm4 << 16) | (i << 26);
+ enc_32_regs(inst, None, Some(rd), None, None)
+}
+
+fn enc_32_rrr(bits_31_20: u32, bits_15_12: u32, bits_7_4: u32, rd: Reg, rm: Reg, rn: Reg) -> u32 {
+ let inst = (bits_31_20 << 20) | (bits_15_12 << 12) | (bits_7_4 << 4);
+ enc_32_regs(inst, Some(rm), Some(rd), None, Some(rn))
+}
+
+fn enc_32_imm12(inst: u32, imm12: UImm12) -> u32 {
+ let imm12 = imm12.bits();
+ let imm8 = imm12 & 0xff;
+ let imm3 = (imm12 >> 8) & 0x7;
+ let i = (imm12 >> 11) & 0x1;
+ inst | imm8 | (imm3 << 12) | (i << 26)
+}
+
+fn enc_32_mem_r(bits_24_20: u32, rt: Reg, rn: Reg, rm: Reg, imm2: u8) -> u32 {
+ let imm2 = u32::from(imm2);
+ let inst = (imm2 << 4) | (bits_24_20 << 20) | (0b11111 << 27);
+ enc_32_regs(inst, Some(rm), None, Some(rt), Some(rn))
+}
+
+fn enc_32_mem_off12(bits_24_20: u32, rt: Reg, rn: Reg, off12: UImm12) -> u32 {
+ let off12 = off12.bits();
+ let inst = off12 | (bits_24_20 << 20) | (0b11111 << 27);
+ enc_32_regs(inst, None, None, Some(rt), Some(rn))
+}
+
+fn enc_32_jump(target: BranchTarget) -> u32 {
+ let off24 = target.as_off24();
+ let imm11 = off24 & 0x7ff;
+ let imm10 = (off24 >> 11) & 0x3ff;
+ let i2 = (off24 >> 21) & 0x1;
+ let i1 = (off24 >> 22) & 0x1;
+ let s = (off24 >> 23) & 0x1;
+ let j1 = (i1 ^ s) ^ 1;
+ let j2 = (i2 ^ s) ^ 1;
+
+ 0b11110_0_0000000000_10_0_1_0_00000000000
+ | imm11
+ | (j2 << 11)
+ | (j1 << 13)
+ | (imm10 << 16)
+ | (s << 26)
+}
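+// In the Thumb-2 B (T4) encoding the branch offset is S:I1:I2:imm10:imm11:'0',
+// with I1 = NOT(J1 XOR S) and I2 = NOT(J2 XOR S); the `(i1 ^ s) ^ 1` terms
+// above recover J1/J2 from the sign bit and the top bits of the 24-bit offset.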
+
+fn enc_32_cond_branch(cond: Cond, target: BranchTarget) -> u32 {
+ let cond = u32::from(cond.bits());
+ let off20 = target.as_off20();
+ let imm11 = off20 & 0x7ff;
+ let imm6 = (off20 >> 11) & 0x3f;
+ let j1 = (off20 >> 17) & 0x1;
+ let j2 = (off20 >> 18) & 0x1;
+ let s = (off20 >> 19) & 0x1;
+
+ 0b11110_0_0000_000000_10_0_0_0_00000000000
+ | imm11
+ | (j2 << 11)
+ | (j1 << 13)
+ | (imm6 << 16)
+ | (cond << 22)
+ | (s << 26)
+}
+
+fn u32_swap_halfwords(x: u32) -> u32 {
+ (x >> 16) | (x << 16)
+}
+
+fn emit_32(inst: u32, sink: &mut MachBuffer<Inst>) {
+ let inst_hi = (inst >> 16) as u16;
+ let inst_lo = (inst & 0xffff) as u16;
+ sink.put2(inst_hi);
+ sink.put2(inst_lo);
+}
+
+/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)]
+pub struct EmitState {
+ /// Addend to convert nominal-SP offsets to real-SP offsets at the current
+ /// program point.
+ pub(crate) virtual_sp_offset: i64,
+ /// Offset of FP from nominal-SP.
+ pub(crate) nominal_sp_to_fp: i64,
+ /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
+ stack_map: Option<StackMap>,
+ /// Source location of next machine code instruction to be emitted.
+ cur_srcloc: SourceLoc,
+}
+
+impl MachInstEmitState<Inst> for EmitState {
+ fn new(abi: &dyn ABICallee<I = Inst>) -> Self {
+ EmitState {
+ virtual_sp_offset: 0,
+ nominal_sp_to_fp: abi.frame_size() as i64,
+ stack_map: None,
+ cur_srcloc: SourceLoc::default(),
+ }
+ }
+
+ fn pre_safepoint(&mut self, stack_map: StackMap) {
+ self.stack_map = Some(stack_map);
+ }
+
+ fn pre_sourceloc(&mut self, srcloc: SourceLoc) {
+ self.cur_srcloc = srcloc;
+ }
+}
+
+impl EmitState {
+ fn take_stack_map(&mut self) -> Option<StackMap> {
+ self.stack_map.take()
+ }
+
+ fn clear_post_insn(&mut self) {
+ self.stack_map = None;
+ }
+
+ fn cur_srcloc(&self) -> SourceLoc {
+ self.cur_srcloc
+ }
+}
+
+pub struct EmitInfo {
+ flags: settings::Flags,
+}
+
+impl EmitInfo {
+ pub(crate) fn new(flags: settings::Flags) -> Self {
+ EmitInfo { flags }
+ }
+}
+
+impl MachInstEmitInfo for EmitInfo {
+ fn flags(&self) -> &settings::Flags {
+ &self.flags
+ }
+}
+
+impl MachInstEmit for Inst {
+ type Info = EmitInfo;
+ type State = EmitState;
+ type UnwindInfo = super::unwind::Arm32UnwindInfo;
+
+ fn emit(&self, sink: &mut MachBuffer<Inst>, emit_info: &Self::Info, state: &mut EmitState) {
+ let start_off = sink.cur_offset();
+
+ match self {
+ &Inst::Nop0 | &Inst::EpiloguePlaceholder => {}
+ &Inst::Nop2 => {
+ sink.put2(0b1011_1111_0000_0000);
+ }
+ &Inst::AluRRR { alu_op, rd, rn, rm } => {
+ let (bits_31_20, bits_15_12, bits_7_4) = match alu_op {
+ ALUOp::Lsl => (0b111110100000, 0b1111, 0b0000),
+ ALUOp::Lsr => (0b111110100010, 0b1111, 0b0000),
+ ALUOp::Asr => (0b111110100100, 0b1111, 0b0000),
+ ALUOp::Ror => (0b111110100110, 0b1111, 0b0000),
+ ALUOp::Qadd => (0b111110101000, 0b1111, 0b1000),
+ ALUOp::Qsub => (0b111110101000, 0b1111, 0b1010),
+ ALUOp::Mul => (0b111110110000, 0b1111, 0b0000),
+ ALUOp::Udiv => (0b111110111011, 0b1111, 0b1111),
+ ALUOp::Sdiv => (0b111110111001, 0b1111, 0b1111),
+ _ => panic!("Invalid ALUOp {:?} in RRR form!", alu_op),
+ };
+ emit_32(
+ enc_32_rrr(bits_31_20, bits_15_12, bits_7_4, rd.to_reg(), rm, rn),
+ sink,
+ );
+ }
+ &Inst::AluRRRShift {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ref shift,
+ } => {
+ let bits_31_24 = 0b111_0101;
+ let bits_24_20 = match alu_op {
+ ALUOp::And => 0b00000,
+ ALUOp::Bic => 0b00010,
+ ALUOp::Orr => 0b00100,
+ ALUOp::Orn => 0b00110,
+ ALUOp::Eor => 0b01000,
+ ALUOp::Add => 0b10000,
+ ALUOp::Adds => 0b10001,
+ ALUOp::Adc => 0b10100,
+ ALUOp::Adcs => 0b10101,
+ ALUOp::Sbc => 0b10110,
+ ALUOp::Sbcs => 0b10111,
+ ALUOp::Sub => 0b11010,
+ ALUOp::Subs => 0b11011,
+ ALUOp::Rsb => 0b11100,
+ _ => panic!("Invalid ALUOp {:?} in RRRShift form!", alu_op),
+ };
+ let bits_31_20 = (bits_31_24 << 5) | bits_24_20;
+ let inst = enc_32_rrr(bits_31_20, 0, 0, rd.to_reg(), rm, rn);
+ let inst = enc_32_reg_shift(inst, shift);
+ emit_32(inst, sink);
+ }
+ &Inst::AluRRShift {
+ alu_op,
+ rd,
+ rm,
+ ref shift,
+ } => {
+ let bits_24_21 = match alu_op {
+ ALUOp1::Mvn => 0b0011,
+ ALUOp1::Mov => 0b0010,
+ };
+ let inst = 0b1110101_0000_0_1111_0_000_0000_00_00_0000 | (bits_24_21 << 21);
+ let inst = enc_32_regs(inst, Some(rm), Some(rd.to_reg()), None, None);
+ let inst = enc_32_reg_shift(inst, shift);
+ emit_32(inst, sink);
+ }
+ &Inst::AluRRRR {
+ alu_op,
+ rd_hi,
+ rd_lo,
+ rn,
+ rm,
+ } => {
+ let (bits_22_20, bits_7_4) = match alu_op {
+ ALUOp::Smull => (0b000, 0b0000),
+ ALUOp::Umull => (0b010, 0b0000),
+ _ => panic!("Invalid ALUOp {:?} in RRRR form!", alu_op),
+ };
+ let inst = (0b111110111 << 23) | (bits_22_20 << 20) | (bits_7_4 << 4);
+ let inst = enc_32_regs(
+ inst,
+ Some(rm),
+ Some(rd_hi.to_reg()),
+ Some(rd_lo.to_reg()),
+ Some(rn),
+ );
+ emit_32(inst, sink);
+ }
+ &Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn,
+ imm12,
+ } => {
+ let bits_24_20 = match alu_op {
+ ALUOp::Add => 0b00000,
+ ALUOp::Sub => 0b01010,
+ _ => panic!("Invalid ALUOp {:?} in RRImm12 form!", alu_op),
+ };
+ let inst = (0b11110_0_1 << 25) | (bits_24_20 << 20);
+ let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, Some(rn));
+ let inst = enc_32_imm12(inst, imm12);
+ emit_32(inst, sink);
+ }
+ &Inst::AluRRImm8 {
+ alu_op,
+ rd,
+ rn,
+ imm8,
+ } => {
+ let bits_24_20 = match alu_op {
+ ALUOp::And => 0b00000,
+ ALUOp::Bic => 0b00010,
+ ALUOp::Orr => 0b00100,
+ ALUOp::Orn => 0b00110,
+ ALUOp::Eor => 0b01000,
+ ALUOp::Add => 0b10000,
+ ALUOp::Adds => 0b10001,
+ ALUOp::Adc => 0b10100,
+ ALUOp::Adcs => 0b10101,
+ ALUOp::Sbc => 0b10110,
+ ALUOp::Sbcs => 0b10111,
+ ALUOp::Sub => 0b11010,
+ ALUOp::Subs => 0b11011,
+ ALUOp::Rsb => 0b11100,
+ _ => panic!("Invalid ALUOp {:?} in RRImm8 form!", alu_op),
+ };
+ let imm8 = imm8.bits();
+ let inst = 0b11110_0_0_00000_0000_0_000_0000_00000000 | imm8 | (bits_24_20 << 20);
+ let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, Some(rn));
+ emit_32(inst, sink);
+ }
+ &Inst::AluRImm8 { alu_op, rd, imm8 } => {
+ let bits_24_20 = match alu_op {
+ ALUOp1::Mvn => 0b00110,
+ ALUOp1::Mov => 0b00100,
+ };
+ let imm8 = imm8.bits();
+ let inst = 0b11110_0_0_00000_1111_0_000_0000_00000000 | imm8 | (bits_24_20 << 20);
+ let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, None);
+ emit_32(inst, sink);
+ }
+ &Inst::BitOpRR { bit_op, rd, rm } => {
+ let (bits_22_20, bits_7_4) = match bit_op {
+ BitOp::Rbit => (0b001, 0b1010),
+ BitOp::Rev => (0b001, 0b1000),
+ BitOp::Clz => (0b011, 0b1000),
+ };
+ let inst =
+ 0b111110101_000_0000_1111_0000_0000_0000 | (bits_22_20 << 20) | (bits_7_4 << 4);
+ let inst = enc_32_regs(inst, Some(rm), Some(rd.to_reg()), None, Some(rm));
+ emit_32(inst, sink);
+ }
+ &Inst::Mov { rd, rm } => {
+ sink.put2(enc_16_mov(rd, rm));
+ }
+ &Inst::MovImm16 { rd, imm16 } => {
+ emit_32(enc_32_r_imm16(0b11110_0_100100, rd.to_reg(), imm16), sink);
+ }
+ &Inst::Movt { rd, imm16 } => {
+ emit_32(enc_32_r_imm16(0b11110_0_101100, rd.to_reg(), imm16), sink);
+ }
+ &Inst::Cmp { rn, rm } => {
+ // Check which 16-bit encoding is allowed.
+ if machreg_is_lo(rn) && machreg_is_lo(rm) {
+ sink.put2(enc_16_rr(0b0100001010, rn, rm));
+ } else {
+ sink.put2(enc_16_rr_any(0b01000101, rn, rm));
+ }
+ }
+ &Inst::CmpImm8 { rn, imm8 } => {
+ let inst = 0b11110_0_011011_0000_0_000_1111_00000000 | u32::from(imm8);
+ let inst = enc_32_regs(inst, None, None, None, Some(rn));
+ emit_32(inst, sink);
+ }
+ &Inst::Store { rt, ref mem, bits } => {
+ let (mem_insts, mem) = mem_finalize(mem, state);
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ // Register the offset at which the store instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ match mem {
+ AMode::RegReg(rn, rm, imm2) => {
+ let bits_24_20 = match bits {
+ 32 => 0b00100,
+ 16 => 0b00010,
+ 8 => 0b00000,
+ _ => panic!("Unsupported store case {:?}", self),
+ };
+ emit_32(enc_32_mem_r(bits_24_20, rt, rn, rm, imm2), sink);
+ }
+ AMode::RegOffset12(rn, off12) => {
+ let bits_24_20 = match bits {
+ 32 => 0b01100,
+ 16 => 0b01010,
+ 8 => 0b01000,
+ _ => panic!("Unsupported store case {:?}", self),
+ };
+ emit_32(enc_32_mem_off12(bits_24_20, rt, rn, off12), sink);
+ }
+ AMode::PCRel(_) => panic!("Unsupported store case {:?}", self),
+ _ => unreachable!(),
+ }
+ }
+ &Inst::Load {
+ rt,
+ ref mem,
+ bits,
+ sign_extend,
+ } => {
+ let (mem_insts, mem) = mem_finalize(mem, state);
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() {
+ // Register the offset at which the load instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+ match mem {
+ AMode::RegReg(rn, rm, imm2) => {
+ let bits_24_20 = match (bits, sign_extend) {
+ (32, _) => 0b00101,
+ (16, true) => 0b10011,
+ (16, false) => 0b00011,
+ (8, true) => 0b10001,
+ (8, false) => 0b00001,
+ _ => panic!("Unsupported load case {:?}", self),
+ };
+ emit_32(enc_32_mem_r(bits_24_20, rt.to_reg(), rn, rm, imm2), sink);
+ }
+ AMode::RegOffset12(rn, off12) => {
+ let bits_24_20 = match (bits, sign_extend) {
+ (32, _) => 0b01101,
+ (16, true) => 0b11011,
+ (16, false) => 0b01011,
+ (8, true) => 0b11001,
+ (8, false) => 0b01001,
+ _ => panic!("Unsupported load case {:?}", self),
+ };
+ emit_32(enc_32_mem_off12(bits_24_20, rt.to_reg(), rn, off12), sink);
+ }
+ AMode::PCRel(off12) => {
+ let mut bits_24_20 = match (bits, sign_extend) {
+ (32, _) => 0b00101,
+ (16, true) => 0b10011,
+ (16, false) => 0b00011,
+ (8, true) => 0b10001,
+ (8, false) => 0b00001,
+ _ => panic!("Unsupported load case {:?}", self),
+ };
+ let (u, off12) = if off12 > 0 { (1, off12) } else { (0, -off12) };
+ let off12 = UImm12::maybe_from_i64(i64::from(off12)).unwrap();
+ bits_24_20 |= u << 3;
+
+ emit_32(
+ enc_32_mem_off12(bits_24_20, rt.to_reg(), pc_reg(), off12),
+ sink,
+ );
+ }
+ _ => unreachable!(),
+ }
+ }
+ &Inst::LoadAddr { rd, ref mem } => {
+ let (mem_insts, mem) = mem_finalize(mem, state);
+ for inst in mem_insts.into_iter() {
+ inst.emit(sink, emit_info, state);
+ }
+ let inst = match mem {
+ AMode::RegReg(reg1, reg2, shift) => {
+ let shift = u32::from(shift);
+ let shift_amt = ShiftOpShiftImm::maybe_from_shift(shift).unwrap();
+ let shift = ShiftOpAndAmt::new(ShiftOp::LSL, shift_amt);
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add,
+ rd,
+ rn: reg1,
+ rm: reg2,
+ shift: Some(shift),
+ }
+ }
+ AMode::RegOffset12(reg, imm12) => Inst::AluRRImm12 {
+ alu_op: ALUOp::Add,
+ rd,
+ rn: reg,
+ imm12,
+ },
+ AMode::PCRel(off12) => {
+ let (off12, alu_op) = if off12 > 0 {
+ (off12, ALUOp::Add)
+ } else {
+ (-off12, ALUOp::Sub)
+ };
+ let imm12 = UImm12::maybe_from_i64(i64::from(off12)).unwrap();
+ Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn: pc_reg(),
+ imm12,
+ }
+ }
+ _ => unreachable!(),
+ };
+ inst.emit(sink, emit_info, state);
+ }
+ &Inst::Extend {
+ rd,
+ rm,
+ from_bits,
+ signed,
+ } if from_bits >= 8 => {
+ let rd = rd.to_reg();
+ if machreg_is_lo(rd) && machreg_is_lo(rm) {
+ let bits_15_9 = match (from_bits, signed) {
+ (16, true) => 0b1011001000,
+ (16, false) => 0b1011001010,
+ (8, true) => 0b1011001001,
+ (8, false) => 0b1011001011,
+ _ => panic!("Unsupported Extend case: {:?}", self),
+ };
+ sink.put2(enc_16_rr(bits_15_9, rd, rm));
+ } else {
+ let bits_22_20 = match (from_bits, signed) {
+ (16, true) => 0b000,
+ (16, false) => 0b001,
+ (8, true) => 0b100,
+ (8, false) => 0b101,
+ _ => panic!("Unsupported Extend case: {:?}", self),
+ };
+ let inst = 0b111110100_000_11111111_0000_1000_0000 | (bits_22_20 << 20);
+ let inst = enc_32_regs(inst, Some(rm), Some(rd), None, None);
+ emit_32(inst, sink);
+ }
+ }
+ &Inst::Extend {
+ rd,
+ rm,
+ from_bits,
+ signed,
+ } if from_bits == 1 => {
+ let inst = Inst::AluRRImm8 {
+ alu_op: ALUOp::And,
+ rd,
+ rn: rm,
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ };
+ inst.emit(sink, emit_info, state);
+
+ if signed {
+ let inst = Inst::AluRRImm8 {
+ alu_op: ALUOp::Rsb,
+ rd,
+ rn: rd.to_reg(),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ };
+ inst.emit(sink, emit_info, state);
+ }
+ }
+ &Inst::Extend { .. } => {
+ panic!("Unsupported extend variant");
+ }
+ &Inst::It { cond, ref insts } => {
+ assert!(1 <= insts.len() && insts.len() <= 4);
+ assert!(insts[0].then);
+
+ sink.put2(enc_16_it(cond, insts));
+ for inst in insts.iter() {
+ inst.inst.emit(sink, emit_info, state);
+ }
+ }
+ &Inst::Push { ref reg_list } => match reg_list.len() {
+ 0 => panic!("Unsupported Push case: {:?}", self),
+ 1 => {
+ let reg = u32::from(machreg_to_gpr(reg_list[0]));
+ let inst: u32 = 0b1111100001001101_0000_110100000100 | (reg << 12);
+ emit_32(inst, sink);
+ }
+ _ => {
+ let mut inst: u32 = 0b1110100100101101 << 16;
+ for reg in reg_list {
+ inst |= 1 << machreg_to_gpr(*reg);
+ }
+ if inst & ((1 << 13) | (1 << 15)) != 0 {
+ panic!("Unsupported Push case: {:?}", self);
+ }
+ emit_32(inst, sink);
+ }
+ },
+ &Inst::Pop { ref reg_list } => match reg_list.len() {
+ 0 => panic!("Unsupported Pop case: {:?}", self),
+ 1 => {
+ let reg = u32::from(machreg_to_gpr(reg_list[0].to_reg()));
+ let inst: u32 = 0b1111100001011101_0000_101100000100 | (reg << 12);
+ emit_32(inst, sink);
+ }
+ _ => {
+ let mut inst: u32 = 0b1110100010111101 << 16;
+ for reg in reg_list {
+ inst |= 1 << machreg_to_gpr(reg.to_reg());
+ }
+ if (inst & (1 << 14) != 0) && (inst & (1 << 15) != 0) {
+ panic!("Unsupported Pop case: {:?}", self);
+ }
+ emit_32(inst, sink);
+ }
+ },
+ &Inst::Call { ref info } => {
+ let srcloc = state.cur_srcloc();
+ sink.add_reloc(srcloc, Reloc::Arm32Call, &info.dest, 0);
+ emit_32(0b11110_0_0000000000_11_0_1_0_00000000000, sink);
+ if info.opcode.is_call() {
+ sink.add_call_site(srcloc, info.opcode);
+ }
+ }
+ &Inst::CallInd { ref info } => {
+ let srcloc = state.cur_srcloc();
+ sink.put2(0b01000111_1_0000_000 | (machreg_to_gpr(info.rm) << 3));
+ if info.opcode.is_call() {
+ sink.add_call_site(srcloc, info.opcode);
+ }
+ }
+ &Inst::LoadExtName {
+ rt,
+ ref name,
+ offset,
+ } => {
+ // maybe nop2 (0|2) bytes (pc is now 4-aligned)
+ // ldr rt, [pc, #4] 4 bytes
+ // b continue 4 bytes
+ // addr 4 bytes
+ // continue:
+ //
+ if start_off & 0x3 != 0 {
+ Inst::Nop2.emit(sink, emit_info, state);
+ }
+ assert_eq!(sink.cur_offset() & 0x3, 0);
+
+ let mem = AMode::PCRel(4);
+ let inst = Inst::Load {
+ rt,
+ mem,
+ bits: 32,
+ sign_extend: false,
+ };
+ inst.emit(sink, emit_info, state);
+
+ let inst = Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(4),
+ };
+ inst.emit(sink, emit_info, state);
+
+ let srcloc = state.cur_srcloc();
+ sink.add_reloc(srcloc, Reloc::Abs4, name, offset.into());
+ sink.put4(0);
+ }
+ &Inst::Ret => {
+ sink.put2(0b010001110_1110_000); // bx lr
+ }
+ &Inst::Jump { dest } => {
+ let off = sink.cur_offset();
+ // Indicate that the jump uses a label, if so, so that a fixup can occur later.
+ if let Some(l) = dest.as_label() {
+ sink.use_label_at_offset(off, l, LabelUse::Branch24);
+ sink.add_uncond_branch(off, off + 4, l);
+ }
+ emit_32(enc_32_jump(dest), sink);
+ }
+ &Inst::CondBr {
+ taken,
+ not_taken,
+ cond,
+ } => {
+ // Conditional part first.
+ let cond_off = sink.cur_offset();
+ if let Some(l) = taken.as_label() {
+ let label_use = LabelUse::Branch20;
+ sink.use_label_at_offset(cond_off, l, label_use);
+ let inverted = enc_32_cond_branch(cond.invert(), taken);
+ let inverted = u32_swap_halfwords(inverted).to_le_bytes();
+ sink.add_cond_branch(cond_off, cond_off + 4, l, &inverted[..]);
+ }
+ emit_32(enc_32_cond_branch(cond, taken), sink);
+
+ // Unconditional part.
+ let uncond_off = sink.cur_offset();
+ if let Some(l) = not_taken.as_label() {
+ sink.use_label_at_offset(uncond_off, l, LabelUse::Branch24);
+ sink.add_uncond_branch(uncond_off, uncond_off + 4, l);
+ }
+ emit_32(enc_32_jump(not_taken), sink);
+ }
+ &Inst::IndirectBr { rm, .. } => {
+ let inst = 0b010001110_0000_000 | (machreg_to_gpr(rm) << 3);
+ sink.put2(inst);
+ }
+ &Inst::Udf { trap_info } => {
+ let srcloc = state.cur_srcloc();
+ let code = trap_info;
+ sink.add_trap(srcloc, code);
+ sink.put2(0b11011110_00000000);
+ }
+ &Inst::Bkpt => {
+ sink.put2(0b10111110_00000000);
+ }
+ &Inst::TrapIf { cond, trap_info } => {
+ let cond = cond.invert();
+ let dest = BranchTarget::ResolvedOffset(2);
+ emit_32(enc_32_cond_branch(cond, dest), sink);
+
+ let trap = Inst::Udf { trap_info };
+ trap.emit(sink, emit_info, state);
+ }
+ &Inst::VirtualSPOffsetAdj { offset } => {
+ debug!(
+ "virtual sp offset adjusted by {} -> {}",
+ offset,
+ state.virtual_sp_offset + offset,
+ );
+ state.virtual_sp_offset += offset;
+ }
+ }
+
+ let end_off = sink.cur_offset();
+ debug_assert!((end_off - start_off) <= Inst::worst_case_size());
+ }
+
+ fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String {
+ self.print_with_state(mb_rru, state)
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs
new file mode 100644
index 0000000000..73269be999
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs
@@ -0,0 +1,1959 @@
+use crate::isa::arm32::inst::*;
+use crate::isa::test_utils;
+use crate::settings;
+
+use alloc::vec::Vec;
+
+#[test]
+fn test_arm32_emit() {
+ let flags = settings::Flags::new(settings::builder());
+ let mut insns = Vec::<(Inst, &str, &str)>::new();
+
+ // Expected encodings are given in little-endian byte order.
+ insns.push((Inst::Nop0, "", "nop-zero-len"));
+ insns.push((Inst::Nop2, "00BF", "nop"));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsl,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "01FA02F0",
+ "lsl r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsl,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "09FA0AF8",
+ "lsl r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsr,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "21FA02F0",
+ "lsr r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Lsr,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "29FA0AF8",
+ "lsr r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Asr,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "41FA02F0",
+ "asr r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Asr,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "49FA0AF8",
+ "asr r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Ror,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "61FA02F0",
+ "ror r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Ror,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "69FA0AF8",
+ "ror r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Qadd,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "81FA82F0",
+ "qadd r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Qadd,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "89FA8AF8",
+ "qadd r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Qsub,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "81FAA2F0",
+ "qsub r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Qsub,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "89FAAAF8",
+ "qsub r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Mul,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "01FB02F0",
+ "mul r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Mul,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "09FB0AF8",
+ "mul r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Udiv,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "B1FBF2F0",
+ "udiv r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Udiv,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "B9FBFAF8",
+ "udiv r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Sdiv,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ },
+ "91FBF2F0",
+ "sdiv r0, r1, r2",
+ ));
+ insns.push((
+ Inst::AluRRR {
+ alu_op: ALUOp::Sdiv,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ },
+ "99FBFAF8",
+ "sdiv r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::And,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "01EAC250",
+ "and r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::And,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "09EA0A08",
+ "and r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Bic,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "21EAC250",
+ "bic r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Bic,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "29EA0A08",
+ "bic r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orr,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "41EAC250",
+ "orr r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orr,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "49EA0A08",
+ "orr r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orn,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "61EAC250",
+ "orn r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Orn,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "69EA0A08",
+ "orn r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Eor,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "81EAC250",
+ "eor r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Eor,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "89EA0A08",
+ "eor r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "01EBC250",
+ "add r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "09EB0A08",
+ "add r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adds,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "11EBC250",
+ "adds r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adds,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "19EB0A08",
+ "adds r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adc,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "41EBC250",
+ "adc r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adc,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "49EB0A08",
+ "adc r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adcs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "51EBC250",
+ "adcs r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Adcs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "59EB0A08",
+ "adcs r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sbc,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "61EBC250",
+ "sbc r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sbc,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "69EB0A08",
+ "sbc r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sbcs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "71EBC250",
+ "sbcs r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sbcs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "79EB0A08",
+ "sbcs r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "A1EBC250",
+ "sub r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "A9EB0A08",
+ "sub r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Subs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "B1EBC250",
+ "subs r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Subs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "B9EB0A08",
+ "subs r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Rsb,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ rm: rreg(2),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(23).unwrap(),
+ )),
+ },
+ "C1EBC250",
+ "rsb r0, r1, r2, lsl #23",
+ ));
+ insns.push((
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Rsb,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ rm: rreg(10),
+ shift: None,
+ },
+ "C9EB0A08",
+ "rsb r8, r9, r10",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mvn,
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(11).unwrap(),
+ )),
+ },
+ "6FEAC120",
+ "mvn r0, r1, lsl #11",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mvn,
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ shift: None,
+ },
+ "6FEA0908",
+ "mvn r8, r9",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(11).unwrap(),
+ )),
+ },
+ "4FEAC120",
+ "mov r0, r1, lsl #11",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(2),
+ rm: rreg(8),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::LSR,
+ ShiftOpShiftImm::maybe_from_shift(27).unwrap(),
+ )),
+ },
+ "4FEAD862",
+ "mov r2, r8, lsr #27",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(9),
+ rm: rreg(3),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::ASR,
+ ShiftOpShiftImm::maybe_from_shift(3).unwrap(),
+ )),
+ },
+ "4FEAE309",
+ "mov r9, r3, asr #3",
+ ));
+ insns.push((
+ Inst::AluRRShift {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(10),
+ rm: rreg(11),
+ shift: Some(ShiftOpAndAmt::new(
+ ShiftOp::ROR,
+ ShiftOpShiftImm::maybe_from_shift(7).unwrap(),
+ )),
+ },
+ "4FEAFB1A",
+ "mov r10, fp, ror #7",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp::Smull,
+ rd_lo: writable_rreg(0),
+ rd_hi: writable_rreg(1),
+ rn: rreg(2),
+ rm: rreg(3),
+ },
+ "82FB0301",
+ "smull r0, r1, r2, r3",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp::Smull,
+ rd_lo: writable_rreg(8),
+ rd_hi: writable_rreg(9),
+ rn: rreg(10),
+ rm: rreg(11),
+ },
+ "8AFB0B89",
+ "smull r8, r9, r10, fp",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp::Umull,
+ rd_lo: writable_rreg(0),
+ rd_hi: writable_rreg(1),
+ rn: rreg(2),
+ rm: rreg(3),
+ },
+ "A2FB0301",
+ "umull r0, r1, r2, r3",
+ ));
+ insns.push((
+ Inst::AluRRRR {
+ alu_op: ALUOp::Umull,
+ rd_lo: writable_rreg(8),
+ rd_hi: writable_rreg(9),
+ rn: rreg(10),
+ rm: rreg(11),
+ },
+ "AAFB0B89",
+ "umull r8, r9, r10, fp",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm12: UImm12::maybe_from_i64(4095).unwrap(),
+ },
+ "01F6FF70",
+ "add r0, r1, #4095",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm12: UImm12::maybe_from_i64(0).unwrap(),
+ },
+ "09F20008",
+ "add r8, r9, #0",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm12: UImm12::maybe_from_i64(1999).unwrap(),
+ },
+ "A1F2CF70",
+ "sub r0, r1, #1999",
+ ));
+ insns.push((
+ Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm12: UImm12::maybe_from_i64(101).unwrap(),
+ },
+ "A9F26508",
+ "sub r8, r9, #101",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::And,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "01F0FF00",
+ "and r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::And,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "09F00108",
+ "and r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Bic,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "21F0FF00",
+ "bic r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Bic,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "29F00108",
+ "bic r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Orr,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "41F0FF00",
+ "orr r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Orr,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "49F00108",
+ "orr r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Orn,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "61F0FF00",
+ "orn r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Orn,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "69F00108",
+ "orn r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Eor,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "81F0FF00",
+ "eor r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Eor,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "89F00108",
+ "eor r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "01F1FF00",
+ "add r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Add,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "09F10108",
+ "add r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adds,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "11F1FF00",
+ "adds r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adds,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "19F10108",
+ "adds r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adc,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "41F1FF00",
+ "adc r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adc,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "49F10108",
+ "adc r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adcs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "51F1FF00",
+ "adcs r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Adcs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "59F10108",
+ "adcs r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sbc,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "61F1FF00",
+ "sbc r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sbc,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "69F10108",
+ "sbc r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sbcs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "71F1FF00",
+ "sbcs r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sbcs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "79F10108",
+ "sbcs r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "A1F1FF00",
+ "sub r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Sub,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "A9F10108",
+ "sub r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Subs,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "B1F1FF00",
+ "subs r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Subs,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "B9F10108",
+ "subs r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Rsb,
+ rd: writable_rreg(0),
+ rn: rreg(1),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "C1F1FF00",
+ "rsb r0, r1, #255",
+ ));
+ insns.push((
+ Inst::AluRRImm8 {
+ alu_op: ALUOp::Rsb,
+ rd: writable_rreg(8),
+ rn: rreg(9),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "C9F10108",
+ "rsb r8, r9, #1",
+ ));
+ insns.push((
+ Inst::AluRImm8 {
+ alu_op: ALUOp1::Mvn,
+ rd: writable_rreg(0),
+ imm8: UImm8::maybe_from_i64(255).unwrap(),
+ },
+ "6FF0FF00",
+ "mvn r0, #255",
+ ));
+ insns.push((
+ Inst::AluRImm8 {
+ alu_op: ALUOp1::Mvn,
+ rd: writable_rreg(8),
+ imm8: UImm8::maybe_from_i64(1).unwrap(),
+ },
+ "6FF00108",
+ "mvn r8, #1",
+ ));
+ insns.push((
+ Inst::AluRImm8 {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(0),
+ imm8: UImm8::maybe_from_i64(0).unwrap(),
+ },
+ "4FF00000",
+ "mov r0, #0",
+ ));
+ insns.push((
+ Inst::AluRImm8 {
+ alu_op: ALUOp1::Mov,
+ rd: writable_rreg(8),
+ imm8: UImm8::maybe_from_i64(176).unwrap(),
+ },
+ "4FF0B008",
+ "mov r8, #176",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Rbit,
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ },
+ "91FAA1F0",
+ "rbit r0, r1",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Rbit,
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ },
+ "99FAA9F8",
+ "rbit r8, r9",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Rev,
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ },
+ "91FA81F0",
+ "rev r0, r1",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Rev,
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ },
+ "99FA89F8",
+ "rev r8, r9",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Clz,
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ },
+ "B1FA81F0",
+ "clz r0, r1",
+ ));
+ insns.push((
+ Inst::BitOpRR {
+ bit_op: BitOp::Clz,
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ },
+ "B9FA89F8",
+ "clz r8, r9",
+ ));
+ insns.push((
+ Inst::Mov {
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ },
+ "0846",
+ "mov r0, r1",
+ ));
+ insns.push((
+ Inst::Mov {
+ rd: writable_rreg(2),
+ rm: rreg(8),
+ },
+ "4246",
+ "mov r2, r8",
+ ));
+ insns.push((
+ Inst::Mov {
+ rd: writable_rreg(9),
+ rm: rreg(3),
+ },
+ "9946",
+ "mov r9, r3",
+ ));
+ insns.push((
+ Inst::Mov {
+ rd: writable_rreg(10),
+ rm: rreg(11),
+ },
+ "DA46",
+ "mov r10, fp",
+ ));
+ insns.push((
+ Inst::MovImm16 {
+ rd: writable_rreg(0),
+ imm16: 0,
+ },
+ "40F20000",
+ "mov r0, #0",
+ ));
+ insns.push((
+ Inst::MovImm16 {
+ rd: writable_rreg(1),
+ imm16: 15,
+ },
+ "40F20F01",
+ "mov r1, #15",
+ ));
+ insns.push((
+ Inst::MovImm16 {
+ rd: writable_rreg(2),
+ imm16: 255,
+ },
+ "40F2FF02",
+ "mov r2, #255",
+ ));
+ insns.push((
+ Inst::MovImm16 {
+ rd: writable_rreg(8),
+ imm16: 4095,
+ },
+ "40F6FF78",
+ "mov r8, #4095",
+ ));
+ insns.push((
+ Inst::MovImm16 {
+ rd: writable_rreg(9),
+ imm16: 65535,
+ },
+ "4FF6FF79",
+ "mov r9, #65535",
+ ));
+ insns.push((
+ Inst::Movt {
+ rd: writable_rreg(0),
+ imm16: 0,
+ },
+ "C0F20000",
+ "movt r0, #0",
+ ));
+ insns.push((
+ Inst::Movt {
+ rd: writable_rreg(1),
+ imm16: 15,
+ },
+ "C0F20F01",
+ "movt r1, #15",
+ ));
+ insns.push((
+ Inst::Movt {
+ rd: writable_rreg(2),
+ imm16: 255,
+ },
+ "C0F2FF02",
+ "movt r2, #255",
+ ));
+ insns.push((
+ Inst::Movt {
+ rd: writable_rreg(8),
+ imm16: 4095,
+ },
+ "C0F6FF78",
+ "movt r8, #4095",
+ ));
+ insns.push((
+ Inst::Movt {
+ rd: writable_rreg(9),
+ imm16: 65535,
+ },
+ "CFF6FF79",
+ "movt r9, #65535",
+ ));
+ insns.push((
+ Inst::Cmp {
+ rn: rreg(0),
+ rm: rreg(1),
+ },
+ "8842",
+ "cmp r0, r1",
+ ));
+ insns.push((
+ Inst::Cmp {
+ rn: rreg(2),
+ rm: rreg(8),
+ },
+ "4245",
+ "cmp r2, r8",
+ ));
+ insns.push((
+ Inst::Cmp {
+ rn: rreg(9),
+ rm: rreg(3),
+ },
+ "9945",
+ "cmp r9, r3",
+ ));
+ insns.push((
+ Inst::Cmp {
+ rn: rreg(10),
+ rm: rreg(11),
+ },
+ "DA45",
+ "cmp r10, fp",
+ ));
+ insns.push((
+ Inst::CmpImm8 {
+ rn: rreg(0),
+ imm8: 255,
+ },
+ "B0F1FF0F",
+ "cmp r0, #255",
+ ));
+ insns.push((
+ Inst::CmpImm8 {
+ rn: rreg(1),
+ imm8: 0,
+ },
+ "B1F1000F",
+ "cmp r1, #0",
+ ));
+ insns.push((
+ Inst::CmpImm8 {
+ rn: rreg(8),
+ imm8: 1,
+ },
+ "B8F1010F",
+ "cmp r8, #1",
+ ));
+
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 32,
+ },
+ "41F80200",
+ "str r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 3),
+ bits: 32,
+ },
+ "49F83A80",
+ "str r8, [r9, r10, lsl #3]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::RegOffset(rreg(1), 4095),
+ bits: 32,
+ },
+ "C1F8FF0F",
+ "str r0, [r1, #4095]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::RegOffset(rreg(9), 0),
+ bits: 32,
+ },
+ "C9F80080",
+ "str r8, [r9, #0]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(7),
+ mem: AMode::RegOffset(rreg(11), 65535),
+ bits: 32,
+ },
+ "4FF6FF7C4BF80C70",
+ "mov ip, #65535 ; str r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(10),
+ mem: AMode::RegOffset(rreg(4), 16777215),
+ bits: 32,
+ },
+ "4FF6FF7CC0F2FF0C44F80CA0",
+ "mov ip, #65535 ; movt ip, #255 ; str r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 16,
+ },
+ "21F80200",
+ "strh r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 2),
+ bits: 16,
+ },
+ "29F82A80",
+ "strh r8, [r9, r10, lsl #2]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::RegOffset(rreg(1), 3210),
+ bits: 16,
+ },
+ "A1F88A0C",
+ "strh r0, [r1, #3210]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::RegOffset(rreg(9), 1),
+ bits: 16,
+ },
+ "A9F80180",
+ "strh r8, [r9, #1]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(7),
+ mem: AMode::RegOffset(rreg(11), 65535),
+ bits: 16,
+ },
+ "4FF6FF7C2BF80C70",
+ "mov ip, #65535 ; strh r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(10),
+ mem: AMode::RegOffset(rreg(4), 16777215),
+ bits: 16,
+ },
+ "4FF6FF7CC0F2FF0C24F80CA0",
+ "mov ip, #65535 ; movt ip, #255 ; strh r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 8,
+ },
+ "01F80200",
+ "strb r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 1),
+ bits: 8,
+ },
+ "09F81A80",
+ "strb r8, [r9, r10, lsl #1]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(0),
+ mem: AMode::RegOffset(rreg(1), 4),
+ bits: 8,
+ },
+ "81F80400",
+ "strb r0, [r1, #4]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(8),
+ mem: AMode::RegOffset(rreg(9), 777),
+ bits: 8,
+ },
+ "89F80983",
+ "strb r8, [r9, #777]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(7),
+ mem: AMode::RegOffset(rreg(11), 65535),
+ bits: 8,
+ },
+ "4FF6FF7C0BF80C70",
+ "mov ip, #65535 ; strb r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Store {
+ rt: rreg(10),
+ mem: AMode::RegOffset(rreg(4), 16777215),
+ bits: 8,
+ },
+ "4FF6FF7CC0F2FF0C04F80CA0",
+ "mov ip, #65535 ; movt ip, #255 ; strb r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 32,
+ sign_extend: false,
+ },
+ "51F80200",
+ "ldr r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 1),
+ bits: 32,
+ sign_extend: false,
+ },
+ "59F81A80",
+ "ldr r8, [r9, r10, lsl #1]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::RegOffset(rreg(1), 55),
+ bits: 32,
+ sign_extend: false,
+ },
+ "D1F83700",
+ "ldr r0, [r1, #55]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::RegOffset(rreg(9), 1234),
+ bits: 32,
+ sign_extend: false,
+ },
+ "D9F8D284",
+ "ldr r8, [r9, #1234]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(7),
+ mem: AMode::RegOffset(rreg(11), 9876),
+ bits: 32,
+ sign_extend: false,
+ },
+ "42F2946C5BF80C70",
+ "mov ip, #9876 ; ldr r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(10),
+ mem: AMode::RegOffset(rreg(4), 252645135),
+ bits: 32,
+ sign_extend: false,
+ },
+ "40F60F7CC0F60F7C54F80CA0",
+ "mov ip, #3855 ; movt ip, #3855 ; ldr r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::PCRel(-56),
+ bits: 32,
+ sign_extend: false,
+ },
+ "5FF83800",
+ "ldr r0, [pc, #-56]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::PCRel(1024),
+ bits: 32,
+ sign_extend: false,
+ },
+ "DFF80084",
+ "ldr r8, [pc, #1024]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 16,
+ sign_extend: true,
+ },
+ "31F90200",
+ "ldrsh r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 2),
+ bits: 16,
+ sign_extend: false,
+ },
+ "39F82A80",
+ "ldrh r8, [r9, r10, lsl #2]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::RegOffset(rreg(1), 55),
+ bits: 16,
+ sign_extend: false,
+ },
+ "B1F83700",
+ "ldrh r0, [r1, #55]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::RegOffset(rreg(9), 1234),
+ bits: 16,
+ sign_extend: true,
+ },
+ "B9F9D284",
+ "ldrsh r8, [r9, #1234]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(7),
+ mem: AMode::RegOffset(rreg(11), 9876),
+ bits: 16,
+ sign_extend: true,
+ },
+ "42F2946C3BF90C70",
+ "mov ip, #9876 ; ldrsh r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(10),
+ mem: AMode::RegOffset(rreg(4), 252645135),
+ bits: 16,
+ sign_extend: false,
+ },
+ "40F60F7CC0F60F7C34F80CA0",
+ "mov ip, #3855 ; movt ip, #3855 ; ldrh r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::PCRel(56),
+ bits: 16,
+ sign_extend: false,
+ },
+ "BFF83800",
+ "ldrh r0, [pc, #56]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::PCRel(-1000),
+ bits: 16,
+ sign_extend: true,
+ },
+ "3FF9E883",
+ "ldrsh r8, [pc, #-1000]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0),
+ bits: 8,
+ sign_extend: true,
+ },
+ "11F90200",
+ "ldrsb r0, [r1, r2]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::reg_plus_reg(rreg(9), rreg(10), 3),
+ bits: 8,
+ sign_extend: false,
+ },
+ "19F83A80",
+ "ldrb r8, [r9, r10, lsl #3]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::RegOffset(rreg(1), 55),
+ bits: 8,
+ sign_extend: false,
+ },
+ "91F83700",
+ "ldrb r0, [r1, #55]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::RegOffset(rreg(9), 1234),
+ bits: 8,
+ sign_extend: true,
+ },
+ "99F9D284",
+ "ldrsb r8, [r9, #1234]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(7),
+ mem: AMode::RegOffset(rreg(11), 9876),
+ bits: 8,
+ sign_extend: true,
+ },
+ "42F2946C1BF90C70",
+ "mov ip, #9876 ; ldrsb r7, [fp, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(10),
+ mem: AMode::RegOffset(rreg(4), 252645135),
+ bits: 8,
+ sign_extend: false,
+ },
+ "40F60F7CC0F60F7C14F80CA0",
+ "mov ip, #3855 ; movt ip, #3855 ; ldrb r10, [r4, ip]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(0),
+ mem: AMode::PCRel(72),
+ bits: 8,
+ sign_extend: false,
+ },
+ "9FF84800",
+ "ldrb r0, [pc, #72]",
+ ));
+ insns.push((
+ Inst::Load {
+ rt: writable_rreg(8),
+ mem: AMode::PCRel(-1234),
+ bits: 8,
+ sign_extend: true,
+ },
+ "1FF9D284",
+ "ldrsb r8, [pc, #-1234]",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ from_bits: 16,
+ signed: false,
+ },
+ "88B2",
+ "uxth r0, r1",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ from_bits: 16,
+ signed: false,
+ },
+ "1FFA89F8",
+ "uxth r8, r9",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ from_bits: 8,
+ signed: false,
+ },
+ "C8B2",
+ "uxtb r0, r1",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ from_bits: 8,
+ signed: false,
+ },
+ "5FFA89F8",
+ "uxtb r8, r9",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ from_bits: 16,
+ signed: true,
+ },
+ "08B2",
+ "sxth r0, r1",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ from_bits: 16,
+ signed: true,
+ },
+ "0FFA89F8",
+ "sxth r8, r9",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(0),
+ rm: rreg(1),
+ from_bits: 8,
+ signed: true,
+ },
+ "48B2",
+ "sxtb r0, r1",
+ ));
+ insns.push((
+ Inst::Extend {
+ rd: writable_rreg(8),
+ rm: rreg(9),
+ from_bits: 8,
+ signed: true,
+ },
+ "4FFA89F8",
+ "sxtb r8, r9",
+ ));
+ insns.push((
+ Inst::It {
+ cond: Cond::Eq,
+ insts: vec![CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true)],
+ },
+ "08BF0046",
+ "it eq ; mov r0, r0",
+ ));
+ insns.push((
+ Inst::It {
+ cond: Cond::Ne,
+ insts: vec![
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false),
+ ],
+ },
+ "14BF00460046",
+ "ite ne ; mov r0, r0 ; mov r0, r0",
+ ));
+ insns.push((
+ Inst::It {
+ cond: Cond::Lt,
+ insts: vec![
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true),
+ ],
+ },
+ "B6BF004600460046",
+ "itet lt ; mov r0, r0 ; mov r0, r0 ; mov r0, r0",
+ ));
+ insns.push((
+ Inst::It {
+ cond: Cond::Hs,
+ insts: vec![
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false),
+ CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false),
+ ],
+ },
+ "27BF0046004600460046",
+ "ittee hs ; mov r0, r0 ; mov r0, r0 ; mov r0, r0 ; mov r0, r0",
+ ));
+ insns.push((
+ Inst::Push {
+ reg_list: vec![rreg(0)],
+ },
+ "4DF8040D",
+ "push {r0}",
+ ));
+ insns.push((
+ Inst::Push {
+ reg_list: vec![rreg(8)],
+ },
+ "4DF8048D",
+ "push {r8}",
+ ));
+ insns.push((
+ Inst::Push {
+ reg_list: vec![rreg(0), rreg(1), rreg(2), rreg(6), rreg(8)],
+ },
+ "2DE94701",
+ "push {r0, r1, r2, r6, r8}",
+ ));
+ insns.push((
+ Inst::Push {
+ reg_list: vec![rreg(8), rreg(9), rreg(10)],
+ },
+ "2DE90007",
+ "push {r8, r9, r10}",
+ ));
+ insns.push((
+ Inst::Pop {
+ reg_list: vec![writable_rreg(0)],
+ },
+ "5DF8040B",
+ "pop {r0}",
+ ));
+ insns.push((
+ Inst::Pop {
+ reg_list: vec![writable_rreg(8)],
+ },
+ "5DF8048B",
+ "pop {r8}",
+ ));
+ insns.push((
+ Inst::Pop {
+ reg_list: vec![
+ writable_rreg(0),
+ writable_rreg(1),
+ writable_rreg(2),
+ writable_rreg(6),
+ writable_rreg(8),
+ ],
+ },
+ "BDE84701",
+ "pop {r0, r1, r2, r6, r8}",
+ ));
+ insns.push((
+ Inst::Pop {
+ reg_list: vec![writable_rreg(8), writable_rreg(9), writable_rreg(10)],
+ },
+ "BDE80007",
+ "pop {r8, r9, r10}",
+ ));
+ insns.push((
+ Inst::Call {
+ info: Box::new(CallInfo {
+ dest: ExternalName::testcase("test0"),
+ uses: Vec::new(),
+ defs: Vec::new(),
+ loc: SourceLoc::default(),
+ opcode: Opcode::Call,
+ }),
+ },
+ "00F000D0",
+ "bl 0",
+ ));
+ insns.push((
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: rreg(0),
+ uses: Vec::new(),
+ defs: Vec::new(),
+ loc: SourceLoc::default(),
+ opcode: Opcode::CallIndirect,
+ }),
+ },
+ "8047",
+ "blx r0",
+ ));
+ insns.push((
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: rreg(8),
+ uses: Vec::new(),
+ defs: Vec::new(),
+ loc: SourceLoc::default(),
+ opcode: Opcode::CallIndirect,
+ }),
+ },
+ "C047",
+ "blx r8",
+ ));
+ insns.push((Inst::Ret, "7047", "bx lr"));
+ insns.push((
+ Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(32),
+ },
+ "00F010B8",
+ "b 32",
+ ));
+ insns.push((
+ Inst::Jump {
+ dest: BranchTarget::ResolvedOffset(0xfffff4),
+ },
+ "FFF3FA97",
+ "b 16777204",
+ ));
+ insns.push((
+ Inst::CondBr {
+ taken: BranchTarget::ResolvedOffset(20),
+ not_taken: BranchTarget::ResolvedOffset(68),
+ cond: Cond::Eq,
+ },
+ "00F00A8000F022B8",
+ "beq 20 ; b 68",
+ ));
+ insns.push((
+ Inst::CondBr {
+ taken: BranchTarget::ResolvedOffset(6),
+ not_taken: BranchTarget::ResolvedOffset(100),
+ cond: Cond::Gt,
+ },
+ "00F3038000F032B8",
+ "bgt 6 ; b 100",
+ ));
+ insns.push((
+ Inst::IndirectBr {
+ rm: rreg(0),
+ targets: vec![],
+ },
+ "0047",
+ "bx r0",
+ ));
+ insns.push((
+ Inst::IndirectBr {
+ rm: rreg(8),
+ targets: vec![],
+ },
+ "4047",
+ "bx r8",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ cond: Cond::Eq,
+ trap_info: TrapCode::Interrupt,
+ },
+ "40F0018000DE",
+ "bne 2 ; udf #0",
+ ));
+ insns.push((
+ Inst::TrapIf {
+ cond: Cond::Hs,
+ trap_info: TrapCode::Interrupt,
+ },
+ "C0F0018000DE",
+ "blo 2 ; udf #0",
+ ));
+ insns.push((
+ Inst::Udf {
+ trap_info: TrapCode::Interrupt,
+ },
+ "00DE",
+ "udf #0",
+ ));
+ insns.push((Inst::Bkpt, "00BE", "bkpt #0"));
+
+ // ========================================================
+ // Run the tests
+ let rru = regs::create_reg_universe();
+ for (insn, expected_encoding, expected_printing) in insns {
+ // Check the printed text is as expected.
+ let actual_printing = insn.show_rru(Some(&rru));
+ assert_eq!(expected_printing, actual_printing);
+ let mut sink = test_utils::TestCodeSink::new();
+ let mut buffer = MachBuffer::new();
+ insn.emit(&mut buffer, &flags, &mut Default::default());
+ let buffer = buffer.finish();
+ buffer.emit(&mut sink);
+ let actual_encoding = &sink.stringify();
+ assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing);
+ }
+}
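
The expected-encoding strings in the tests above are the emitted bytes printed as uppercase hex; the machine code itself is a sequence of little-endian 16-bit Thumb-2 halfwords. The standalone sketch below (illustrative only, not part of the vendored diff; the helper name is ours) reproduces that formatting for two entries from the table above.

fn hex_of_halfwords(halfwords: &[u16]) -> String {
    halfwords
        .iter()
        .flat_map(|hw| hw.to_le_bytes())
        .map(|b| format!("{:02X}", b))
        .collect()
}

fn main() {
    // "cmp r8, #1" above is the halfword pair 0xF1B8, 0x0F01, printed as "B8F1010F".
    assert_eq!(hex_of_halfwords(&[0xF1B8, 0x0F01]), "B8F1010F");
    // "bx lr" is the single halfword 0x4770, printed as "7047".
    assert_eq!(hex_of_halfwords(&[0x4770]), "7047");
}
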
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs
new file mode 100644
index 0000000000..fff01b7d82
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs
@@ -0,0 +1,1358 @@
+//! This module defines 32-bit ARM-specific machine instruction types.
+
+#![allow(dead_code)]
+
+use crate::binemit::CodeOffset;
+use crate::ir::types::{B1, B16, B32, B8, I16, I32, I8, IFLAGS};
+use crate::ir::{ExternalName, Opcode, TrapCode, Type};
+use crate::machinst::*;
+use crate::{settings, CodegenError, CodegenResult};
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable};
+use regalloc::{RegUsageCollector, RegUsageMapper};
+
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use smallvec::{smallvec, SmallVec};
+use std::string::{String, ToString};
+
+mod args;
+pub use self::args::*;
+mod emit;
+pub use self::emit::*;
+mod regs;
+pub use self::regs::*;
+pub mod unwind;
+
+#[cfg(test)]
+mod emit_tests;
+
+//=============================================================================
+// Instructions (top level): definition
+
+/// An ALU operation. This can be paired with several instruction formats
+/// below (see `Inst`) in any combination.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum ALUOp {
+ Add,
+ Adds,
+ Adc,
+ Adcs,
+ Qadd,
+ Sub,
+ Subs,
+ Sbc,
+ Sbcs,
+ Rsb,
+ Qsub,
+ Mul,
+ Smull,
+ Umull,
+ Udiv,
+ Sdiv,
+ And,
+ Orr,
+ Orn,
+ Eor,
+ Bic,
+ Lsl,
+ Lsr,
+ Asr,
+ Ror,
+}
+
+/// An ALU operation with one argument.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum ALUOp1 {
+ Mvn,
+ Mov,
+}
+
+/// An operation on the bits of a register.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum BitOp {
+ Rbit,
+ Rev,
+ Clz,
+}
+
+/// Additional information for (direct) Call instructions, kept out of line to reduce the size of
+/// the Inst enum.
+#[derive(Clone, Debug)]
+pub struct CallInfo {
+ pub dest: ExternalName,
+ pub uses: Vec<Reg>,
+ pub defs: Vec<Writable<Reg>>,
+ pub opcode: Opcode,
+}
+
+/// Additional information for CallInd instructions, kept out of line to reduce the size of the Inst
+/// enum.
+#[derive(Clone, Debug)]
+pub struct CallIndInfo {
+ pub rm: Reg,
+ pub uses: Vec<Reg>,
+ pub defs: Vec<Writable<Reg>>,
+ pub opcode: Opcode,
+}
+
+/// Instruction formats.
+#[derive(Clone, Debug)]
+pub enum Inst {
+ /// A no-op of zero size.
+ Nop0,
+
+ /// A no-op that is two bytes large.
+ Nop2,
+
+ /// An ALU operation with two register sources and one register destination.
+ AluRRR {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ },
+
+    /// An ALU operation with two register sources, one of which can optionally be shifted,
+ /// and one register destination.
+ AluRRRShift {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ shift: Option<ShiftOpAndAmt>,
+ },
+
+    /// An ALU operation with one register source, which can optionally be shifted,
+ /// and one register destination.
+ AluRRShift {
+ alu_op: ALUOp1,
+ rd: Writable<Reg>,
+ rm: Reg,
+ shift: Option<ShiftOpAndAmt>,
+ },
+
+ /// An ALU operation with two register sources and two register destinations.
+ AluRRRR {
+ alu_op: ALUOp,
+ rd_hi: Writable<Reg>,
+ rd_lo: Writable<Reg>,
+ rn: Reg,
+ rm: Reg,
+ },
+
+ /// An ALU operation with a register source and a 12-bit immediate source,
+ /// and a register destination.
+ AluRRImm12 {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ imm12: UImm12,
+ },
+
+    /// An ALU operation with a register source and an 8-bit immediate source,
+ /// and a register destination.
+ ///
+ /// In fact these instructions take a `modified immediate constant` operand,
+ /// which is encoded as a 12-bit immediate. The only case used here
+    /// is when the high 4 bits of that 12-bit immediate are zero;
+    /// the operand is then a simple 8-bit immediate.
+ /// For all possible operands see
+ /// https://static.docs.arm.com/ddi0406/c/DDI0406C_C_arm_architecture_reference_manual.pdf#G10.4954509
+ AluRRImm8 {
+ alu_op: ALUOp,
+ rd: Writable<Reg>,
+ rn: Reg,
+ imm8: UImm8,
+ },
+
+    /// An ALU operation with an 8-bit immediate and a register destination.
+ /// See `AluRRImm8` description above.
+ AluRImm8 {
+ alu_op: ALUOp1,
+ rd: Writable<Reg>,
+ imm8: UImm8,
+ },
+
+ /// A bit operation with a register source and a register destination.
+ BitOpRR {
+ bit_op: BitOp,
+ rd: Writable<Reg>,
+ rm: Reg,
+ },
+
+ /// A mov instruction with a GPR source and a GPR destination.
+ Mov {
+ rd: Writable<Reg>,
+ rm: Reg,
+ },
+
+ /// A move instruction with a 16-bit immediate source and a register destination.
+ MovImm16 {
+ rd: Writable<Reg>,
+ imm16: u16,
+ },
+
+    /// A move top instruction, which writes a 16-bit immediate to the top
+ /// halfword of the destination register.
+ Movt {
+ rd: Writable<Reg>,
+ imm16: u16,
+ },
+
+ /// A compare instruction with two register arguments.
+ Cmp {
+ rn: Reg,
+ rm: Reg,
+ },
+
+    /// A compare instruction with a register operand and an 8-bit immediate operand.
+ CmpImm8 {
+ rn: Reg,
+ imm8: u8,
+ },
+
+    /// A store instruction, which stores an 8-, 16- or 32-bit operand to memory.
+ Store {
+ rt: Reg,
+ mem: AMode,
+ bits: u8,
+ },
+
+    /// A load instruction, which loads an 8-, 16- or 32-bit operand from memory
+    /// and can sign- or zero-extend it.
+ Load {
+ rt: Writable<Reg>,
+ mem: AMode,
+ bits: u8,
+ sign_extend: bool,
+ },
+
+ /// Load address referenced by `mem` into `rd`.
+ LoadAddr {
+ rd: Writable<Reg>,
+ mem: AMode,
+ },
+
+ /// A sign- or zero-extend operation.
+ Extend {
+ rd: Writable<Reg>,
+ rm: Reg,
+ from_bits: u8,
+ signed: bool,
+ },
+
+    /// An If-Then instruction, which makes up to four instructions conditional.
+ It {
+ cond: Cond,
+ insts: Vec<CondInst>,
+ },
+
+    /// A push instruction, which stores registers to the stack and updates sp.
+ Push {
+ reg_list: Vec<Reg>,
+ },
+
+    /// A pop instruction, which loads registers from the stack and updates sp.
+ Pop {
+ reg_list: Vec<Writable<Reg>>,
+ },
+
+ /// A machine call instruction.
+ Call {
+ info: Box<CallInfo>,
+ },
+
+ /// A machine indirect-call instruction.
+ CallInd {
+ info: Box<CallIndInfo>,
+ },
+
+ /// Load an inline symbol reference.
+ LoadExtName {
+ rt: Writable<Reg>,
+ name: Box<ExternalName>,
+ offset: i32,
+ },
+
+ /// A return instruction, which is encoded as `bx lr`.
+ Ret,
+
+ /// An unconditional branch.
+ Jump {
+ dest: BranchTarget,
+ },
+
+ /// A conditional branch.
+ CondBr {
+ taken: BranchTarget,
+ not_taken: BranchTarget,
+ cond: Cond,
+ },
+
+    /// An indirect branch through a register, augmented with the set of all
+ /// possible successors.
+ IndirectBr {
+ rm: Reg,
+ targets: Vec<MachLabel>,
+ },
+
+ /// A conditional trap: execute a `udf` if the condition is true. This is
+ /// one VCode instruction because it uses embedded control flow; it is
+ /// logically a single-in, single-out region, but needs to appear as one
+ /// unit to the register allocator.
+ TrapIf {
+ cond: Cond,
+ trap_info: TrapCode,
+ },
+
+ /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at
+ /// runtime.
+ Udf {
+ trap_info: TrapCode,
+ },
+
+ /// A "breakpoint" instruction, used for e.g. traps and debug breakpoints.
+ Bkpt,
+
+ /// Marker, no-op in generated code: SP "virtual offset" is adjusted.
+ VirtualSPOffsetAdj {
+ offset: i64,
+ },
+
+ /// A placeholder instruction, generating no code, meaning that a function epilogue must be
+ /// inserted there.
+ EpiloguePlaceholder,
+}
+
+/// An instruction inside an it block.
+#[derive(Clone, Debug)]
+pub struct CondInst {
+ inst: Inst,
+    // Whether the instruction is executed:
+    // true => when the `it` condition is met,
+    // false => otherwise.
+ then: bool,
+}
+
+impl CondInst {
+ pub fn new(inst: Inst, then: bool) -> Self {
+ match inst {
+ Inst::It { .. }
+ | Inst::Ret { .. }
+ | Inst::Jump { .. }
+ | Inst::CondBr { .. }
+ | Inst::TrapIf { .. }
+ | Inst::EpiloguePlaceholder { .. }
+ | Inst::LoadExtName { .. } => panic!("Instruction {:?} cannot occur in it block", inst),
+ _ => Self { inst, then },
+ }
+ }
+}
+
+impl Inst {
+ /// Create a move instruction.
+ pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst {
+ Inst::Mov {
+ rd: to_reg,
+ rm: from_reg,
+ }
+ }
+
+ /// Create an instruction that loads a constant.
+ pub fn load_constant(rd: Writable<Reg>, value: u32) -> SmallVec<[Inst; 4]> {
+ let mut insts = smallvec![];
+ let imm_lo = (value & 0xffff) as u16;
+ let imm_hi = (value >> 16) as u16;
+
+ if imm_lo != 0 || imm_hi == 0 {
+            // When imm_lo == 0 && imm_hi == 0 we still have to overwrite the register with 0.
+ insts.push(Inst::MovImm16 { rd, imm16: imm_lo });
+ }
+ if imm_hi != 0 {
+ insts.push(Inst::Movt { rd, imm16: imm_hi });
+ }
+
+ insts
+ }
+
+ /// Generic constructor for a load (zero-extending where appropriate).
+ pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type) -> Inst {
+ assert!(ty.bits() <= 32);
+ // Load 8 bits for B1.
+ let bits = std::cmp::max(ty.bits(), 8) as u8;
+
+ Inst::Load {
+ rt: into_reg,
+ mem,
+ bits,
+ sign_extend: false,
+ }
+ }
+
+ /// Generic constructor for a store.
+ pub fn gen_store(from_reg: Reg, mem: AMode, ty: Type) -> Inst {
+ assert!(ty.bits() <= 32);
+ // Store 8 bits for B1.
+ let bits = std::cmp::max(ty.bits(), 8) as u8;
+
+ Inst::Store {
+ rt: from_reg,
+ mem,
+ bits,
+ }
+ }
+}
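
The `mov`/`movt` split performed by `Inst::load_constant` above is what produces emit-test expectations such as `mov ip, #65535 ; movt ip, #255` for the offset 16777215. The standalone sketch below mirrors that logic (illustrative only, not part of the vendored source; the function name is ours).

fn movw_movt_split(value: u32) -> (Option<u16>, Option<u16>) {
    let imm_lo = (value & 0xffff) as u16;
    let imm_hi = (value >> 16) as u16;
    // `mov` is skipped only when the high halfword alone fully defines the value.
    let mov = if imm_lo != 0 || imm_hi == 0 { Some(imm_lo) } else { None };
    let movt = if imm_hi != 0 { Some(imm_hi) } else { None };
    (mov, movt)
}

fn main() {
    // Matches the emit test above: 16777215 becomes "mov ip, #65535 ; movt ip, #255".
    assert_eq!(movw_movt_split(16_777_215), (Some(65535), Some(255)));
    // A constant of zero still needs a single `mov rd, #0`.
    assert_eq!(movw_movt_split(0), (Some(0), None));
    // 0x0001_0000 needs only the `movt`.
    assert_eq!(movw_movt_split(0x0001_0000), (None, Some(1)));
}
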
+
+//=============================================================================
+// Instructions: get_regs
+
+fn memarg_regs(memarg: &AMode, collector: &mut RegUsageCollector) {
+ match memarg {
+ &AMode::RegReg(rn, rm, ..) => {
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &AMode::RegOffset12(rn, ..) | &AMode::RegOffset(rn, _) => {
+ collector.add_use(rn);
+ }
+ &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => {
+ collector.add_use(sp_reg());
+ }
+ &AMode::FPOffset(..) => {
+ collector.add_use(fp_reg());
+ }
+ &AMode::PCRel(_) => {}
+ }
+}
+
+fn arm32_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
+ match inst {
+ &Inst::Nop0
+ | &Inst::Nop2
+ | &Inst::Ret
+ | &Inst::VirtualSPOffsetAdj { .. }
+ | &Inst::EpiloguePlaceholder
+ | &Inst::Jump { .. }
+ | &Inst::CondBr { .. }
+ | &Inst::Bkpt
+ | &Inst::Udf { .. }
+ | &Inst::TrapIf { .. } => {}
+ &Inst::AluRRR { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRRShift { rd, rn, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRShift { rd, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRRR {
+ rd_hi,
+ rd_lo,
+ rn,
+ rm,
+ ..
+ } => {
+ collector.add_def(rd_hi);
+ collector.add_def(rd_lo);
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::AluRRImm12 { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::AluRRImm8 { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
+ &Inst::AluRImm8 { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::BitOpRR { rd, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::Mov { rd, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::MovImm16 { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::Movt { rd, .. } => {
+ collector.add_def(rd);
+ }
+ &Inst::Cmp { rn, rm } => {
+ collector.add_use(rn);
+ collector.add_use(rm);
+ }
+ &Inst::CmpImm8 { rn, .. } => {
+ collector.add_use(rn);
+ }
+ &Inst::Store { rt, ref mem, .. } => {
+ collector.add_use(rt);
+ memarg_regs(mem, collector);
+ }
+ &Inst::Load { rt, ref mem, .. } => {
+ collector.add_def(rt);
+ memarg_regs(mem, collector);
+ }
+ &Inst::LoadAddr { rd, mem: _ } => {
+ collector.add_def(rd);
+ }
+ &Inst::Extend { rd, rm, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rm);
+ }
+ &Inst::It { ref insts, .. } => {
+ for inst in insts.iter() {
+ arm32_get_regs(&inst.inst, collector);
+ }
+ }
+ &Inst::Push { ref reg_list } => {
+ for reg in reg_list {
+ collector.add_use(*reg);
+ }
+ }
+ &Inst::Pop { ref reg_list } => {
+ for reg in reg_list {
+ collector.add_def(*reg);
+ }
+ }
+ &Inst::Call { ref info, .. } => {
+ collector.add_uses(&*info.uses);
+ collector.add_defs(&*info.defs);
+ }
+ &Inst::CallInd { ref info, .. } => {
+ collector.add_uses(&*info.uses);
+ collector.add_defs(&*info.defs);
+ collector.add_use(info.rm);
+ }
+ &Inst::LoadExtName { rt, .. } => {
+ collector.add_def(rt);
+ }
+ &Inst::IndirectBr { rm, .. } => {
+ collector.add_use(rm);
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: map_regs
+
+fn arm32_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
+ fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
+ if r.is_virtual() {
+ let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg();
+ *r = new;
+ }
+ }
+
+ fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if r.to_reg().is_virtual() {
+ let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+ }
+
+ fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if r.to_reg().is_virtual() {
+ let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+ }
+
+ fn map_mem<RUM: RegUsageMapper>(m: &RUM, mem: &mut AMode) {
+ match mem {
+ &mut AMode::RegReg(ref mut rn, ref mut rm, ..) => {
+ map_use(m, rn);
+ map_use(m, rm);
+ }
+ &mut AMode::RegOffset12(ref mut rn, ..) | &mut AMode::RegOffset(ref mut rn, ..) => {
+ map_use(m, rn)
+ }
+ &mut AMode::SPOffset(..)
+ | &mut AMode::FPOffset(..)
+ | &mut AMode::NominalSPOffset(..)
+ | &mut AMode::PCRel(_) => {}
+ };
+ }
+
+ match inst {
+ &mut Inst::Nop0
+ | &mut Inst::Nop2
+ | &mut Inst::Ret
+ | &mut Inst::VirtualSPOffsetAdj { .. }
+ | &mut Inst::EpiloguePlaceholder
+ | &mut Inst::Jump { .. }
+ | &mut Inst::CondBr { .. }
+ | &mut Inst::Bkpt
+ | &mut Inst::Udf { .. }
+ | &mut Inst::TrapIf { .. } => {}
+ &mut Inst::AluRRR {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRRShift {
+ ref mut rd,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRShift {
+ ref mut rd,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRRR {
+ ref mut rd_hi,
+ ref mut rd_lo,
+ ref mut rn,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd_hi);
+ map_def(mapper, rd_lo);
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::AluRRImm12 {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::AluRRImm8 {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
+ &mut Inst::AluRImm8 { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::BitOpRR {
+ ref mut rd,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::Mov {
+ ref mut rd,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::MovImm16 { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::Movt { ref mut rd, .. } => {
+ map_def(mapper, rd);
+ }
+ &mut Inst::Cmp {
+ ref mut rn,
+ ref mut rm,
+ } => {
+ map_use(mapper, rn);
+ map_use(mapper, rm);
+ }
+ &mut Inst::CmpImm8 { ref mut rn, .. } => {
+ map_use(mapper, rn);
+ }
+ &mut Inst::Store {
+ ref mut rt,
+ ref mut mem,
+ ..
+ } => {
+ map_use(mapper, rt);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Load {
+ ref mut rt,
+ ref mut mem,
+ ..
+ } => {
+ map_def(mapper, rt);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::LoadAddr {
+ ref mut rd,
+ ref mut mem,
+ } => {
+ map_def(mapper, rd);
+ map_mem(mapper, mem);
+ }
+ &mut Inst::Extend {
+ ref mut rd,
+ ref mut rm,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rm);
+ }
+ &mut Inst::It { ref mut insts, .. } => {
+ for inst in insts.iter_mut() {
+ arm32_map_regs(&mut inst.inst, mapper);
+ }
+ }
+ &mut Inst::Push { ref mut reg_list } => {
+ for reg in reg_list {
+ map_use(mapper, reg);
+ }
+ }
+ &mut Inst::Pop { ref mut reg_list } => {
+ for reg in reg_list {
+ map_def(mapper, reg);
+ }
+ }
+ &mut Inst::Call { ref mut info } => {
+ for r in info.uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in info.defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ }
+ &mut Inst::CallInd { ref mut info, .. } => {
+ for r in info.uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in info.defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ map_use(mapper, &mut info.rm);
+ }
+ &mut Inst::LoadExtName { ref mut rt, .. } => {
+ map_def(mapper, rt);
+ }
+ &mut Inst::IndirectBr { ref mut rm, .. } => {
+ map_use(mapper, rm);
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: misc functions and external interface
+
+impl MachInst for Inst {
+ type LabelUse = LabelUse;
+
+ fn get_regs(&self, collector: &mut RegUsageCollector) {
+ arm32_get_regs(self, collector)
+ }
+
+ fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ arm32_map_regs(self, mapper);
+ }
+
+ fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
+ match self {
+ &Inst::Mov { rd, rm } => Some((rd, rm)),
+ _ => None,
+ }
+ }
+
+ fn is_epilogue_placeholder(&self) -> bool {
+ if let Inst::EpiloguePlaceholder = self {
+ true
+ } else {
+ false
+ }
+ }
+
+ fn is_term<'a>(&'a self) -> MachTerminator<'a> {
+ match self {
+ &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret,
+ &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
+ &Inst::CondBr {
+ taken, not_taken, ..
+ } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()),
+ &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]),
+ _ => MachTerminator::None,
+ }
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, _ty: Type) -> Inst {
+ assert_eq!(from_reg.get_class(), RegClass::I32);
+ assert_eq!(to_reg.to_reg().get_class(), from_reg.get_class());
+
+ Inst::mov(to_reg, from_reg)
+ }
+
+ fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ to_reg: Writable<Reg>,
+ value: u64,
+ ty: Type,
+ _alloc_tmp: F,
+ ) -> SmallVec<[Inst; 4]> {
+ match ty {
+ B1 | I8 | B8 | I16 | B16 | I32 | B32 => {
+ let v: i64 = value as i64;
+
+ if v >= (1 << 32) || v < -(1 << 32) {
+ panic!("Cannot load constant value {}", value)
+ }
+ Inst::load_constant(to_reg, value as u32)
+ }
+ _ => unimplemented!(),
+ }
+ }
+
+ fn gen_zero_len_nop() -> Inst {
+ Inst::Nop0
+ }
+
+ fn gen_nop(preferred_size: usize) -> Inst {
+ assert!(preferred_size >= 2);
+ Inst::Nop2
+ }
+
+ fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
+ None
+ }
+
+ fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
+ match ty {
+ I8 | I16 | I32 | B1 | B8 | B16 | B32 => Ok(RegClass::I32),
+ IFLAGS => Ok(RegClass::I32),
+ _ => Err(CodegenError::Unsupported(format!(
+ "Unexpected SSA-value type: {}",
+ ty
+ ))),
+ }
+ }
+
+ fn gen_jump(target: MachLabel) -> Inst {
+ Inst::Jump {
+ dest: BranchTarget::Label(target),
+ }
+ }
+
+ fn reg_universe(_flags: &settings::Flags) -> RealRegUniverse {
+ create_reg_universe()
+ }
+
+ fn worst_case_size() -> CodeOffset {
+        // Worst case: an `it` instruction followed by four 32-bit instructions.
+ 2 + 4 * 4
+ }
+
+ fn ref_type_regclass(_: &settings::Flags) -> RegClass {
+ RegClass::I32
+ }
+}
+
+//=============================================================================
+// Pretty-printing of instructions.
+
+fn mem_finalize_for_show(
+ mem: &AMode,
+ mb_rru: Option<&RealRegUniverse>,
+ state: &EmitState,
+) -> (String, AMode) {
+ let (mem_insts, mem) = mem_finalize(mem, state);
+ let mut mem_str = mem_insts
+ .into_iter()
+ .map(|inst| inst.show_rru(mb_rru))
+ .collect::<Vec<_>>()
+ .join(" ; ");
+ if !mem_str.is_empty() {
+ mem_str += " ; ";
+ }
+
+ (mem_str, mem)
+}
+
+impl PrettyPrint for Inst {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.pretty_print(mb_rru, &mut EmitState::default())
+ }
+}
+
+impl Inst {
+ fn print_with_state(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String {
+ fn op_name(alu_op: ALUOp) -> &'static str {
+ match alu_op {
+ ALUOp::Add => "add",
+ ALUOp::Adds => "adds",
+ ALUOp::Adc => "adc",
+ ALUOp::Adcs => "adcs",
+ ALUOp::Qadd => "qadd",
+ ALUOp::Sub => "sub",
+ ALUOp::Subs => "subs",
+ ALUOp::Sbc => "sbc",
+ ALUOp::Sbcs => "sbcs",
+ ALUOp::Rsb => "rsb",
+ ALUOp::Qsub => "qsub",
+ ALUOp::Mul => "mul",
+ ALUOp::Smull => "smull",
+ ALUOp::Umull => "umull",
+ ALUOp::Udiv => "udiv",
+ ALUOp::Sdiv => "sdiv",
+ ALUOp::And => "and",
+ ALUOp::Orr => "orr",
+ ALUOp::Orn => "orn",
+ ALUOp::Eor => "eor",
+ ALUOp::Bic => "bic",
+ ALUOp::Lsl => "lsl",
+ ALUOp::Lsr => "lsr",
+ ALUOp::Asr => "asr",
+ ALUOp::Ror => "ror",
+ }
+ }
+
+ fn reg_shift_str(
+ shift: &Option<ShiftOpAndAmt>,
+ mb_rru: Option<&RealRegUniverse>,
+ ) -> String {
+ if let Some(ref shift) = shift {
+ format!(", {}", shift.show_rru(mb_rru))
+ } else {
+ "".to_string()
+ }
+ }
+
+ match self {
+ &Inst::Nop0 => "nop-zero-len".to_string(),
+ &Inst::Nop2 => "nop".to_string(),
+ &Inst::AluRRR { alu_op, rd, rn, rm } => {
+ let op = op_name(alu_op);
+ let rd = rd.show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, rm)
+ }
+ &Inst::AluRRRShift {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ ref shift,
+ } => {
+ let op = op_name(alu_op);
+ let rd = rd.show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ let shift = reg_shift_str(shift, mb_rru);
+ format!("{} {}, {}, {}{}", op, rd, rn, rm, shift)
+ }
+ &Inst::AluRRShift {
+ alu_op,
+ rd,
+ rm,
+ ref shift,
+ } => {
+ let op = match alu_op {
+ ALUOp1::Mvn => "mvn",
+ ALUOp1::Mov => "mov",
+ };
+ let rd = rd.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ let shift = reg_shift_str(shift, mb_rru);
+ format!("{} {}, {}{}", op, rd, rm, shift)
+ }
+ &Inst::AluRRRR {
+ alu_op,
+ rd_hi,
+ rd_lo,
+ rn,
+ rm,
+ } => {
+ let op = op_name(alu_op);
+ let rd_hi = rd_hi.show_rru(mb_rru);
+ let rd_lo = rd_lo.show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("{} {}, {}, {}, {}", op, rd_lo, rd_hi, rn, rm)
+ }
+ &Inst::AluRRImm12 {
+ alu_op,
+ rd,
+ rn,
+ imm12,
+ } => {
+ let op = op_name(alu_op);
+ let rd = rd.show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let imm = imm12.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, imm)
+ }
+ &Inst::AluRRImm8 {
+ alu_op,
+ rd,
+ rn,
+ imm8,
+ } => {
+ let op = op_name(alu_op);
+ let rd = rd.show_rru(mb_rru);
+ let rn = rn.show_rru(mb_rru);
+ let imm = imm8.show_rru(mb_rru);
+ format!("{} {}, {}, {}", op, rd, rn, imm)
+ }
+ &Inst::AluRImm8 { alu_op, rd, imm8 } => {
+ let op = match alu_op {
+ ALUOp1::Mvn => "mvn",
+ ALUOp1::Mov => "mov",
+ };
+ let rd = rd.show_rru(mb_rru);
+ let imm = imm8.show_rru(mb_rru);
+ format!("{} {}, {}", op, rd, imm)
+ }
+ &Inst::BitOpRR { bit_op, rd, rm } => {
+ let op = match bit_op {
+ BitOp::Rbit => "rbit",
+ BitOp::Rev => "rev",
+ BitOp::Clz => "clz",
+ };
+ let rd = rd.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("{} {}, {}", op, rd, rm)
+ }
+ &Inst::Mov { rd, rm } => {
+ let rd = rd.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("mov {}, {}", rd, rm)
+ }
+ &Inst::MovImm16 { rd, imm16 } => {
+ let rd = rd.show_rru(mb_rru);
+ format!("mov {}, #{}", rd, imm16)
+ }
+ &Inst::Movt { rd, imm16 } => {
+ let rd = rd.show_rru(mb_rru);
+ format!("movt {}, #{}", rd, imm16)
+ }
+ &Inst::Cmp { rn, rm } => {
+ let rn = rn.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("cmp {}, {}", rn, rm)
+ }
+ &Inst::CmpImm8 { rn, imm8 } => {
+ let rn = rn.show_rru(mb_rru);
+ format!("cmp {}, #{}", rn, imm8)
+ }
+ &Inst::Store {
+ rt, ref mem, bits, ..
+ } => {
+ let op = match bits {
+ 32 => "str",
+ 16 => "strh",
+ 8 => "strb",
+ _ => panic!("Invalid bit amount {}", bits),
+ };
+ let rt = rt.show_rru(mb_rru);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}{} {}, {}", mem_str, op, rt, mem)
+ }
+ &Inst::Load {
+ rt,
+ ref mem,
+ bits,
+ sign_extend,
+ ..
+ } => {
+ let op = match (bits, sign_extend) {
+ (32, _) => "ldr",
+ (16, true) => "ldrsh",
+ (16, false) => "ldrh",
+ (8, true) => "ldrsb",
+ (8, false) => "ldrb",
+ (_, _) => panic!("Invalid bit amount {}", bits),
+ };
+ let rt = rt.show_rru(mb_rru);
+ let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state);
+ let mem = mem.show_rru(mb_rru);
+ format!("{}{} {}, {}", mem_str, op, rt, mem)
+ }
+ &Inst::LoadAddr { rd, ref mem } => {
+ let mut ret = String::new();
+ let (mem_insts, mem) = mem_finalize(mem, state);
+ for inst in mem_insts.into_iter() {
+ ret.push_str(&inst.show_rru(mb_rru));
+ }
+ let inst = match mem {
+ AMode::RegReg(rn, rm, shift) => {
+ let shift = u32::from(shift);
+ let shift_amt = ShiftOpShiftImm::maybe_from_shift(shift).unwrap();
+ let shift = ShiftOpAndAmt::new(ShiftOp::LSL, shift_amt);
+ Inst::AluRRRShift {
+ alu_op: ALUOp::Add,
+ rd,
+ rn,
+ rm,
+ shift: Some(shift),
+ }
+ }
+ AMode::RegOffset12(reg, imm12) => Inst::AluRRImm12 {
+ alu_op: ALUOp::Add,
+ rd,
+ rn: reg,
+ imm12,
+ },
+ _ => unreachable!(),
+ };
+ ret.push_str(&inst.show_rru(mb_rru));
+ ret
+ }
+ &Inst::Extend {
+ rd,
+ rm,
+ from_bits,
+ signed,
+ } => {
+ let op = match (from_bits, signed) {
+ (16, true) => "sxth",
+ (16, false) => "uxth",
+ (8, true) => "sxtb",
+ (8, false) => "uxtb",
+ _ => panic!("Unsupported extend case: {:?}", self),
+ };
+ let rd = rd.show_rru(mb_rru);
+ let rm = rm.show_rru(mb_rru);
+ format!("{} {}, {}", op, rd, rm)
+ }
+ &Inst::It { cond, ref insts } => {
+ let te: String = insts
+ .iter()
+ .skip(1)
+ .map(|i| if i.then { "t" } else { "e" })
+ .collect();
+ let cond = cond.show_rru(mb_rru);
+ let mut ret = format!("it{} {}", te, cond);
+ for inst in insts.into_iter() {
+ ret.push_str(" ; ");
+ ret.push_str(&inst.inst.show_rru(mb_rru));
+ }
+ ret
+ }
+ &Inst::Push { ref reg_list } => {
+ assert!(!reg_list.is_empty());
+ let first_reg = reg_list[0].show_rru(mb_rru);
+ let regs: String = reg_list
+ .iter()
+ .skip(1)
+ .map(|r| [",", &r.show_rru(mb_rru)].join(" "))
+ .collect();
+ format!("push {{{}{}}}", first_reg, regs)
+ }
+ &Inst::Pop { ref reg_list } => {
+ assert!(!reg_list.is_empty());
+ let first_reg = reg_list[0].show_rru(mb_rru);
+ let regs: String = reg_list
+ .iter()
+ .skip(1)
+ .map(|r| [",", &r.show_rru(mb_rru)].join(" "))
+ .collect();
+ format!("pop {{{}{}}}", first_reg, regs)
+ }
+ &Inst::Call { .. } => format!("bl 0"),
+ &Inst::CallInd { ref info, .. } => {
+ let rm = info.rm.show_rru(mb_rru);
+ format!("blx {}", rm)
+ }
+ &Inst::LoadExtName {
+ rt,
+ ref name,
+ offset,
+ } => {
+ let rt = rt.show_rru(mb_rru);
+ format!("ldr {}, [pc, #4] ; b 4 ; data {:?} + {}", rt, name, offset)
+ }
+ &Inst::Ret => "bx lr".to_string(),
+ &Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
+ &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
+ &Inst::Jump { ref dest } => {
+ let dest = dest.show_rru(mb_rru);
+ format!("b {}", dest)
+ }
+ &Inst::CondBr {
+ ref taken,
+ ref not_taken,
+ ref cond,
+ } => {
+ let taken = taken.show_rru(mb_rru);
+ let not_taken = not_taken.show_rru(mb_rru);
+ let c = cond.show_rru(mb_rru);
+ format!("b{} {} ; b {}", c, taken, not_taken)
+ }
+ &Inst::IndirectBr { rm, .. } => {
+ let rm = rm.show_rru(mb_rru);
+ format!("bx {}", rm)
+ }
+ &Inst::Udf { .. } => "udf #0".to_string(),
+ &Inst::Bkpt => "bkpt #0".to_string(),
+ &Inst::TrapIf { cond, .. } => {
+ let c = cond.invert().show_rru(mb_rru);
+ format!("b{} 2 ; udf #0", c)
+ }
+ }
+ }
+}
+
+//=============================================================================
+// Label fixups and jump veneers.
+
+/// Different forms of label references for different instruction formats.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LabelUse {
+ /// 20-bit branch offset used by 32-bit conditional jumps.
+ Branch20,
+
+    /// 24-bit branch offset used by the 32-bit unconditional jump instruction.
+ Branch24,
+}
+
+impl MachInstLabelUse for LabelUse {
+    /// Alignment for veneer code. Every Thumb-2 instruction must be 2-byte-aligned.
+ const ALIGN: CodeOffset = 2;
+
+    // Branch ranges:
+ // 20-bit sign-extended immediate gives us range [-(2^19), 2^19 - 1].
+ // Left-shifted by 1 => [-(2^20), 2^20 - 2].
+ // PC is start of this instruction + 4 bytes => [-(2^20) + 4, 2^20 + 2].
+ // Likewise for Branch24.
+
+ /// Maximum PC-relative range (positive), inclusive.
+ fn max_pos_range(self) -> CodeOffset {
+ match self {
+ LabelUse::Branch20 => (1 << 20) + 2,
+ LabelUse::Branch24 => (1 << 24) + 2,
+ }
+ }
+
+ /// Maximum PC-relative range (negative).
+ fn max_neg_range(self) -> CodeOffset {
+ match self {
+ LabelUse::Branch20 => (1 << 20) - 4,
+ LabelUse::Branch24 => (1 << 24) - 4,
+ }
+ }
+
+ /// Size of window into code needed to do the patch.
+ fn patch_size(self) -> CodeOffset {
+ 4
+ }
+
+ /// Perform the patch.
+ fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
+ let off = (label_offset as i64) - (use_offset as i64);
+ debug_assert!(off <= self.max_pos_range() as i64);
+ debug_assert!(off >= -(self.max_neg_range() as i64));
+ let off = off - 4;
+ match self {
+ LabelUse::Branch20 => {
+ let off = off as u32 >> 1;
+ let imm11 = (off & 0x7ff) as u16;
+ let imm6 = ((off >> 11) & 0x3f) as u16;
+ let j1 = ((off >> 17) & 0x1) as u16;
+ let j2 = ((off >> 18) & 0x1) as u16;
+ let s = ((off >> 19) & 0x1) as u16;
+ let insn_fst = u16::from_le_bytes([buffer[0], buffer[1]]);
+ let insn_fst = (insn_fst & !0x43f) | imm6 | (s << 10);
+ let insn_snd = u16::from_le_bytes([buffer[2], buffer[3]]);
+ let insn_snd = (insn_snd & !0x2fff) | imm11 | (j2 << 11) | (j1 << 13);
+ buffer[0..2].clone_from_slice(&u16::to_le_bytes(insn_fst));
+ buffer[2..4].clone_from_slice(&u16::to_le_bytes(insn_snd));
+ }
+ LabelUse::Branch24 => {
+ let off = off as u32 >> 1;
+ let imm11 = (off & 0x7ff) as u16;
+ let imm10 = ((off >> 11) & 0x3ff) as u16;
+ let s = ((off >> 23) & 0x1) as u16;
+ let j1 = (((off >> 22) & 0x1) as u16 ^ s) ^ 0x1;
+ let j2 = (((off >> 21) & 0x1) as u16 ^ s) ^ 0x1;
+ let insn_fst = u16::from_le_bytes([buffer[0], buffer[1]]);
+ let insn_fst = (insn_fst & !0x07ff) | imm10 | (s << 10);
+ let insn_snd = u16::from_le_bytes([buffer[2], buffer[3]]);
+ let insn_snd = (insn_snd & !0x2fff) | imm11 | (j2 << 11) | (j1 << 13);
+ buffer[0..2].clone_from_slice(&u16::to_le_bytes(insn_fst));
+ buffer[2..4].clone_from_slice(&u16::to_le_bytes(insn_snd));
+ }
+ }
+ }
+
+ fn supports_veneer(self) -> bool {
+ false
+ }
+
+ fn veneer_size(self) -> CodeOffset {
+ 0
+ }
+
+ fn generate_veneer(
+ self,
+ _buffer: &mut [u8],
+ _veneer_offset: CodeOffset,
+ ) -> (CodeOffset, LabelUse) {
+ panic!("Veneer not supported yet.")
+ }
+}
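
To make the `Branch20` bit layout in `patch` above easier to follow, here is a standalone decoding sketch (illustrative only, not part of the vendored source; the function name is ours). It reassembles the S:J2:J1:imm6:imm11 fields from the two halfwords and undoes the implicit right-shift by one, matching the `patch_branch20` test below.

fn decode_branch20(first: u16, second: u16) -> i32 {
    let imm6 = (first & 0x3f) as u32;
    let s = ((first >> 10) & 0x1) as u32;
    let imm11 = (second & 0x7ff) as u32;
    let j2 = ((second >> 11) & 0x1) as u32;
    let j1 = ((second >> 13) & 0x1) as u32;
    let off20 = (s << 19) | (j2 << 18) | (j1 << 17) | (imm6 << 11) | imm11;
    // Sign-extend the 20-bit field and restore the implicit low zero bit.
    (((off20 << 12) as i32) >> 12) << 1
}

fn main() {
    // `patch_branch20` below patches a branch to `max_pos_range()`; after the -4 PC
    // adjustment the stored offset is (1 << 20) - 2, encoded as halfwords 0xF03F, 0xAFFF.
    assert_eq!(decode_branch20(0xF03F, 0xAFFF), (1 << 20) - 2);
}
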
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn patch_branch20() {
+ let label_use = LabelUse::Branch20;
+ let mut buffer = 0x8000_f000_u32.to_le_bytes(); // beq
+ let use_offset: CodeOffset = 0;
+ let label_offset: CodeOffset = label_use.max_pos_range();
+ label_use.patch(&mut buffer, use_offset, label_offset);
+ assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf03f);
+ assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0xafff);
+
+ let mut buffer = 0x8000_f000_u32.to_le_bytes(); // beq
+ let use_offset = label_use.max_neg_range();
+ let label_offset: CodeOffset = 0;
+ label_use.patch(&mut buffer, use_offset, label_offset);
+ assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf400);
+ assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x8000);
+ }
+
+ #[test]
+ fn patch_branch24() {
+ let label_use = LabelUse::Branch24;
+ let mut buffer = 0x9000_f000_u32.to_le_bytes(); // b
+ let use_offset: CodeOffset = 0;
+ let label_offset: CodeOffset = label_use.max_pos_range();
+ label_use.patch(&mut buffer, use_offset, label_offset);
+ assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf3ff);
+ assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x97ff);
+
+ let mut buffer = 0x9000_f000_u32.to_le_bytes(); // b
+ let use_offset = label_use.max_neg_range();
+ let label_offset: CodeOffset = 0;
+ label_use.patch(&mut buffer, use_offset, label_offset);
+ assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf400);
+ assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x9000);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs
new file mode 100644
index 0000000000..55df5c8db3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs
@@ -0,0 +1,128 @@
+//! 32-bit ARM ISA definitions: registers.
+
+use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES};
+
+use std::string::ToString;
+
+/// Get a reference to a GPR.
+pub fn rreg(num: u8) -> Reg {
+ assert!(num < 16);
+ Reg::new_real(RegClass::I32, num, num)
+}
+
+/// Get a writable reference to a GPR.
+pub fn writable_rreg(num: u8) -> Writable<Reg> {
+ Writable::from_reg(rreg(num))
+}
+
+/// Get a reference to the program counter (r15).
+pub fn pc_reg() -> Reg {
+ rreg(15)
+}
+
+/// Get a writable reference to the program counter.
+pub fn writable_pc_reg() -> Writable<Reg> {
+ Writable::from_reg(pc_reg())
+}
+
+/// Get a reference to the link register (r14).
+pub fn lr_reg() -> Reg {
+ rreg(14)
+}
+
+/// Get a writable reference to the link register.
+pub fn writable_lr_reg() -> Writable<Reg> {
+ Writable::from_reg(lr_reg())
+}
+
+/// Get a reference to the stack pointer (r13).
+pub fn sp_reg() -> Reg {
+ rreg(13)
+}
+
+/// Get a writable reference to the stack pointer.
+pub fn writable_sp_reg() -> Writable<Reg> {
+ Writable::from_reg(sp_reg())
+}
+
+/// Get a reference to the intra-procedure-call scratch register (r12),
+/// which is used as a temporary register.
+pub fn ip_reg() -> Reg {
+ rreg(12)
+}
+
+/// Get a writable reference to the Intra-Procedure-call scratch register.
+pub fn writable_ip_reg() -> Writable<Reg> {
+ Writable::from_reg(ip_reg())
+}
+
+/// Get a reference to the frame pointer register (r11).
+pub fn fp_reg() -> Reg {
+ rreg(11)
+}
+
+/// Get a writable reference to the frame-pointer register.
+pub fn writable_fp_reg() -> Writable<Reg> {
+ Writable::from_reg(fp_reg())
+}
+
+/// Get a reference to the second temp register. We need this in some edge cases
+/// where we need both the ip and another temporary.
+///
+/// We use r10 for this role.
+pub fn tmp2_reg() -> Reg {
+ rreg(10)
+}
+
+/// Get a writable reference to the tmp2 reg.
+pub fn writable_tmp2_reg() -> Writable<Reg> {
+ Writable::from_reg(tmp2_reg())
+}
+
+/// Create the register universe.
+/// Only GPRs are used for now.
+pub fn create_reg_universe() -> RealRegUniverse {
+ let mut regs = vec![];
+ let mut allocable_by_class = [None; NUM_REG_CLASSES];
+
+ let r_reg_base = 0u8;
+ let r_reg_count = 10; // to exclude r10, fp, ip, sp, lr and pc.
+ for i in 0..r_reg_count {
+ let reg = Reg::new_real(
+ RegClass::I32,
+ /* enc = */ i,
+ /* index = */ r_reg_base + i,
+ )
+ .to_real_reg();
+ let name = format!("r{}", i);
+ regs.push((reg, name));
+ }
+ let r_reg_last = r_reg_base + r_reg_count - 1;
+
+ allocable_by_class[RegClass::I32.rc_to_usize()] = Some(RegClassInfo {
+ first: r_reg_base as usize,
+ last: r_reg_last as usize,
+ suggested_scratch: None,
+ });
+
+ // Other regs, not available to the allocator.
+ let allocable = regs.len();
+ regs.push((tmp2_reg().to_real_reg(), "r10".to_string()));
+ regs.push((fp_reg().to_real_reg(), "fp".to_string()));
+ regs.push((ip_reg().to_real_reg(), "ip".to_string()));
+ regs.push((sp_reg().to_real_reg(), "sp".to_string()));
+ regs.push((lr_reg().to_real_reg(), "lr".to_string()));
+ regs.push((pc_reg().to_real_reg(), "pc".to_string()));
+
+ // The indices in the register structs must match their
+ // actual indices in the array.
+ for (i, reg) in regs.iter().enumerate() {
+ assert_eq!(i, reg.0.get_index());
+ }
+
+ RealRegUniverse {
+ regs,
+ allocable,
+ allocable_by_class,
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs
new file mode 100644
index 0000000000..b9ffeba0cf
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs
@@ -0,0 +1,14 @@
+use super::*;
+use crate::isa::unwind::input::UnwindInfo;
+use crate::result::CodegenResult;
+
+pub struct Arm32UnwindInfo;
+
+impl UnwindInfoGenerator<Inst> for Arm32UnwindInfo {
+ fn create_unwind_info(
+ _context: UnwindInfoContext<Inst>,
+ ) -> CodegenResult<Option<UnwindInfo<Reg>>> {
+ // TODO
+ Ok(None)
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs
new file mode 100644
index 0000000000..7c11ae95ba
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs
@@ -0,0 +1,240 @@
+//! Lowering rules for 32-bit ARM.
+
+use crate::ir::condcodes::IntCC;
+use crate::ir::types::*;
+use crate::ir::Inst as IRInst;
+use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::CodegenResult;
+
+use crate::isa::arm32::inst::*;
+use crate::isa::arm32::Arm32Backend;
+
+use super::lower_inst;
+
+use regalloc::{Reg, RegClass, Writable};
+
+//============================================================================
+// Lowering: convert instruction outputs to result types.
+
+/// Lower an instruction output to a 32-bit constant, if possible.
+pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
+ if out.output > 0 {
+ None
+ } else {
+ let inst_data = ctx.data(out.insn);
+ if inst_data.opcode() == Opcode::Null {
+ Some(0)
+ } else {
+ match inst_data {
+ &InstructionData::UnaryImm { opcode: _, imm } => {
+ // Only has Into for i64; we use u64 elsewhere, so we cast.
+ let imm: i64 = imm.into();
+ Some(imm as u64)
+ }
+ &InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)),
+ &InstructionData::UnaryIeee32 { .. } | &InstructionData::UnaryIeee64 { .. } => {
+ unimplemented!()
+ }
+ _ => None,
+ }
+ }
+ }
+}
+
+/// How to handle narrow values loaded into registers.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum NarrowValueMode {
+ None,
+ /// Zero-extend to 32 bits if original is < 32 bits.
+ ZeroExtend,
+ /// Sign-extend to 32 bits if original is < 32 bits.
+ SignExtend,
+}
+
+/// Lower an instruction output to a reg.
+pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> {
+ ctx.get_output(out.insn, out.output)
+}
+
+/// Lower an instruction input to a reg.
+///
+/// The given register will be extended appropriately, according to `narrow_mode`.
+pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ narrow_mode: NarrowValueMode,
+) -> Reg {
+ let ty = ctx.input_ty(input.insn, input.input);
+ let from_bits = ty.bits() as u8;
+ let inputs = ctx.get_input(input.insn, input.input);
+ let in_reg = if let Some(c) = inputs.constant {
+ let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+ for inst in Inst::gen_constant(to_reg, c, ty, |reg_class, ty| ctx.alloc_tmp(reg_class, ty))
+ .into_iter()
+ {
+ ctx.emit(inst);
+ }
+ to_reg.to_reg()
+ } else {
+ ctx.use_input_reg(inputs);
+ inputs.reg
+ };
+
+ match (narrow_mode, from_bits) {
+ (NarrowValueMode::None, _) => in_reg,
+ (NarrowValueMode::ZeroExtend, 1) => {
+ let tmp = ctx.alloc_tmp(RegClass::I32, I32);
+ ctx.emit(Inst::AluRRImm8 {
+ alu_op: ALUOp::And,
+ rd: tmp,
+ rn: in_reg,
+ imm8: UImm8::maybe_from_i64(0x1).unwrap(),
+ });
+ tmp.to_reg()
+ }
+ (NarrowValueMode::ZeroExtend, n) if n < 32 => {
+ let tmp = ctx.alloc_tmp(RegClass::I32, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rm: in_reg,
+ signed: false,
+ from_bits: n,
+ });
+ tmp.to_reg()
+ }
+ (NarrowValueMode::SignExtend, n) if n < 32 => {
+ let tmp = ctx.alloc_tmp(RegClass::I32, I32);
+ ctx.emit(Inst::Extend {
+ rd: tmp,
+ rm: in_reg,
+ signed: true,
+ from_bits: n,
+ });
+ tmp.to_reg()
+ }
+ (NarrowValueMode::ZeroExtend, 32) | (NarrowValueMode::SignExtend, 32) => in_reg,
+ _ => panic!(
+ "Unsupported input width: input ty {} bits {} mode {:?}",
+ ty, from_bits, narrow_mode
+ ),
+ }
+}
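
For reference, the extensions that `input_to_reg` requests via `NarrowValueMode` compute the following on a value already held in a 32-bit register; this standalone sketch (illustrative only, not part of the vendored source; the function names are ours) spells out the arithmetic behind the emitted `and`, `uxt*` and `sxt*` instructions.

fn zero_extend(from_bits: u32, x: u32) -> u32 {
    if from_bits >= 32 {
        x
    } else {
        x & ((1u32 << from_bits) - 1)
    }
}

fn sign_extend(from_bits: u32, x: u32) -> u32 {
    debug_assert!(from_bits >= 1 && from_bits <= 32);
    let shift = 32 - from_bits;
    (((x << shift) as i32) >> shift) as u32
}

fn main() {
    assert_eq!(zero_extend(1, 0b11), 1); // the `and rd, rn, #1` case used for B1
    assert_eq!(zero_extend(8, 0xFFFF_FF80), 0x80); // uxtb
    assert_eq!(sign_extend(8, 0x0000_0080), 0xFFFF_FF80); // sxtb
    assert_eq!(sign_extend(16, 0x0000_7FFF), 0x7FFF); // sxth keeps positive values unchanged
}
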
+
+pub(crate) fn lower_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, rd: Writable<Reg>, value: u64) {
+    // The high word must be all zeros or all ones (i.e. a sign extension of the low word).
+ assert!((value >> 32) == 0x0 || (value >> 32) == (1 << 32) - 1);
+
+ for inst in Inst::load_constant(rd, (value & ((1 << 32) - 1)) as u32) {
+ ctx.emit(inst);
+ }
+}
+
+pub(crate) fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
+ let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ ctx.emit(Inst::Cmp { rn, rm });
+}
+
+pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
+ match cc {
+ IntCC::Equal => Cond::Eq,
+ IntCC::NotEqual => Cond::Ne,
+ IntCC::SignedGreaterThanOrEqual => Cond::Ge,
+ IntCC::SignedGreaterThan => Cond::Gt,
+ IntCC::SignedLessThanOrEqual => Cond::Le,
+ IntCC::SignedLessThan => Cond::Lt,
+ IntCC::UnsignedGreaterThanOrEqual => Cond::Hs,
+ IntCC::UnsignedGreaterThan => Cond::Hi,
+ IntCC::UnsignedLessThanOrEqual => Cond::Ls,
+ IntCC::UnsignedLessThan => Cond::Lo,
+ IntCC::Overflow => Cond::Vs,
+ IntCC::NotOverflow => Cond::Vc,
+ }
+}
+
+/// Determines whether this condcode interprets inputs as signed or unsigned.
+pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
+ match cc {
+ IntCC::Equal => false,
+ IntCC::NotEqual => false,
+ IntCC::SignedGreaterThanOrEqual => true,
+ IntCC::SignedGreaterThan => true,
+ IntCC::SignedLessThanOrEqual => true,
+ IntCC::SignedLessThan => true,
+ IntCC::UnsignedGreaterThanOrEqual => false,
+ IntCC::UnsignedGreaterThan => false,
+ IntCC::UnsignedLessThanOrEqual => false,
+ IntCC::UnsignedLessThan => false,
+ IntCC::Overflow => true,
+ IntCC::NotOverflow => true,
+ }
+}
+
+//=============================================================================
+// Helpers for instruction lowering.
+
+pub(crate) fn ldst_offset(data: &InstructionData) -> Option<i32> {
+ match data {
+ &InstructionData::Load { offset, .. }
+ | &InstructionData::StackLoad { offset, .. }
+ | &InstructionData::LoadComplex { offset, .. }
+ | &InstructionData::Store { offset, .. }
+ | &InstructionData::StackStore { offset, .. }
+ | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()),
+ _ => None,
+ }
+}
+
+pub(crate) fn inst_condcode(data: &InstructionData) -> Option<IntCC> {
+ match data {
+ &InstructionData::IntCond { cond, .. }
+ | &InstructionData::BranchIcmp { cond, .. }
+ | &InstructionData::IntCompare { cond, .. }
+ | &InstructionData::IntCondTrap { cond, .. }
+ | &InstructionData::BranchInt { cond, .. }
+ | &InstructionData::IntSelect { cond, .. }
+ | &InstructionData::IntCompareImm { cond, .. } => Some(cond),
+ _ => None,
+ }
+}
+
+pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
+ match data {
+ &InstructionData::Trap { code, .. }
+ | &InstructionData::CondTrap { code, .. }
+ | &InstructionData::IntCondTrap { code, .. } => Some(code),
+ &InstructionData::FloatCondTrap { code, .. } => {
+ panic!("Unexpected float cond trap {:?}", code)
+ }
+ _ => None,
+ }
+}
+
+//=============================================================================
+// Lowering-backend trait implementation.
+
+impl LowerBackend for Arm32Backend {
+ type MInst = Inst;
+
+ fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
+ lower_inst::lower_insn_to_regs(ctx, ir_inst)
+ }
+
+ fn lower_branch_group<C: LowerCtx<I = Inst>>(
+ &self,
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+ ) -> CodegenResult<()> {
+ lower_inst::lower_branch(ctx, branches, targets, fallthrough)
+ }
+
+ fn maybe_pinned_reg(&self) -> Option<Reg> {
+ None
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs
new file mode 100644
index 0000000000..05256b2540
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs
@@ -0,0 +1,608 @@
+//! Lower a single Cranelift instruction into vcode.
+
+use crate::ir::types::*;
+use crate::ir::Inst as IRInst;
+use crate::ir::Opcode;
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::CodegenResult;
+
+use crate::isa::arm32::abi::*;
+use crate::isa::arm32::inst::*;
+
+use regalloc::RegClass;
+use smallvec::SmallVec;
+
+use super::lower::*;
+
+/// Actually codegen an instruction's results into registers.
+pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+) -> CodegenResult<()> {
+ let op = ctx.data(insn).opcode();
+ let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
+ .map(|i| InsnInput { insn, input: i })
+ .collect();
+ let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
+ .map(|i| InsnOutput { insn, output: i })
+ .collect();
+ let ty = if outputs.len() > 0 {
+ let ty = ctx.output_ty(insn, 0);
+ if ty.bits() > 32 || ty.is_float() {
+ panic!("Cannot lower inst with type {}!", ty);
+ }
+ Some(ty)
+ } else {
+ None
+ };
+
+ match op {
+ Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
+ let value = output_to_const(ctx, outputs[0]).unwrap();
+ let rd = output_to_reg(ctx, outputs[0]);
+ lower_constant(ctx, rd, value);
+ }
+ Opcode::Iadd
+ | Opcode::IaddIfcin
+ | Opcode::IaddIfcout
+ | Opcode::IaddIfcarry
+ | Opcode::Isub
+ | Opcode::IsubIfbin
+ | Opcode::IsubIfbout
+ | Opcode::IsubIfborrow
+ | Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor
+ | Opcode::BandNot
+ | Opcode::BorNot => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ let alu_op = match op {
+ Opcode::Iadd => ALUOp::Add,
+ Opcode::IaddIfcin => ALUOp::Adc,
+ Opcode::IaddIfcout => ALUOp::Adds,
+ Opcode::IaddIfcarry => ALUOp::Adcs,
+ Opcode::Isub => ALUOp::Sub,
+ Opcode::IsubIfbin => ALUOp::Sbc,
+ Opcode::IsubIfbout => ALUOp::Subs,
+ Opcode::IsubIfborrow => ALUOp::Sbcs,
+ Opcode::Band => ALUOp::And,
+ Opcode::Bor => ALUOp::Orr,
+ Opcode::Bxor => ALUOp::Eor,
+ Opcode::BandNot => ALUOp::Bic,
+ Opcode::BorNot => ALUOp::Orn,
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::AluRRRShift {
+ alu_op,
+ rd,
+ rn,
+ rm,
+ shift: None,
+ });
+ }
+ Opcode::SaddSat | Opcode::SsubSat | Opcode::Imul | Opcode::Udiv | Opcode::Sdiv => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ let alu_op = match op {
+ Opcode::SaddSat => ALUOp::Qadd,
+ Opcode::SsubSat => ALUOp::Qsub,
+ Opcode::Imul => ALUOp::Mul,
+ Opcode::Udiv => ALUOp::Udiv,
+ Opcode::Sdiv => ALUOp::Sdiv,
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
+ }
+ Opcode::Ineg => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+
+ ctx.emit(Inst::AluRRImm8 {
+ alu_op: ALUOp::Rsb,
+ rd,
+ rn,
+ imm8: UImm8::maybe_from_i64(0).unwrap(),
+ });
+ }
+ Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
+ let (alu_op, ext) = match op {
+ Opcode::Ishl => (ALUOp::Lsl, NarrowValueMode::None),
+ Opcode::Ushr => (ALUOp::Lsr, NarrowValueMode::ZeroExtend),
+ Opcode::Sshr => (ALUOp::Asr, NarrowValueMode::SignExtend),
+ _ => unreachable!(),
+ };
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], ext);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::ZeroExtend);
+ ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
+ }
+ Opcode::Rotr => {
+ if ty.unwrap().bits() != 32 {
+ unimplemented!()
+ }
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Ror,
+ rd,
+ rn,
+ rm,
+ });
+ }
+ Opcode::Rotl => {
+ if ty.unwrap().bits() != 32 {
+ unimplemented!()
+ }
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+ let tmp = ctx.alloc_tmp(RegClass::I32, I32);
+
+ // ror rd, rn, 32 - (rm & 31)
+ ctx.emit(Inst::AluRRImm8 {
+ alu_op: ALUOp::And,
+ rd: tmp,
+ rn: rm,
+ imm8: UImm8::maybe_from_i64(31).unwrap(),
+ });
+ ctx.emit(Inst::AluRRImm8 {
+ alu_op: ALUOp::Rsb,
+ rd: tmp,
+ rn: tmp.to_reg(),
+ imm8: UImm8::maybe_from_i64(32).unwrap(),
+ });
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Ror,
+ rd,
+ rn,
+ rm: tmp.to_reg(),
+ });
+ }
+ Opcode::Smulhi | Opcode::Umulhi => {
+ let ty = ty.unwrap();
+ let is_signed = op == Opcode::Smulhi;
+ match ty {
+ I32 => {
+ let rd_hi = output_to_reg(ctx, outputs[0]);
+ let rd_lo = ctx.alloc_tmp(RegClass::I32, ty);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ let alu_op = if is_signed {
+ ALUOp::Smull
+ } else {
+ ALUOp::Umull
+ };
+ ctx.emit(Inst::AluRRRR {
+ alu_op,
+ rd_hi,
+ rd_lo,
+ rn,
+ rm,
+ });
+ }
+ I16 | I8 => {
+ let narrow_mode = if is_signed {
+ NarrowValueMode::SignExtend
+ } else {
+ NarrowValueMode::ZeroExtend
+ };
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], narrow_mode);
+ let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+
+ ctx.emit(Inst::AluRRR {
+ alu_op: ALUOp::Mul,
+ rd,
+ rn,
+ rm,
+ });
+ let shift_amt = if ty == I16 { 16 } else { 8 };
+ let imm8 = UImm8::maybe_from_i64(shift_amt).unwrap();
+ let alu_op = if is_signed { ALUOp::Asr } else { ALUOp::Lsr };
+
+ ctx.emit(Inst::AluRRImm8 {
+ alu_op,
+ rd,
+ rn: rd.to_reg(),
+ imm8,
+ });
+ }
+ _ => panic!("Unexpected type {} in lower {}!", ty, op),
+ }
+ }
+ Opcode::Bnot => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+
+ ctx.emit(Inst::AluRRShift {
+ alu_op: ALUOp1::Mvn,
+ rd,
+ rm,
+ shift: None,
+ });
+ }
+ Opcode::Clz | Opcode::Ctz => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend);
+ let ty = ctx.output_ty(insn, 0);
+
+ let in_reg = if op == Opcode::Ctz {
+ ctx.emit(Inst::BitOpRR {
+ bit_op: BitOp::Rbit,
+ rd,
+ rm,
+ });
+ rd.to_reg()
+ } else {
+ rm
+ };
+ ctx.emit(Inst::BitOpRR {
+ bit_op: BitOp::Clz,
+ rd,
+ rm: in_reg,
+ });
+
+ if ty.bits() < 32 {
+ let imm12 = UImm12::maybe_from_i64(32 - ty.bits() as i64).unwrap();
+ ctx.emit(Inst::AluRRImm12 {
+ alu_op: ALUOp::Sub,
+ rd,
+ rn: rd.to_reg(),
+ imm12,
+ });
+ }
+ }
+ Opcode::Bitrev => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ctx.output_ty(insn, 0);
+ let bit_op = BitOp::Rbit;
+
+ match ty.bits() {
+ 32 => ctx.emit(Inst::BitOpRR { bit_op, rd, rm }),
+ n if n < 32 => {
+ let shift = ShiftOpAndAmt::new(
+ ShiftOp::LSL,
+ ShiftOpShiftImm::maybe_from_shift(32 - n as u32).unwrap(),
+ );
+ ctx.emit(Inst::AluRRShift {
+ alu_op: ALUOp1::Mov,
+ rd,
+ rm,
+ shift: Some(shift),
+ });
+ ctx.emit(Inst::BitOpRR {
+ bit_op,
+ rd,
+ rm: rd.to_reg(),
+ });
+ }
+ _ => panic!("Unexpected output type {}", ty),
+ }
+ }
+ Opcode::Icmp | Opcode::Ifcmp => {
+ let condcode = inst_condcode(ctx.data(insn)).unwrap();
+ let cond = lower_condcode(condcode);
+ let is_signed = condcode_is_signed(condcode);
+
+ let narrow_mode = if is_signed {
+ NarrowValueMode::SignExtend
+ } else {
+ NarrowValueMode::ZeroExtend
+ };
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], narrow_mode);
+ let rm = input_to_reg(ctx, inputs[1], narrow_mode);
+
+ ctx.emit(Inst::Cmp { rn, rm });
+
+ if op == Opcode::Icmp {
+ let mut it_insts = vec![];
+ it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 1 }, true));
+ it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 0 }, false));
+ ctx.emit(Inst::It {
+ cond,
+ insts: it_insts,
+ });
+ }
+ }
+ Opcode::Trueif => {
+ let cmp_insn = ctx
+ .get_input(inputs[0].insn, inputs[0].input)
+ .inst
+ .unwrap()
+ .0;
+ debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
+ emit_cmp(ctx, cmp_insn);
+
+ let condcode = inst_condcode(ctx.data(insn)).unwrap();
+ let cond = lower_condcode(condcode);
+ let rd = output_to_reg(ctx, outputs[0]);
+
+ let mut it_insts = vec![];
+ it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 1 }, true));
+ it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 0 }, false));
+
+ ctx.emit(Inst::It {
+ cond,
+ insts: it_insts,
+ });
+ }
+ Opcode::Select | Opcode::Selectif => {
+ let cond = if op == Opcode::Select {
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend);
+ ctx.emit(Inst::CmpImm8 { rn, imm8: 0 });
+ Cond::Ne
+ } else {
+ // Verification ensures that the input is always a single-def ifcmp.
+ let cmp_insn = ctx
+ .get_input(inputs[0].insn, inputs[0].input)
+ .inst
+ .unwrap()
+ .0;
+ debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
+ emit_cmp(ctx, cmp_insn);
+
+ let condcode = inst_condcode(ctx.data(insn)).unwrap();
+ lower_condcode(condcode)
+ };
+ let r1 = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+ let r2 = input_to_reg(ctx, inputs[2], NarrowValueMode::None);
+ let out_reg = output_to_reg(ctx, outputs[0]);
+
+ let mut it_insts = vec![];
+ it_insts.push(CondInst::new(Inst::mov(out_reg, r1), true));
+ it_insts.push(CondInst::new(Inst::mov(out_reg, r2), false));
+
+ ctx.emit(Inst::It {
+ cond,
+ insts: it_insts,
+ });
+ }
+ Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
+ let off = ldst_offset(ctx.data(insn)).unwrap();
+ let elem_ty = match op {
+ Opcode::Istore8 => I8,
+ Opcode::Istore16 => I16,
+ Opcode::Istore32 => I32,
+ Opcode::Store => ctx.input_ty(insn, 0),
+ _ => unreachable!(),
+ };
+ if elem_ty.bits() > 32 {
+ unimplemented!()
+ }
+ let bits = elem_ty.bits() as u8;
+
+ assert_eq!(inputs.len(), 2, "only one input for store memory operands");
+ let rt = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let base = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+
+ let mem = AMode::RegOffset(base, i64::from(off));
+
+ ctx.emit(Inst::Store { rt, mem, bits });
+ }
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32 => {
+ let off = ldst_offset(ctx.data(insn)).unwrap();
+ let elem_ty = match op {
+ Opcode::Sload8 | Opcode::Uload8 => I8,
+ Opcode::Sload16 | Opcode::Uload16 => I16,
+ Opcode::Sload32 | Opcode::Uload32 => I32,
+ Opcode::Load => ctx.output_ty(insn, 0),
+ _ => unreachable!(),
+ };
+ if elem_ty.bits() > 32 {
+ unimplemented!()
+ }
+ let bits = elem_ty.bits() as u8;
+
+ let sign_extend = match op {
+ Opcode::Sload8 | Opcode::Sload16 | Opcode::Sload32 => true,
+ _ => false,
+ };
+ let out_reg = output_to_reg(ctx, outputs[0]);
+
+ assert_eq!(inputs.len(), 2, "only one input for load memory operands");
+ let base = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+ let mem = AMode::RegOffset(base, i64::from(off));
+
+ ctx.emit(Inst::Load {
+ rt: out_reg,
+ mem,
+ bits,
+ sign_extend,
+ });
+ }
+ Opcode::Uextend | Opcode::Sextend => {
+ let output_ty = ty.unwrap();
+ let input_ty = ctx.input_ty(insn, 0);
+ let from_bits = input_ty.bits() as u8;
+ let to_bits = 32;
+ let signed = op == Opcode::Sextend;
+
+ let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = output_to_reg(ctx, outputs[0]);
+
+ if output_ty.bits() > 32 {
+ panic!("Unexpected output type {}", output_ty);
+ }
+ if from_bits < to_bits {
+ ctx.emit(Inst::Extend {
+ rd,
+ rm,
+ from_bits,
+ signed,
+ });
+ }
+ }
+ Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => {
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend);
+ let rd = output_to_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+
+ ctx.emit(Inst::gen_move(rd, rn, ty));
+ }
+ Opcode::Copy => {
+ let rd = output_to_reg(ctx, outputs[0]);
+ let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+ let ty = ctx.input_ty(insn, 0);
+
+ ctx.emit(Inst::gen_move(rd, rn, ty));
+ }
+ Opcode::Debugtrap => {
+ ctx.emit(Inst::Bkpt);
+ }
+ Opcode::Trap => {
+ let trap_info = inst_trapcode(ctx.data(insn)).unwrap();
+ ctx.emit(Inst::Udf { trap_info })
+ }
+ Opcode::Trapif => {
+ let cmp_insn = ctx
+ .get_input(inputs[0].insn, inputs[0].input)
+ .inst
+ .unwrap()
+ .0;
+ debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
+ emit_cmp(ctx, cmp_insn);
+
+ let trap_info = inst_trapcode(ctx.data(insn)).unwrap();
+ let condcode = inst_condcode(ctx.data(insn)).unwrap();
+ let cond = lower_condcode(condcode);
+
+ ctx.emit(Inst::TrapIf { cond, trap_info });
+ }
+ Opcode::FallthroughReturn | Opcode::Return => {
+ for (i, input) in inputs.iter().enumerate() {
+ let reg = input_to_reg(ctx, *input, NarrowValueMode::None);
+ let retval_reg = ctx.retval(i);
+ let ty = ctx.input_ty(insn, i);
+
+ ctx.emit(Inst::gen_move(retval_reg, reg, ty));
+ }
+ }
+ Opcode::Call | Opcode::CallIndirect => {
+ let caller_conv = ctx.abi().call_conv();
+ let (mut abi, inputs) = match op {
+ Opcode::Call => {
+ let (extname, dist) = ctx.call_target(insn).unwrap();
+ let extname = extname.clone();
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len(), sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ Arm32ABICaller::from_func(sig, &extname, dist, caller_conv)?,
+ &inputs[..],
+ )
+ }
+ Opcode::CallIndirect => {
+ let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend);
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len() - 1, sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ Arm32ABICaller::from_ptr(sig, ptr, op, caller_conv)?,
+ &inputs[1..],
+ )
+ }
+ _ => unreachable!(),
+ };
+ assert_eq!(inputs.len(), abi.num_args());
+ for (i, input) in inputs.iter().enumerate().filter(|(i, _)| *i <= 3) {
+ let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = output_to_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ }
+ _ => panic!("lowering {} unimplemented!", op),
+ }
+
+ Ok(())
+}
+
+pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+) -> CodegenResult<()> {
+ // A block should end with at most two branches. The first may be a
+ // conditional branch; a conditional branch can be followed only by an
+ // unconditional branch or fallthrough. Otherwise, if only one branch,
+ // it may be an unconditional branch, a fallthrough, a return, or a
+ // trap. These conditions are verified by `is_ebb_basic()` during the
+ // verifier pass.
+ assert!(branches.len() <= 2);
+
+ if branches.len() == 2 {
+ // Must be a conditional branch followed by an unconditional branch.
+ let op0 = ctx.data(branches[0]).opcode();
+ let op1 = ctx.data(branches[1]).opcode();
+
+ assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
+ let taken = BranchTarget::Label(targets[0]);
+ let not_taken = match op1 {
+ Opcode::Jump => BranchTarget::Label(targets[1]),
+ Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
+ _ => unreachable!(), // assert above.
+ };
+ match op0 {
+ Opcode::Brz | Opcode::Brnz => {
+ let rn = input_to_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ NarrowValueMode::ZeroExtend,
+ );
+ let cond = if op0 == Opcode::Brz {
+ Cond::Eq
+ } else {
+ Cond::Ne
+ };
+
+ ctx.emit(Inst::CmpImm8 { rn, imm8: 0 });
+ ctx.emit(Inst::CondBr {
+ taken,
+ not_taken,
+ cond,
+ });
+ }
+ _ => unimplemented!(),
+ }
+ } else {
+ // Must be an unconditional branch or an indirect branch.
+ let op = ctx.data(branches[0]).opcode();
+ match op {
+ Opcode::Jump | Opcode::Fallthrough => {
+ assert_eq!(branches.len(), 1);
+ // In the Fallthrough case, the machine-independent driver
+ // fills in `targets[0]` with our fallthrough block, so this
+ // is valid for both Jump and Fallthrough.
+ ctx.emit(Inst::Jump {
+ dest: BranchTarget::Label(targets[0]),
+ });
+ }
+ _ => unimplemented!(),
+ }
+ }
+
+ Ok(())
+}
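// Editor's note (illustrative, not part of the vendored patch above): for a two-instruction
// branch group such as `brz v0, block1; jump block2`, the Brz/Brnz arm above emits two vcode
// instructions,
//
//     Inst::CmpImm8 { rn, imm8: 0 }
//     Inst::CondBr { taken: block1, not_taken: block2, cond: Cond::Eq }
//
// which encode to roughly `cmp rN, #0; beq block1; b block2`; the final unconditional branch
// may later be elided when block2 is the fallthrough block. Register and block names here are
// placeholders chosen for the example.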
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs
new file mode 100644
index 0000000000..4b9701fd1d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs
@@ -0,0 +1,123 @@
+//! 32-bit ARM Instruction Set Architecture.
+
+use crate::ir::condcodes::IntCC;
+use crate::ir::Function;
+use crate::isa::Builder as IsaBuilder;
+use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
+use crate::result::CodegenResult;
+use crate::settings;
+
+use alloc::boxed::Box;
+use regalloc::{PrettyPrint, RealRegUniverse};
+use target_lexicon::{Architecture, ArmArchitecture, Triple};
+
+// New backend:
+mod abi;
+mod inst;
+mod lower;
+mod lower_inst;
+
+use inst::{create_reg_universe, EmitInfo};
+
+/// An ARM32 backend.
+pub struct Arm32Backend {
+ triple: Triple,
+ flags: settings::Flags,
+ reg_universe: RealRegUniverse,
+}
+
+impl Arm32Backend {
+ /// Create a new ARM32 backend with the given (shared) flags.
+ pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> Arm32Backend {
+ let reg_universe = create_reg_universe();
+ Arm32Backend {
+ triple,
+ flags,
+ reg_universe,
+ }
+ }
+
+ fn compile_vcode(
+ &self,
+ func: &Function,
+ flags: settings::Flags,
+ ) -> CodegenResult<VCode<inst::Inst>> {
+ // This performs lowering to VCode, register-allocates the code, computes
+ // block layout and finalizes branches. The result is ready for binary emission.
+ let emit_info = EmitInfo::new(flags.clone());
+ let abi = Box::new(abi::Arm32ABICallee::new(func, flags)?);
+ compile::compile::<Arm32Backend>(func, self, abi, emit_info)
+ }
+}
+
+impl MachBackend for Arm32Backend {
+ fn compile_function(
+ &self,
+ func: &Function,
+ want_disasm: bool,
+ ) -> CodegenResult<MachCompileResult> {
+ let flags = self.flags();
+ let vcode = self.compile_vcode(func, flags.clone())?;
+ let buffer = vcode.emit();
+ let frame_size = vcode.frame_size();
+
+ let disasm = if want_disasm {
+ Some(vcode.show_rru(Some(&create_reg_universe())))
+ } else {
+ None
+ };
+
+ let buffer = buffer.finish();
+
+ Ok(MachCompileResult {
+ buffer,
+ frame_size,
+ disasm,
+ unwind_info: None,
+ })
+ }
+
+ fn name(&self) -> &'static str {
+ "arm32"
+ }
+
+ fn triple(&self) -> Triple {
+ self.triple.clone()
+ }
+
+ fn flags(&self) -> &settings::Flags {
+ &self.flags
+ }
+
+ fn reg_universe(&self) -> &RealRegUniverse {
+ &self.reg_universe
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> IntCC {
+ // Carry flag set.
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> IntCC {
+ // Carry flag clear.
+ IntCC::UnsignedLessThan
+ }
+}
+
+/// Create a new `isa::Builder`.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+ assert!(match triple.architecture {
+ Architecture::Arm(ArmArchitecture::Arm)
+ | Architecture::Arm(ArmArchitecture::Armv7)
+ | Architecture::Arm(ArmArchitecture::Armv6) => true,
+ _ => false,
+ });
+ IsaBuilder {
+ triple,
+ setup: settings::builder(),
+ constructor: |triple, shared_flags, _| {
+ let backend = Arm32Backend::new_with_flags(triple, shared_flags);
+ Box::new(TargetIsaAdapter::new(backend))
+ },
+ }
+}
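// Editor's sketch (illustrative, not part of the vendored patch above): a minimal driver for
// the backend defined in this file, written as if it lived at the bottom of this module. It
// assumes a `Function` built elsewhere; the triple string is an illustrative choice.
fn _arm32_compile_sketch(func: &Function) -> CodegenResult<MachCompileResult> {
    use core::str::FromStr;

    let flags = settings::Flags::new(settings::builder());
    let triple = Triple::from_str("armv7-unknown-linux-gnueabihf").expect("valid arm triple");
    let backend = Arm32Backend::new_with_flags(triple, flags);
    // `compile_function` comes from the `MachBackend` impl above; no disassembly is requested.
    backend.compile_function(func, /* want_disasm = */ false)
}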
diff --git a/third_party/rust/cranelift-codegen/src/isa/call_conv.rs b/third_party/rust/cranelift-codegen/src/isa/call_conv.rs
new file mode 100644
index 0000000000..61a94e5a43
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/call_conv.rs
@@ -0,0 +1,106 @@
+use crate::settings::{self, LibcallCallConv};
+use core::fmt;
+use core::str;
+use target_lexicon::{CallingConvention, Triple};
+
+#[cfg(feature = "enable-serde")]
+use serde::{Deserialize, Serialize};
+
+/// Calling convention identifiers.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+pub enum CallConv {
+ /// Best performance, not ABI-stable
+ Fast,
+ /// Smallest caller code size, not ABI-stable
+ Cold,
+ /// System V-style convention used on many platforms
+ SystemV,
+ /// Windows "fastcall" convention, also used for x64 and ARM
+ WindowsFastcall,
+ /// SpiderMonkey WebAssembly convention on systems that natively use SystemV
+ BaldrdashSystemV,
+ /// SpiderMonkey WebAssembly convention on Windows
+ BaldrdashWindows,
+ /// SpiderMonkey WebAssembly convention for "ABI-2020", with extra TLS
+ /// register slots in the frame.
+ Baldrdash2020,
+ /// Specialized convention for the probestack function
+ Probestack,
+}
+
+impl CallConv {
+ /// Return the default calling convention for the given target triple.
+ pub fn triple_default(triple: &Triple) -> Self {
+ match triple.default_calling_convention() {
+ // Default to System V for unknown targets because most everything
+ // uses System V.
+ Ok(CallingConvention::SystemV) | Err(()) => Self::SystemV,
+ Ok(CallingConvention::WindowsFastcall) => Self::WindowsFastcall,
+ Ok(unimp) => unimplemented!("calling convention: {:?}", unimp),
+ }
+ }
+
+ /// Returns the calling convention used for libcalls according to the current flags.
+ pub fn for_libcall(flags: &settings::Flags, default_call_conv: CallConv) -> Self {
+ match flags.libcall_call_conv() {
+ LibcallCallConv::IsaDefault => default_call_conv,
+ LibcallCallConv::Fast => Self::Fast,
+ LibcallCallConv::Cold => Self::Cold,
+ LibcallCallConv::SystemV => Self::SystemV,
+ LibcallCallConv::WindowsFastcall => Self::WindowsFastcall,
+ LibcallCallConv::BaldrdashSystemV => Self::BaldrdashSystemV,
+ LibcallCallConv::BaldrdashWindows => Self::BaldrdashWindows,
+ LibcallCallConv::Baldrdash2020 => Self::Baldrdash2020,
+ LibcallCallConv::Probestack => Self::Probestack,
+ }
+ }
+
+ /// Is the calling convention extending the Windows Fastcall ABI?
+ pub fn extends_windows_fastcall(self) -> bool {
+ match self {
+ Self::WindowsFastcall | Self::BaldrdashWindows => true,
+ _ => false,
+ }
+ }
+
+ /// Is the calling convention extending the Baldrdash ABI?
+ pub fn extends_baldrdash(self) -> bool {
+ match self {
+ Self::BaldrdashSystemV | Self::BaldrdashWindows | Self::Baldrdash2020 => true,
+ _ => false,
+ }
+ }
+}
+
+impl fmt::Display for CallConv {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.write_str(match *self {
+ Self::Fast => "fast",
+ Self::Cold => "cold",
+ Self::SystemV => "system_v",
+ Self::WindowsFastcall => "windows_fastcall",
+ Self::BaldrdashSystemV => "baldrdash_system_v",
+ Self::BaldrdashWindows => "baldrdash_windows",
+ Self::Baldrdash2020 => "baldrdash_2020",
+ Self::Probestack => "probestack",
+ })
+ }
+}
+
+impl str::FromStr for CallConv {
+ type Err = ();
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ match s {
+ "fast" => Ok(Self::Fast),
+ "cold" => Ok(Self::Cold),
+ "system_v" => Ok(Self::SystemV),
+ "windows_fastcall" => Ok(Self::WindowsFastcall),
+ "baldrdash_system_v" => Ok(Self::BaldrdashSystemV),
+ "baldrdash_windows" => Ok(Self::BaldrdashWindows),
+ "baldrdash_2020" => Ok(Self::Baldrdash2020),
+ "probestack" => Ok(Self::Probestack),
+ _ => Err(()),
+ }
+ }
+}
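// Editor's sketch (illustrative, not part of the vendored patch above): exercising the
// conversions defined in this file. The triple string is an illustrative choice.
fn _call_conv_sketch() {
    use core::str::FromStr;

    assert_eq!(CallConv::from_str("baldrdash_2020"), Ok(CallConv::Baldrdash2020));
    assert!(CallConv::Baldrdash2020.extends_baldrdash());
    assert!(!CallConv::SystemV.extends_windows_fastcall());

    // Unknown or SystemV-style triples default to the System V convention, per triple_default.
    let triple = Triple::from_str("x86_64-unknown-linux-gnu").expect("valid triple");
    assert_eq!(CallConv::triple_default(&triple), CallConv::SystemV);
}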
diff --git a/third_party/rust/cranelift-codegen/src/isa/constraints.rs b/third_party/rust/cranelift-codegen/src/isa/constraints.rs
new file mode 100644
index 0000000000..c87c3bd9d4
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/constraints.rs
@@ -0,0 +1,207 @@
+//! Register constraints for instruction operands.
+//!
+//! An encoding recipe specifies how an instruction is encoded as binary machine code, but it only
+//! works if the operands and results satisfy certain constraints. Constraints on immediate
+//! operands are checked by instruction predicates when the recipe is chosen.
+//!
+//! It is the register allocator's job to make sure that the register constraints on value operands
+//! are satisfied.
+
+use crate::binemit::CodeOffset;
+use crate::ir::{Function, Inst, ValueLoc};
+use crate::isa::{RegClass, RegUnit};
+use crate::regalloc::RegDiversions;
+
+/// Register constraint for a single value operand or instruction result.
+#[derive(PartialEq, Debug)]
+pub struct OperandConstraint {
+ /// The kind of constraint.
+ pub kind: ConstraintKind,
+
+ /// The register class of the operand.
+ ///
+ /// This applies to all kinds of constraints, but with slightly different meaning.
+ pub regclass: RegClass,
+}
+
+impl OperandConstraint {
+ /// Check if this operand constraint is satisfied by the given value location.
+ /// For tied constraints, this only checks the register class, not that the
+ /// counterpart operand has the same value location.
+ pub fn satisfied(&self, loc: ValueLoc) -> bool {
+ match self.kind {
+ ConstraintKind::Reg | ConstraintKind::Tied(_) => {
+ if let ValueLoc::Reg(reg) = loc {
+ self.regclass.contains(reg)
+ } else {
+ false
+ }
+ }
+ ConstraintKind::FixedReg(reg) | ConstraintKind::FixedTied(reg) => {
+ loc == ValueLoc::Reg(reg) && self.regclass.contains(reg)
+ }
+ ConstraintKind::Stack => {
+ if let ValueLoc::Stack(_) = loc {
+ true
+ } else {
+ false
+ }
+ }
+ }
+ }
+}
+
+/// The different kinds of operand constraints.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum ConstraintKind {
+ /// This operand or result must be a register from the given register class.
+ Reg,
+
+ /// This operand or result must be a fixed register.
+ ///
+ /// The constraint's `regclass` field is the top-level register class containing the fixed
+ /// register.
+ FixedReg(RegUnit),
+
+ /// This result value must use the same register as an input value operand.
+ ///
+ /// The associated number is the index of the input value operand this result is tied to. The
+ /// constraint's `regclass` field is the same as the tied operand's register class.
+ ///
+ /// When an (in, out) operand pair is tied, this constraint kind appears in both the `ins` and
+ /// the `outs` arrays. The constraint for the in operand is `Tied(out)`, and the constraint for
+ /// the out operand is `Tied(in)`.
+ Tied(u8),
+
+ /// This operand must be a fixed register, and it has a tied counterpart.
+ ///
+ /// This works just like `FixedReg`, but additionally indicates that there are identical
+ /// input/output operands for this fixed register. For an input operand, this means that the
+ /// value will be clobbered by the instruction.
+ FixedTied(RegUnit),
+
+ /// This operand must be a value in a stack slot.
+ ///
+ /// The constraint's `regclass` field is the register class that would normally be used to load
+ /// and store values of this type.
+ Stack,
+}
+
+/// Value operand constraints for an encoding recipe.
+#[derive(PartialEq, Clone)]
+pub struct RecipeConstraints {
+ /// Constraints for the instruction's fixed value operands.
+ ///
+ /// If the instruction takes a variable number of operands, the register constraints for those
+ /// operands must be computed dynamically.
+ ///
+ /// - For branches and jumps, block arguments must match the expectations of the destination block.
+ /// - For calls and returns, the calling convention ABI specifies constraints.
+ pub ins: &'static [OperandConstraint],
+
+ /// Constraints for the instruction's fixed results.
+ ///
+ /// If the instruction produces a variable number of results, it's probably a call and the
+ /// constraints must be derived from the calling convention ABI.
+ pub outs: &'static [OperandConstraint],
+
+ /// Are any of the input constraints `FixedReg` or `FixedTied`?
+ pub fixed_ins: bool,
+
+ /// Are any of the output constraints `FixedReg` or `FixedTied`?
+ pub fixed_outs: bool,
+
+ /// Are any of the input/output constraints `Tied` (but not `FixedTied`)?
+ pub tied_ops: bool,
+
+ /// Does this instruction clobber the CPU flags?
+ ///
+ /// When true, SSA values of type `iflags` or `fflags` can not be live across the instruction.
+ pub clobbers_flags: bool,
+}
+
+impl RecipeConstraints {
+ /// Check that these constraints are satisfied by the operands on `inst`.
+ pub fn satisfied(&self, inst: Inst, divert: &RegDiversions, func: &Function) -> bool {
+ for (&arg, constraint) in func.dfg.inst_args(inst).iter().zip(self.ins) {
+ let loc = divert.get(arg, &func.locations);
+
+ if let ConstraintKind::Tied(out_index) = constraint.kind {
+ let out_val = func.dfg.inst_results(inst)[out_index as usize];
+ let out_loc = func.locations[out_val];
+ if loc != out_loc {
+ return false;
+ }
+ }
+
+ if !constraint.satisfied(loc) {
+ return false;
+ }
+ }
+
+ for (&arg, constraint) in func.dfg.inst_results(inst).iter().zip(self.outs) {
+ let loc = divert.get(arg, &func.locations);
+ if !constraint.satisfied(loc) {
+ return false;
+ }
+ }
+
+ true
+ }
+}
+
+/// Constraints on the range of a branch instruction.
+///
+/// A branch instruction usually encodes its destination as a signed n-bit offset from an origin.
+/// The origin depends on the ISA and the specific instruction:
+///
+/// - RISC-V and ARM Aarch64 use the address of the branch instruction, `origin = 0`.
+/// - x86 uses the address of the instruction following the branch, `origin = 2` for a 2-byte
+/// branch instruction.
+/// - ARM's A32 encoding uses the address of the branch instruction + 8 bytes, `origin = 8`.
+#[derive(Clone, Copy, Debug)]
+pub struct BranchRange {
+ /// Offset in bytes from the address of the branch instruction to the origin used for computing
+ /// the branch displacement. This is the destination of a branch that encodes a 0 displacement.
+ pub origin: u8,
+
+ /// Number of bits in the signed byte displacement encoded in the instruction. This does not
+ /// account for branches that can only target aligned addresses.
+ pub bits: u8,
+}
+
+impl BranchRange {
+ /// Determine if this branch range can represent the range from `branch` to `dest`, where
+ /// `branch` is the code offset of the branch instruction itself and `dest` is the code offset
+ /// of the destination block header.
+ ///
+ /// This method does not detect if the range is larger than 2 GB.
+ pub fn contains(self, branch: CodeOffset, dest: CodeOffset) -> bool {
+ let d = dest.wrapping_sub(branch + CodeOffset::from(self.origin)) as i32;
+ let s = 32 - self.bits;
+ d == d << s >> s
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn branch_range() {
+ // ARM T1 branch.
+ let t1 = BranchRange { origin: 4, bits: 9 };
+ assert!(t1.contains(0, 0));
+ assert!(t1.contains(0, 2));
+ assert!(t1.contains(2, 0));
+ assert!(t1.contains(1000, 1000));
+
+ // Forward limit.
+ assert!(t1.contains(1000, 1258));
+ assert!(!t1.contains(1000, 1260));
+
+ // Backward limit
+ assert!(t1.contains(1000, 748));
+ assert!(!t1.contains(1000, 746));
+ }
+}
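// Editor's note (illustrative, not part of the vendored patch above): the T1 figures in the
// test follow directly from `contains`. With `origin = 4` and `bits = 9`, the displacement
// `d = dest - (branch + 4)` must fit in 9 signed bits, i.e. -256 <= d <= 255. For
// branch = 1000: dest = 1258 gives d = 254 (in range) while dest = 1260 gives d = 256 (out of
// range), and dest = 748 gives d = -256 (in range) while dest = 746 gives d = -258 (out).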
diff --git a/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs
new file mode 100644
index 0000000000..e21557497e
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs
@@ -0,0 +1,292 @@
+//! Support types for generated encoding tables.
+//!
+//! This module contains types and functions for working with the encoding tables generated by
+//! `cranelift-codegen/meta/src/gen_encodings.rs`.
+
+use crate::constant_hash::{probe, Table};
+use crate::ir::{Function, InstructionData, Opcode, Type};
+use crate::isa::{Encoding, Legalize};
+use crate::settings::PredicateView;
+use core::ops::Range;
+
+/// A recipe predicate.
+///
+/// This is a predicate function capable of testing ISA and instruction predicates simultaneously.
+///
+/// A None predicate is always satisfied.
+pub type RecipePredicate = Option<fn(PredicateView, &InstructionData) -> bool>;
+
+/// An instruction predicate.
+///
+/// This is a predicate function that needs to be tested in addition to the recipe predicate. It
+/// can't depend on ISA settings.
+pub type InstPredicate = fn(&Function, &InstructionData) -> bool;
+
+/// Legalization action to perform when no encoding can be found for an instruction.
+///
+/// This is an index into an ISA-specific table of legalization actions.
+pub type LegalizeCode = u8;
+
+/// Level 1 hash table entry.
+///
+/// One level 1 hash table is generated per CPU mode. This table is keyed by the controlling type
+/// variable, using `INVALID` for non-polymorphic instructions.
+///
+/// The hash table values are references to level 2 hash tables, encoded as an offset in `LEVEL2`
+/// where the table begins, and the binary logarithm of its length. All the level 2 hash tables
+/// have a power-of-two size.
+///
+/// Entries are generic over the offset type. It will typically be `u32` or `u16`, depending on the
+/// size of the `LEVEL2` table.
+///
+/// Empty entries are encoded with a `!0` value for `log2len` which will always be out of range.
+/// Entries that have a `legalize` value but no level 2 table have an `offset` field that is out of
+/// bounds.
+pub struct Level1Entry<OffT: Into<u32> + Copy> {
+ pub ty: Type,
+ pub log2len: u8,
+ pub legalize: LegalizeCode,
+ pub offset: OffT,
+}
+
+impl<OffT: Into<u32> + Copy> Level1Entry<OffT> {
+ /// Get the level 2 table range indicated by this entry.
+ fn range(&self) -> Range<usize> {
+ let b = self.offset.into() as usize;
+ b..b + (1 << self.log2len)
+ }
+}
+
+impl<OffT: Into<u32> + Copy> Table<Type> for [Level1Entry<OffT>] {
+ fn len(&self) -> usize {
+ self.len()
+ }
+
+ fn key(&self, idx: usize) -> Option<Type> {
+ if self[idx].log2len != !0 {
+ Some(self[idx].ty)
+ } else {
+ None
+ }
+ }
+}
+
+/// Level 2 hash table entry.
+///
+/// The second level hash tables are keyed by `Opcode`, and contain an offset into the `ENCLISTS`
+/// table where the encoding recipes for the instruction are stored.
+///
+/// Entries are generic over the offset type which depends on the size of `ENCLISTS`. A `u16`
+/// offset allows the entries to be only 32 bits each. There is no benefit to dropping down to `u8`
+/// for tiny ISAs. The entries won't shrink below 32 bits since the opcode is expected to be 16
+/// bits.
+///
+/// Empty entries are encoded with a `None` `opcode` field.
+pub struct Level2Entry<OffT: Into<u32> + Copy> {
+ pub opcode: Option<Opcode>,
+ pub offset: OffT,
+}
+
+impl<OffT: Into<u32> + Copy> Table<Opcode> for [Level2Entry<OffT>] {
+ fn len(&self) -> usize {
+ self.len()
+ }
+
+ fn key(&self, idx: usize) -> Option<Opcode> {
+ self[idx].opcode
+ }
+}
+
+/// Two-level hash table lookup and iterator construction.
+///
+/// Given the controlling type variable and instruction opcode, find the corresponding encoding
+/// list.
+///
+/// Returns an iterator that produces legal encodings for `inst`.
+pub fn lookup_enclist<'a, OffT1, OffT2>(
+ ctrl_typevar: Type,
+ inst: &'a InstructionData,
+ func: &'a Function,
+ level1_table: &'static [Level1Entry<OffT1>],
+ level2_table: &'static [Level2Entry<OffT2>],
+ enclist: &'static [EncListEntry],
+ legalize_actions: &'static [Legalize],
+ recipe_preds: &'static [RecipePredicate],
+ inst_preds: &'static [InstPredicate],
+ isa_preds: PredicateView<'a>,
+) -> Encodings<'a>
+where
+ OffT1: Into<u32> + Copy,
+ OffT2: Into<u32> + Copy,
+{
+ let (offset, legalize) = match probe(level1_table, ctrl_typevar, ctrl_typevar.index()) {
+ Err(l1idx) => {
+ // No level 1 entry found for the type.
+ // We have a sentinel entry with the default legalization code.
+ (!0, level1_table[l1idx].legalize)
+ }
+ Ok(l1idx) => {
+ // We have a valid level 1 entry for this type.
+ let l1ent = &level1_table[l1idx];
+ let offset = match level2_table.get(l1ent.range()) {
+ Some(l2tab) => {
+ let opcode = inst.opcode();
+ match probe(l2tab, opcode, opcode as usize) {
+ Ok(l2idx) => l2tab[l2idx].offset.into() as usize,
+ Err(_) => !0,
+ }
+ }
+ // The l1ent range is invalid. This means that we just have a customized
+ // legalization code for this type. The level 2 table is empty.
+ None => !0,
+ };
+ (offset, l1ent.legalize)
+ }
+ };
+
+ // Now we have an offset into `enclist` that is `!0` when no encoding list could be found.
+ // The default legalization code is always valid.
+ Encodings::new(
+ offset,
+ legalize,
+ inst,
+ func,
+ enclist,
+ legalize_actions,
+ recipe_preds,
+ inst_preds,
+ isa_preds,
+ )
+}
+
+/// Encoding list entry.
+///
+/// Encoding lists are represented as sequences of u16 words.
+pub type EncListEntry = u16;
+
+/// Number of bits used to represent a predicate. c.f. `meta/src/gen_encodings.rs`.
+const PRED_BITS: u8 = 12;
+const PRED_MASK: usize = (1 << PRED_BITS) - 1;
+/// First code word representing a predicate check. c.f. `meta/src/gen_encodings.rs`.
+const PRED_START: usize = 0x1000;
+
+/// An iterator over legal encodings for the instruction.
+pub struct Encodings<'a> {
+ // Current offset into `enclist`, or out of bounds after we've reached the end.
+ offset: usize,
+ // Legalization code to use if no encoding is found.
+ legalize: LegalizeCode,
+ inst: &'a InstructionData,
+ func: &'a Function,
+ enclist: &'static [EncListEntry],
+ legalize_actions: &'static [Legalize],
+ recipe_preds: &'static [RecipePredicate],
+ inst_preds: &'static [InstPredicate],
+ isa_preds: PredicateView<'a>,
+}
+
+impl<'a> Encodings<'a> {
+ /// Creates a new instance of `Encodings`.
+ ///
+ /// This iterator searches for encodings that apply to the given instruction. The encoding
+ /// lists are laid out such that the first call to `next` returns a valid entry in the list
+ /// or `None`.
+ pub fn new(
+ offset: usize,
+ legalize: LegalizeCode,
+ inst: &'a InstructionData,
+ func: &'a Function,
+ enclist: &'static [EncListEntry],
+ legalize_actions: &'static [Legalize],
+ recipe_preds: &'static [RecipePredicate],
+ inst_preds: &'static [InstPredicate],
+ isa_preds: PredicateView<'a>,
+ ) -> Self {
+ Encodings {
+ offset,
+ inst,
+ func,
+ legalize,
+ isa_preds,
+ recipe_preds,
+ inst_preds,
+ enclist,
+ legalize_actions,
+ }
+ }
+
+ /// Get the legalization action that caused the enumeration of encodings to stop.
+ /// This can be the default legalization action for the type or a custom code for the
+ /// instruction.
+ ///
+ /// This method must only be called after the iterator returns `None`.
+ pub fn legalize(&self) -> Legalize {
+ debug_assert_eq!(self.offset, !0, "Premature Encodings::legalize()");
+ self.legalize_actions[self.legalize as usize]
+ }
+
+ /// Check if the `rpred` recipe predicate is satisfied.
+ fn check_recipe(&self, rpred: RecipePredicate) -> bool {
+ match rpred {
+ Some(p) => p(self.isa_preds, self.inst),
+ None => true,
+ }
+ }
+
+ /// Check an instruction or isa predicate.
+ fn check_pred(&self, pred: usize) -> bool {
+ if let Some(&p) = self.inst_preds.get(pred) {
+ p(self.func, self.inst)
+ } else {
+ let pred = pred - self.inst_preds.len();
+ self.isa_preds.test(pred)
+ }
+ }
+}
+
+impl<'a> Iterator for Encodings<'a> {
+ type Item = Encoding;
+
+ fn next(&mut self) -> Option<Encoding> {
+ while let Some(entryref) = self.enclist.get(self.offset) {
+ let entry = *entryref as usize;
+
+ // Check for "recipe+bits".
+ let recipe = entry >> 1;
+ if let Some(&rpred) = self.recipe_preds.get(recipe) {
+ let bits = self.offset + 1;
+ if entry & 1 == 0 {
+ self.offset += 2; // Next entry.
+ } else {
+ self.offset = !0; // Stop.
+ }
+ if self.check_recipe(rpred) {
+ return Some(Encoding::new(recipe as u16, self.enclist[bits]));
+ }
+ continue;
+ }
+
+ // Check for "stop with legalize".
+ if entry < PRED_START {
+ self.legalize = (entry - 2 * self.recipe_preds.len()) as LegalizeCode;
+ self.offset = !0; // Stop.
+ return None;
+ }
+
+ // Finally, this must be a predicate entry.
+ let pred_entry = entry - PRED_START;
+ let skip = pred_entry >> PRED_BITS;
+ let pred = pred_entry & PRED_MASK;
+
+ if self.check_pred(pred) {
+ self.offset += 1;
+ } else if skip == 0 {
+ self.offset = !0; // Stop.
+ return None;
+ } else {
+ self.offset += 1 + skip;
+ }
+ }
+ None
+ }
+}
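// Editor's note (an illustrative reading of the iterator above, not part of the vendored
// patch): each `EncListEntry` word is interpreted as follows.
//
//  * If `entry >> 1` indexes into `recipe_preds`, the word names recipe `entry >> 1`; the next
//    word holds the encoding bits, and bit 0 of `entry` marks the last candidate in the list.
//  * Otherwise, if `entry < PRED_START`, the list ends with legalization code
//    `entry - 2 * recipe_preds.len()`.
//  * Otherwise the word is a predicate check: the low `PRED_BITS` bits of `entry - PRED_START`
//    select the predicate, and the remaining bits give the number of following words to skip
//    when the predicate fails (a skip of 0 stops the search).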
diff --git a/third_party/rust/cranelift-codegen/src/isa/encoding.rs b/third_party/rust/cranelift-codegen/src/isa/encoding.rs
new file mode 100644
index 0000000000..99894cab2c
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/encoding.rs
@@ -0,0 +1,163 @@
+//! The `Encoding` struct.
+
+use crate::binemit::CodeOffset;
+use crate::ir::{Function, Inst};
+use crate::isa::constraints::{BranchRange, RecipeConstraints};
+use crate::regalloc::RegDiversions;
+use core::fmt;
+
+/// Bits needed to encode an instruction as binary machine code.
+///
+/// The encoding consists of two parts, both specific to the target ISA: An encoding *recipe*, and
+/// encoding *bits*. The recipe determines the native instruction format and the mapping of
+/// operands to encoded bits. The encoding bits provide additional information to the recipe,
+/// typically parts of the opcode.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Encoding {
+ recipe: u16,
+ bits: u16,
+}
+
+impl Encoding {
+ /// Create a new `Encoding` containing `(recipe, bits)`.
+ pub fn new(recipe: u16, bits: u16) -> Self {
+ Self { recipe, bits }
+ }
+
+ /// Get the recipe number in this encoding.
+ pub fn recipe(self) -> usize {
+ self.recipe as usize
+ }
+
+ /// Get the recipe-specific encoding bits.
+ pub fn bits(self) -> u16 {
+ self.bits
+ }
+
+ /// Is this a legal encoding, or the default placeholder?
+ pub fn is_legal(self) -> bool {
+ self != Self::default()
+ }
+}
+
+/// The default encoding is the illegal one.
+impl Default for Encoding {
+ fn default() -> Self {
+ Self::new(0xffff, 0xffff)
+ }
+}
+
+/// ISA-independent display of an encoding.
+impl fmt::Display for Encoding {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.is_legal() {
+ write!(f, "{}#{:02x}", self.recipe, self.bits)
+ } else {
+ write!(f, "-")
+ }
+ }
+}
+
+/// Temporary object that holds enough context to properly display an encoding.
+/// This is meant to be created by `EncInfo::display()`.
+pub struct DisplayEncoding {
+ pub encoding: Encoding,
+ pub recipe_names: &'static [&'static str],
+}
+
+impl fmt::Display for DisplayEncoding {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.encoding.is_legal() {
+ write!(
+ f,
+ "{}#{:02x}",
+ self.recipe_names[self.encoding.recipe()],
+ self.encoding.bits
+ )
+ } else {
+ write!(f, "-")
+ }
+ }
+}
+
+type SizeCalculatorFn = fn(&RecipeSizing, Encoding, Inst, &RegDiversions, &Function) -> u8;
+
+/// Returns the base size of the recipe, assuming it is fixed. This is the default for most
+/// encodings; others can be variable and longer than this base size, depending on the registers
+/// they use, in which case a platform-specific `compute_size` function is used instead.
+pub fn base_size(
+ sizing: &RecipeSizing,
+ _: Encoding,
+ _: Inst,
+ _: &RegDiversions,
+ _: &Function,
+) -> u8 {
+ sizing.base_size
+}
+
+/// Code size information for an encoding recipe.
+///
+/// Encoding recipes may have runtime-determined instruction size.
+pub struct RecipeSizing {
+ /// Minimum size in bytes of instructions encoded with this recipe.
+ pub base_size: u8,
+
+ /// Method computing the instruction's real size, given inputs and outputs.
+ pub compute_size: SizeCalculatorFn,
+
+ /// Allowed branch range in this recipe, if any.
+ ///
+ /// All encoding recipes for branches have exact branch range information.
+ pub branch_range: Option<BranchRange>,
+}
+
+/// Information about all the encodings in this ISA.
+#[derive(Clone)]
+pub struct EncInfo {
+ /// Constraints on value operands per recipe.
+ pub constraints: &'static [RecipeConstraints],
+
+ /// Code size information per recipe.
+ pub sizing: &'static [RecipeSizing],
+
+ /// Names of encoding recipes.
+ pub names: &'static [&'static str],
+}
+
+impl EncInfo {
+ /// Get the value operand constraints for `enc` if it is a legal encoding.
+ pub fn operand_constraints(&self, enc: Encoding) -> Option<&'static RecipeConstraints> {
+ self.constraints.get(enc.recipe())
+ }
+
+ /// Create an object that can display an ISA-dependent encoding properly.
+ pub fn display(&self, enc: Encoding) -> DisplayEncoding {
+ DisplayEncoding {
+ encoding: enc,
+ recipe_names: self.names,
+ }
+ }
+
+ /// Get the size in bytes of `inst`, if it were encoded with `enc`.
+ ///
+ /// Returns 0 for illegal encodings.
+ pub fn byte_size(
+ &self,
+ enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+ ) -> CodeOffset {
+ self.sizing.get(enc.recipe()).map_or(0, |s| {
+ let compute_size = s.compute_size;
+ CodeOffset::from(compute_size(&s, enc, inst, divert, func))
+ })
+ }
+
+ /// Get the branch range that is supported by `enc`, if any.
+ ///
+ /// This will never return `None` for a legal branch encoding.
+ pub fn branch_range(&self, enc: Encoding) -> Option<BranchRange> {
+ self.sizing.get(enc.recipe()).and_then(|s| s.branch_range)
+ }
+}
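// Editor's sketch (illustrative, not part of the vendored patch above): `Encoding` is a plain
// (recipe, bits) pair whose default value is the "illegal" placeholder.
fn _encoding_sketch() {
    let enc = Encoding::new(3, 0x2c);
    assert!(enc.is_legal());
    assert_eq!(enc.recipe(), 3);
    assert_eq!(enc.bits(), 0x2c);
    assert!(!Encoding::default().is_legal());
}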
diff --git a/third_party/rust/cranelift-codegen/src/isa/mod.rs b/third_party/rust/cranelift-codegen/src/isa/mod.rs
new file mode 100644
index 0000000000..2e56c025d0
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/mod.rs
@@ -0,0 +1,447 @@
+//! Instruction Set Architectures.
+//!
+//! The `isa` module provides a `TargetIsa` trait which provides the behavior specialization needed
+//! by the ISA-independent code generator. The sub-modules of this module provide definitions for
+//! the instruction sets that Cranelift can target. Each sub-module has its own implementation of
+//! `TargetIsa`.
+//!
+//! # Constructing a `TargetIsa` instance
+//!
+//! The target ISA is built from the following information:
+//!
+//! - The name of the target ISA as a string. Cranelift is a cross-compiler, so the ISA to target
+//! can be selected dynamically. Individual ISAs can be left out when Cranelift is compiled, so a
+//! string is used to identify the proper sub-module.
+//! - Values for settings that apply to all ISAs. This is represented by a `settings::Flags`
+//! instance.
+//! - Values for ISA-specific settings.
+//!
+//! The `isa::lookup()` function is the main entry point which returns an `isa::Builder`
+//! appropriate for the requested ISA:
+//!
+//! ```
+//! # extern crate cranelift_codegen;
+//! # #[macro_use] extern crate target_lexicon;
+//! use cranelift_codegen::isa;
+//! use cranelift_codegen::settings::{self, Configurable};
+//! use std::str::FromStr;
+//! use target_lexicon::Triple;
+//!
+//! let shared_builder = settings::builder();
+//! let shared_flags = settings::Flags::new(shared_builder);
+//!
+//! match isa::lookup(triple!("riscv32")) {
+//! Err(_) => {
+//! // The RISC-V target ISA is not available.
+//! }
+//! Ok(mut isa_builder) => {
+//! isa_builder.set("supports_m", "on");
+//! let isa = isa_builder.finish(shared_flags);
+//! }
+//! }
+//! ```
+//!
+//! The configured target ISA trait object is a `Box<TargetIsa>` which can be used for multiple
+//! concurrent function compilations.
+
+pub use crate::isa::call_conv::CallConv;
+pub use crate::isa::constraints::{
+ BranchRange, ConstraintKind, OperandConstraint, RecipeConstraints,
+};
+pub use crate::isa::enc_tables::Encodings;
+pub use crate::isa::encoding::{base_size, EncInfo, Encoding};
+pub use crate::isa::registers::{regs_overlap, RegClass, RegClassIndex, RegInfo, RegUnit};
+pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef};
+
+use crate::binemit;
+use crate::flowgraph;
+use crate::ir;
+#[cfg(feature = "unwind")]
+use crate::isa::unwind::systemv::RegisterMappingError;
+use crate::machinst::MachBackend;
+use crate::regalloc;
+use crate::result::CodegenResult;
+use crate::settings;
+use crate::settings::SetResult;
+use crate::timing;
+use alloc::borrow::Cow;
+use alloc::boxed::Box;
+use core::any::Any;
+use core::fmt;
+use core::fmt::{Debug, Formatter};
+use target_lexicon::{triple, Architecture, PointerWidth, Triple};
+use thiserror::Error;
+
+#[cfg(feature = "riscv")]
+mod riscv;
+
+#[cfg(feature = "x86")]
+mod x86;
+
+#[cfg(feature = "x64")]
+mod x64;
+
+#[cfg(feature = "arm32")]
+mod arm32;
+
+#[cfg(feature = "arm64")]
+pub(crate) mod aarch64;
+
+pub mod unwind;
+
+mod call_conv;
+mod constraints;
+mod enc_tables;
+mod encoding;
+pub mod registers;
+mod stack;
+
+#[cfg(test)]
+mod test_utils;
+
+/// Returns a builder that can create a corresponding `TargetIsa`
+/// or `Err(LookupError::SupportDisabled)` if not enabled.
+macro_rules! isa_builder {
+ ($name: ident, $feature: tt, $triple: ident) => {{
+ #[cfg(feature = $feature)]
+ {
+ Ok($name::isa_builder($triple))
+ }
+ #[cfg(not(feature = $feature))]
+ {
+ Err(LookupError::SupportDisabled)
+ }
+ }};
+}
+
+/// Look for an ISA for the given `triple`.
+/// Return a builder that can create a corresponding `TargetIsa`.
+pub fn lookup(triple: Triple) -> Result<Builder, LookupError> {
+ match triple.architecture {
+ Architecture::Riscv32 { .. } | Architecture::Riscv64 { .. } => {
+ isa_builder!(riscv, "riscv", triple)
+ }
+ Architecture::X86_32 { .. } | Architecture::X86_64 => {
+ if cfg!(feature = "x64") {
+ isa_builder!(x64, "x64", triple)
+ } else {
+ isa_builder!(x86, "x86", triple)
+ }
+ }
+ Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple),
+ Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple),
+ _ => Err(LookupError::Unsupported),
+ }
+}
+
+/// Look for a supported ISA with the given `name`.
+/// Return a builder that can create a corresponding `TargetIsa`.
+pub fn lookup_by_name(name: &str) -> Result<Builder, LookupError> {
+ use alloc::str::FromStr;
+ lookup(triple!(name))
+}
+
+/// Describes reason for target lookup failure
+#[derive(Error, PartialEq, Eq, Copy, Clone, Debug)]
+pub enum LookupError {
+ /// Support for this target was disabled in the current build.
+ #[error("Support for this target is disabled")]
+ SupportDisabled,
+
+ /// Support for this target has not yet been implemented.
+ #[error("Support for this target has not been implemented yet")]
+ Unsupported,
+}
+
+/// Builder for a `TargetIsa`.
+/// Modify the ISA-specific settings before creating the `TargetIsa` trait object with `finish`.
+#[derive(Clone)]
+pub struct Builder {
+ triple: Triple,
+ setup: settings::Builder,
+ constructor: fn(Triple, settings::Flags, settings::Builder) -> Box<dyn TargetIsa>,
+}
+
+impl Builder {
+ /// Combine the ISA-specific settings with the provided ISA-independent settings and allocate a
+ /// fully configured `TargetIsa` trait object.
+ pub fn finish(self, shared_flags: settings::Flags) -> Box<dyn TargetIsa> {
+ (self.constructor)(self.triple, shared_flags, self.setup)
+ }
+}
+
+impl settings::Configurable for Builder {
+ fn set(&mut self, name: &str, value: &str) -> SetResult<()> {
+ self.setup.set(name, value)
+ }
+
+ fn enable(&mut self, name: &str) -> SetResult<()> {
+ self.setup.enable(name)
+ }
+}
+
+/// After determining that an instruction doesn't have an encoding, how should we proceed to
+/// legalize it?
+///
+/// The `Encodings` iterator returns a legalization function to call.
+pub type Legalize =
+ fn(ir::Inst, &mut ir::Function, &mut flowgraph::ControlFlowGraph, &dyn TargetIsa) -> bool;
+
+/// This struct provides information that a frontend may need to know about a target to
+/// produce Cranelift IR for the target.
+#[derive(Clone, Copy, Hash)]
+pub struct TargetFrontendConfig {
+ /// The default calling convention of the target.
+ pub default_call_conv: CallConv,
+
+ /// The pointer width of the target.
+ pub pointer_width: PointerWidth,
+}
+
+impl TargetFrontendConfig {
+ /// Get the pointer type of this target.
+ pub fn pointer_type(self) -> ir::Type {
+ ir::Type::int(u16::from(self.pointer_bits())).unwrap()
+ }
+
+ /// Get the width of pointers on this target, in units of bits.
+ pub fn pointer_bits(self) -> u8 {
+ self.pointer_width.bits()
+ }
+
+ /// Get the width of pointers on this target, in units of bytes.
+ pub fn pointer_bytes(self) -> u8 {
+ self.pointer_width.bytes()
+ }
+}
+
+/// Methods that are specialized to a target ISA. Implies a Display trait that shows the
+/// shared flags, as well as any isa-specific flags.
+pub trait TargetIsa: fmt::Display + Send + Sync {
+ /// Get the name of this ISA.
+ fn name(&self) -> &'static str;
+
+ /// Get the target triple that was used to make this trait object.
+ fn triple(&self) -> &Triple;
+
+ /// Get the ISA-independent flags that were used to make this trait object.
+ fn flags(&self) -> &settings::Flags;
+
+ /// Get the default calling convention of this target.
+ fn default_call_conv(&self) -> CallConv {
+ CallConv::triple_default(self.triple())
+ }
+
+ /// Get the pointer type of this ISA.
+ fn pointer_type(&self) -> ir::Type {
+ ir::Type::int(u16::from(self.pointer_bits())).unwrap()
+ }
+
+ /// Get the width of pointers on this ISA.
+ fn pointer_width(&self) -> PointerWidth {
+ self.triple().pointer_width().unwrap()
+ }
+
+ /// Get the width of pointers on this ISA, in units of bits.
+ fn pointer_bits(&self) -> u8 {
+ self.pointer_width().bits()
+ }
+
+ /// Get the width of pointers on this ISA, in units of bytes.
+ fn pointer_bytes(&self) -> u8 {
+ self.pointer_width().bytes()
+ }
+
+ /// Get the information needed by frontends producing Cranelift IR.
+ fn frontend_config(&self) -> TargetFrontendConfig {
+ TargetFrontendConfig {
+ default_call_conv: self.default_call_conv(),
+ pointer_width: self.pointer_width(),
+ }
+ }
+
+ /// Does the CPU implement scalar comparisons using a CPU flags register?
+ fn uses_cpu_flags(&self) -> bool {
+ false
+ }
+
+ /// Does the CPU implement multi-register addressing?
+ fn uses_complex_addresses(&self) -> bool {
+ false
+ }
+
+ /// Get a data structure describing the registers in this ISA.
+ fn register_info(&self) -> RegInfo;
+
+ #[cfg(feature = "unwind")]
+ /// Map a Cranelift register to its corresponding DWARF register.
+ fn map_dwarf_register(&self, _: RegUnit) -> Result<u16, RegisterMappingError> {
+ Err(RegisterMappingError::UnsupportedArchitecture)
+ }
+
+ /// Returns an iterator over legal encodings for the instruction.
+ fn legal_encodings<'a>(
+ &'a self,
+ func: &'a ir::Function,
+ inst: &'a ir::InstructionData,
+ ctrl_typevar: ir::Type,
+ ) -> Encodings<'a>;
+
+ /// Encode an instruction after determining it is legal.
+ ///
+ /// If `inst` can legally be encoded in this ISA, produce the corresponding `Encoding` object.
+ /// Otherwise, return a `Legalize` action.
+ ///
+ /// This is also the main entry point for determining if an instruction is legal.
+ fn encode(
+ &self,
+ func: &ir::Function,
+ inst: &ir::InstructionData,
+ ctrl_typevar: ir::Type,
+ ) -> Result<Encoding, Legalize> {
+ let mut iter = self.legal_encodings(func, inst, ctrl_typevar);
+ iter.next().ok_or_else(|| iter.legalize())
+ }
+
+ /// Get a data structure describing the instruction encodings in this ISA.
+ fn encoding_info(&self) -> EncInfo;
+
+ /// Legalize a function signature.
+ ///
+ /// This is used to legalize both the signature of the function being compiled and any called
+ /// functions. The signature should be modified by adding `ArgumentLoc` annotations to all
+ /// arguments and return values.
+ ///
+ /// Arguments with types that are not supported by the ABI can be expanded into multiple
+ /// arguments:
+ ///
+ /// - Integer types that are too large to fit in a register can be broken into multiple
+ /// arguments of a smaller integer type.
+ /// - Floating point types can be bit-cast to an integer type of the same size, and possibly
+ /// broken into smaller integer types.
+ /// - Vector types can be bit-cast and broken down into smaller vectors or scalars.
+ ///
+ /// The legalizer will adapt argument and return values as necessary at all ABI boundaries.
+ ///
+ /// When this function is called to legalize the signature of the function currently being
+ /// compiled, `current` is true. The legalized signature can then also contain special purpose
+ /// arguments and return values such as:
+ ///
+ /// - A `link` argument representing the link registers on RISC architectures that don't push
+ /// the return address on the stack.
+ /// - A `link` return value which will receive the value that was passed to the `link`
+ /// argument.
+ /// - An `sret` argument can be added if one wasn't present already. This is necessary if the
+ /// signature returns more values than registers are available for returning values.
+ /// - An `sret` return value can be added if the ABI requires a function to return its `sret`
+ /// argument in a register.
+ ///
+ /// Arguments and return values for the caller's frame pointer and other callee-saved registers
+ /// should not be added by this function. These arguments are not added until after register
+ /// allocation.
+ fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool);
+
+ /// Get the register class that should be used to represent an ABI argument or return value of
+ /// type `ty`. This should be the top-level register class that contains the argument
+ /// registers.
+ ///
+ /// This function can assume that it will only be asked to provide register classes for types
+ /// that `legalize_signature()` produces in `ArgumentLoc::Reg` entries.
+ fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass;
+
+ /// Get the set of allocatable registers that can be used when compiling `func`.
+ ///
+ /// This set excludes reserved registers like the stack pointer and other special-purpose
+ /// registers.
+ fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet;
+
+ /// Compute the stack layout and insert prologue and epilogue code into `func`.
+ ///
+ /// Return an error if the stack frame is too large.
+ fn prologue_epilogue(&self, func: &mut ir::Function) -> CodegenResult<()> {
+ let _tt = timing::prologue_epilogue();
+ // This default implementation is unlikely to be good enough.
+ use crate::ir::stackslot::{StackOffset, StackSize};
+ use crate::stack_layout::layout_stack;
+
+ let word_size = StackSize::from(self.pointer_bytes());
+
+ // Account for the SpiderMonkey standard prologue pushes.
+ if func.signature.call_conv.extends_baldrdash() {
+ let bytes = StackSize::from(self.flags().baldrdash_prologue_words()) * word_size;
+ let mut ss = ir::StackSlotData::new(ir::StackSlotKind::IncomingArg, bytes);
+ ss.offset = Some(-(bytes as StackOffset));
+ func.stack_slots.push(ss);
+ }
+
+ let is_leaf = func.is_leaf();
+ layout_stack(&mut func.stack_slots, is_leaf, word_size)?;
+ Ok(())
+ }
+
+ /// Emit binary machine code for a single instruction into the `sink` trait object.
+ ///
+ /// Note that this will call `put*` methods on the `sink` trait object via its vtable which
+ /// is not the fastest way of emitting code.
+ ///
+ /// This function is under the "testing_hooks" feature, and is only suitable for use by
+ /// test harnesses. It increases code size, and is inefficient.
+ #[cfg(feature = "testing_hooks")]
+ fn emit_inst(
+ &self,
+ func: &ir::Function,
+ inst: ir::Inst,
+ divert: &mut regalloc::RegDiversions,
+ sink: &mut dyn binemit::CodeSink,
+ );
+
+ /// Emit a whole function into memory.
+ fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut binemit::MemoryCodeSink);
+
+ /// IntCC condition for Unsigned Addition Overflow (Carry).
+ fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC;
+
+ /// IntCC condition for Unsigned Subtraction Overflow (Borrow/Carry).
+ fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC;
+
+ /// Creates unwind information for the function.
+ ///
+ /// Returns `None` if there is no unwind information for the function.
+ #[cfg(feature = "unwind")]
+ fn create_unwind_info(
+ &self,
+ _func: &ir::Function,
+ ) -> CodegenResult<Option<unwind::UnwindInfo>> {
+ // By default, an ISA has no unwind information
+ Ok(None)
+ }
+
+ /// Creates a new System V Common Information Entry for the ISA.
+ ///
+ /// Returns `None` if the ISA does not support System V unwind information.
+ #[cfg(feature = "unwind")]
+ fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+ // By default, an ISA cannot create a System V CIE
+ None
+ }
+
+ /// Get the new-style MachBackend, if this is an adapter around one.
+ fn get_mach_backend(&self) -> Option<&dyn MachBackend> {
+ None
+ }
+
+ /// Return an [Any] reference for downcasting to the ISA-specific implementation of this trait
+ /// with `isa.as_any().downcast_ref::<isa::foo::Isa>()`.
+ fn as_any(&self) -> &dyn Any;
+}
+
+impl Debug for &dyn TargetIsa {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ write!(
+ f,
+ "TargetIsa {{ triple: {:?}, pointer_width: {:?}}}",
+ self.triple(),
+ self.pointer_width()
+ )
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/registers.rs b/third_party/rust/cranelift-codegen/src/isa/registers.rs
new file mode 100644
index 0000000000..e67ae13453
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/registers.rs
@@ -0,0 +1,360 @@
+//! Data structures describing the registers in an ISA.
+
+use crate::entity::EntityRef;
+use core::fmt;
+
+/// Register units are the smallest units of register allocation.
+///
+/// Normally there is a 1-1 correspondence between registers and register units, but when an ISA
+/// has aliasing registers, the aliasing can be modeled with registers that cover multiple
+/// register units.
+///
+/// The register allocator will enforce that each register unit only gets used for one thing.
+pub type RegUnit = u16;
+
+/// A bit mask indexed by register classes.
+///
+/// The size of this type is determined by the ISA with the most register classes.
+pub type RegClassMask = u32;
+
+/// A bit mask indexed by register units.
+///
+/// The size of this type is determined by the target ISA that has the most register units defined.
+/// Currently that is arm32, which has 64+16 units.
+pub type RegUnitMask = [RegClassMask; 3];
+
+/// The register units in a target ISA are divided into disjoint register banks. Each bank covers a
+/// contiguous range of register units.
+///
+/// The `RegBank` struct provides a static description of a register bank.
+pub struct RegBank {
+ /// The name of this register bank as defined in the ISA's DSL definition.
+ pub name: &'static str,
+
+ /// The first register unit in this bank.
+ pub first_unit: RegUnit,
+
+ /// The total number of register units in this bank.
+ pub units: RegUnit,
+
+ /// Array of specially named register units. This array can be shorter than the number of units
+ /// in the bank.
+ pub names: &'static [&'static str],
+
+ /// Name prefix to use for those register units in the bank not covered by the `names` array.
+ /// The remaining register units will be named this prefix followed by their decimal offset in
+ /// the bank. So with a prefix `r`, registers will be named `r8`, `r9`, ...
+ pub prefix: &'static str,
+
+ /// Index of the first top-level register class in this bank.
+ pub first_toprc: usize,
+
+ /// Number of top-level register classes in this bank.
+ ///
+ /// The top-level register classes in a bank are guaranteed to be numbered sequentially from
+ /// `first_toprc`, and all top-level register classes across banks come before any sub-classes.
+ pub num_toprcs: usize,
+
+ /// Is register pressure tracking enabled for this bank?
+ pub pressure_tracking: bool,
+}
+
+impl RegBank {
+ /// Does this bank contain `regunit`?
+ fn contains(&self, regunit: RegUnit) -> bool {
+ regunit >= self.first_unit && regunit - self.first_unit < self.units
+ }
+
+ /// Try to parse a regunit name. The name is not expected to begin with `%`.
+ fn parse_regunit(&self, name: &str) -> Option<RegUnit> {
+ match self.names.iter().position(|&x| x == name) {
+ Some(offset) => {
+ // This is one of the special-cased names.
+ Some(offset as RegUnit)
+ }
+ None => {
+ // Try a regular prefixed name.
+ if name.starts_with(self.prefix) {
+ name[self.prefix.len()..].parse().ok()
+ } else {
+ None
+ }
+ }
+ }
+ .and_then(|offset| {
+ if offset < self.units {
+ Some(offset + self.first_unit)
+ } else {
+ None
+ }
+ })
+ }
+
+ /// Write `regunit` to `w`, assuming that it belongs to this bank.
+ /// All regunits are written with a `%` prefix.
+ fn write_regunit(&self, f: &mut fmt::Formatter, regunit: RegUnit) -> fmt::Result {
+ let offset = regunit - self.first_unit;
+ assert!(offset < self.units);
+ if (offset as usize) < self.names.len() {
+ write!(f, "%{}", self.names[offset as usize])
+ } else {
+ write!(f, "%{}{}", self.prefix, offset)
+ }
+ }
+}
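+
+// Illustrative sketch (bank shape assumed, not taken from the generated tables): the prefixed-name
+// fallback used by `parse_regunit` above, for a bank with prefix "r" and 16 units.
+#[test]
+fn regunit_prefix_parse_sketch() {
+    fn parse(name: &str) -> Option<u16> {
+        name.strip_prefix('r')?.parse().ok().filter(|&n| n < 16)
+    }
+    assert_eq!(parse("r5"), Some(5));
+    assert_eq!(parse("r16"), None); // out of range for this bank
+    assert_eq!(parse("x5"), None); // wrong prefix
+}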
+
+/// A register class reference.
+///
+/// All register classes are statically defined in tables generated from the meta descriptions.
+pub type RegClass = &'static RegClassData;
+
+/// Data about a register class.
+///
+/// A register class represents a subset of the registers in a bank. It describes the set of
+/// permitted registers for a register operand in a given encoding of an instruction.
+///
+/// A register class can be a subset of another register class. The top-level register classes are
+/// disjoint.
+pub struct RegClassData {
+ /// The name of the register class.
+ pub name: &'static str,
+
+ /// The index of this class in the ISA's RegInfo description.
+ pub index: u8,
+
+ /// How many register units to allocate per register.
+ pub width: u8,
+
+ /// Index of the register bank this class belongs to.
+ pub bank: u8,
+
+    /// Index of the top-level register class containing this one.
+ pub toprc: u8,
+
+ /// The first register unit in this class.
+ pub first: RegUnit,
+
+ /// Bit-mask of sub-classes of this register class, including itself.
+ ///
+ /// Bits correspond to RC indexes.
+ pub subclasses: RegClassMask,
+
+ /// Mask of register units in the class. If `width > 1`, the mask only has a bit set for the
+ /// first register unit in each allocatable register.
+ pub mask: RegUnitMask,
+
+ /// The global `RegInfo` instance containing this register class.
+ pub info: &'static RegInfo,
+
+ /// The "pinned" register of the associated register bank.
+ ///
+ /// This register must be non-volatile (callee-preserved) and must not be the fixed
+ /// output register of any instruction.
+ pub pinned_reg: Option<RegUnit>,
+}
+
+impl RegClassData {
+ /// Get the register class index corresponding to the intersection of `self` and `other`.
+ ///
+ /// This register class is guaranteed to exist if the register classes overlap. If the register
+ /// classes don't overlap, returns `None`.
+ pub fn intersect_index(&self, other: RegClass) -> Option<RegClassIndex> {
+ // Compute the set of common subclasses.
+ let mask = self.subclasses & other.subclasses;
+
+ if mask == 0 {
+ // No overlap.
+ None
+ } else {
+ // Register class indexes are topologically ordered, so the largest common subclass has
+ // the smallest index.
+ Some(RegClassIndex(mask.trailing_zeros() as u8))
+ }
+ }
+
+ /// Get the intersection of `self` and `other`.
+ pub fn intersect(&self, other: RegClass) -> Option<RegClass> {
+ self.intersect_index(other).map(|rci| self.info.rc(rci))
+ }
+
+ /// Returns true if `other` is a subclass of this register class.
+ /// A register class is considered to be a subclass of itself.
+ pub fn has_subclass<RCI: Into<RegClassIndex>>(&self, other: RCI) -> bool {
+ self.subclasses & (1 << other.into().0) as u32 != 0
+ }
+
+ /// Get the top-level register class containing this class.
+ pub fn toprc(&self) -> RegClass {
+ self.info.rc(RegClassIndex(self.toprc))
+ }
+
+ /// Get a specific register unit in this class.
+ pub fn unit(&self, offset: usize) -> RegUnit {
+ let uoffset = offset * usize::from(self.width);
+ self.first + uoffset as RegUnit
+ }
+
+ /// Does this register class contain `regunit`?
+ pub fn contains(&self, regunit: RegUnit) -> bool {
+ self.mask[(regunit / 32) as usize] & (1u32 << (regunit % 32) as u32) != 0
+ }
+
+ /// If the pinned register is used, is the given regunit the pinned register of this class?
+ #[inline]
+ pub fn is_pinned_reg(&self, enabled: bool, regunit: RegUnit) -> bool {
+ enabled
+ && self
+ .pinned_reg
+ .map_or(false, |pinned_reg| pinned_reg == regunit)
+ }
+
+ /// Calculate the index of the register inside the class.
+ pub fn index_of(&self, regunit: RegUnit) -> u16 {
+ assert!(
+ self.contains(regunit),
+ "the {} register class does not contain {}",
+ self.name,
+ regunit
+ );
+ regunit - self.first
+ }
+}
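+
+// Illustrative sketch with hand-picked masks (not generated ISA data): the mask arithmetic behind
+// `intersect_index`. Class indexes are topologically ordered, so the lowest set bit of the
+// intersection names the largest common subclass.
+#[test]
+fn subclass_mask_intersection_sketch() {
+    let a: RegClassMask = 0b1101; // subclasses {0, 2, 3}
+    let b: RegClassMask = 0b1010; // subclasses {1, 3}
+    let common = a & b;
+    assert_eq!(common, 0b1000);
+    assert_eq!(common.trailing_zeros(), 3); // the common subclass has index 3
+    assert_eq!(0b0001 & 0b0100, 0); // disjoint classes share no subclass
+}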
+
+impl fmt::Display for RegClassData {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.write_str(self.name)
+ }
+}
+
+impl fmt::Debug for RegClassData {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.write_str(self.name)
+ }
+}
+
+/// Within an ISA, register classes are uniquely identified by their index.
+impl PartialEq for RegClassData {
+ fn eq(&self, other: &Self) -> bool {
+ self.index == other.index
+ }
+}
+
+/// A small reference to a register class.
+///
+/// Use this when storing register classes in compact data structures. The `RegInfo::rc()` method
+/// can be used to get the real register class reference back.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct RegClassIndex(u8);
+
+impl EntityRef for RegClassIndex {
+ fn new(idx: usize) -> Self {
+ Self(idx as u8)
+ }
+
+ fn index(self) -> usize {
+ usize::from(self.0)
+ }
+}
+
+impl From<RegClass> for RegClassIndex {
+ fn from(rc: RegClass) -> Self {
+ Self(rc.index)
+ }
+}
+
+impl fmt::Display for RegClassIndex {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "rci{}", self.0)
+ }
+}
+
+/// Test whether two registers overlap.
+///
+/// A register is identified as a `(RegClass, RegUnit)` pair. The register class is needed to
+/// determine the width (in regunits) of the register.
+pub fn regs_overlap(rc1: RegClass, reg1: RegUnit, rc2: RegClass, reg2: RegUnit) -> bool {
+ let end1 = reg1 + RegUnit::from(rc1.width);
+ let end2 = reg2 + RegUnit::from(rc2.width);
+ !(end1 <= reg2 || end2 <= reg1)
+}
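+
+// Standalone sketch of the interval test above: a register is the half-open unit range
+// `[unit, unit + width)`, so two registers overlap exactly when the ranges intersect.
+#[test]
+fn regs_overlap_sketch() {
+    fn overlap(unit1: RegUnit, width1: RegUnit, unit2: RegUnit, width2: RegUnit) -> bool {
+        !(unit1 + width1 <= unit2 || unit2 + width2 <= unit1)
+    }
+    assert!(overlap(0, 2, 1, 1)); // a double-width register covers unit 1
+    assert!(!overlap(0, 1, 1, 1)); // adjacent single-unit registers do not overlap
+}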
+
+/// Information about the registers in an ISA.
+///
+/// The `RegInfo` data structure collects all relevant static information about the registers in an
+/// ISA.
+#[derive(Clone)]
+pub struct RegInfo {
+ /// All register banks, ordered by their `first_unit`. The register banks are disjoint, but
+ /// there may be holes of unused register unit numbers between banks due to alignment.
+ pub banks: &'static [RegBank],
+
+ /// All register classes ordered topologically so a sub-class always follows its parent.
+ pub classes: &'static [RegClass],
+}
+
+impl RegInfo {
+ /// Get the register bank holding `regunit`.
+ pub fn bank_containing_regunit(&self, regunit: RegUnit) -> Option<&RegBank> {
+ // We could do a binary search, but most ISAs have only two register banks...
+ self.banks.iter().find(|b| b.contains(regunit))
+ }
+
+ /// Try to parse a regunit name. The name is not expected to begin with `%`.
+ pub fn parse_regunit(&self, name: &str) -> Option<RegUnit> {
+ self.banks
+ .iter()
+ .filter_map(|b| b.parse_regunit(name))
+ .next()
+ }
+
+ /// Make a temporary object that can display a register unit.
+ pub fn display_regunit(&self, regunit: RegUnit) -> DisplayRegUnit {
+ DisplayRegUnit {
+ regunit,
+ reginfo: self,
+ }
+ }
+
+ /// Get the register class corresponding to `idx`.
+ pub fn rc(&self, idx: RegClassIndex) -> RegClass {
+ self.classes[idx.index()]
+ }
+
+ /// Get the top-level register class containing the `idx` class.
+ pub fn toprc(&self, idx: RegClassIndex) -> RegClass {
+ self.classes[self.rc(idx).toprc as usize]
+ }
+}
+
+/// Temporary object that holds enough information to print a register unit.
+pub struct DisplayRegUnit<'a> {
+ regunit: RegUnit,
+ reginfo: &'a RegInfo,
+}
+
+impl<'a> fmt::Display for DisplayRegUnit<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match self.reginfo.bank_containing_regunit(self.regunit) {
+ Some(b) => b.write_regunit(f, self.regunit),
+ None => write!(f, "%INVALID{}", self.regunit),
+ }
+ }
+}
+
+#[test]
+fn assert_sizes() {
+ use cranelift_codegen_shared::constants;
+ use std::mem::size_of;
+
+ // In these tests, size_of returns number of bytes: we actually want the number of bits, so
+ // multiply these by 8.
+ assert!(
+ (size_of::<RegClassMask>() * 8) <= constants::MAX_NUM_REG_CLASSES,
+ "need to bump MAX_NUM_REG_CLASSES or change RegClassMask type"
+ );
+
+ assert!(
+ constants::MAX_NUM_REG_CLASSES < (1 << (size_of::<RegClassIndex>() * 8)),
+ "need to change RegClassIndex's type to a wider type"
+ );
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs
new file mode 100644
index 0000000000..44c5f36afe
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs
@@ -0,0 +1,149 @@
+//! RISC-V ABI implementation.
+//!
+//! This module implements the RISC-V calling convention through the primary `legalize_signature()`
+//! entry point.
+//!
+//! This doesn't support the soft-float ABI at the moment.
+
+use super::registers::{FPR, GPR};
+use super::settings;
+use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
+use crate::ir::{self, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, Type};
+use crate::isa::RegClass;
+use crate::regalloc::RegisterSet;
+use alloc::borrow::Cow;
+use core::i32;
+use target_lexicon::Triple;
+
+struct Args {
+ pointer_bits: u8,
+ pointer_bytes: u8,
+ pointer_type: Type,
+ regs: u32,
+ reg_limit: u32,
+ offset: u32,
+}
+
+impl Args {
+ fn new(bits: u8, enable_e: bool) -> Self {
+ Self {
+ pointer_bits: bits,
+ pointer_bytes: bits / 8,
+ pointer_type: Type::int(u16::from(bits)).unwrap(),
+ regs: 0,
+ reg_limit: if enable_e { 6 } else { 8 },
+ offset: 0,
+ }
+ }
+}
+
+impl ArgAssigner for Args {
+ fn assign(&mut self, arg: &AbiParam) -> ArgAction {
+ fn align(value: u32, to: u32) -> u32 {
+ (value + to - 1) & !(to - 1)
+ }
+
+ let ty = arg.value_type;
+
+ // Check for a legal type.
+ // RISC-V doesn't have SIMD at all, so break all vectors down.
+ if ty.is_vector() {
+ return ValueConversion::VectorSplit.into();
+ }
+
+ // Large integers and booleans are broken down to fit in a register.
+ if !ty.is_float() && ty.bits() > u16::from(self.pointer_bits) {
+ // Align registers and stack to a multiple of two pointers.
+ self.regs = align(self.regs, 2);
+ self.offset = align(self.offset, 2 * u32::from(self.pointer_bytes));
+ return ValueConversion::IntSplit.into();
+ }
+
+ // Small integers are extended to the size of a pointer register.
+ if ty.is_int() && ty.bits() < u16::from(self.pointer_bits) {
+ match arg.extension {
+ ArgumentExtension::None => {}
+ ArgumentExtension::Uext => return ValueConversion::Uext(self.pointer_type).into(),
+ ArgumentExtension::Sext => return ValueConversion::Sext(self.pointer_type).into(),
+ }
+ }
+
+ if self.regs < self.reg_limit {
+ // Assign to a register.
+ let reg = if ty.is_float() {
+ FPR.unit(10 + self.regs as usize)
+ } else {
+ GPR.unit(10 + self.regs as usize)
+ };
+ self.regs += 1;
+ ArgumentLoc::Reg(reg).into()
+ } else {
+ // Assign a stack location.
+ let loc = ArgumentLoc::Stack(self.offset as i32);
+ self.offset += u32::from(self.pointer_bytes);
+ debug_assert!(self.offset <= i32::MAX as u32);
+ loc.into()
+ }
+ }
+}
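+
+// Minimal sketch of the round-up helper used in `assign` above: `(value + to - 1) & !(to - 1)`
+// rounds `value` up to the next multiple of the power of two `to`.
+#[test]
+fn align_round_up_sketch() {
+    fn align(value: u32, to: u32) -> u32 {
+        (value + to - 1) & !(to - 1)
+    }
+    assert_eq!(align(0, 8), 0);
+    assert_eq!(align(1, 8), 8);
+    assert_eq!(align(8, 8), 8);
+    assert_eq!(align(9, 8), 16);
+}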
+
+/// Legalize `sig` for RISC-V.
+pub fn legalize_signature(
+ sig: &mut Cow<ir::Signature>,
+ triple: &Triple,
+ isa_flags: &settings::Flags,
+ current: bool,
+) {
+ let bits = triple.pointer_width().unwrap().bits();
+
+ let mut args = Args::new(bits, isa_flags.enable_e());
+ if let Some(new_params) = legalize_args(&sig.params, &mut args) {
+ sig.to_mut().params = new_params;
+ }
+
+ let mut rets = Args::new(bits, isa_flags.enable_e());
+ if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
+ sig.to_mut().returns = new_returns;
+ }
+
+ if current {
+ let ptr = Type::int(u16::from(bits)).unwrap();
+
+ // Add the link register as an argument and return value.
+ //
+ // The `jalr` instruction implementing a return can technically accept the return address
+ // in any register, but a micro-architecture with a return address predictor will only
+ // recognize it as a return if the address is in `x1`.
+ let link = AbiParam::special_reg(ptr, ArgumentPurpose::Link, GPR.unit(1));
+ sig.to_mut().params.push(link);
+ sig.to_mut().returns.push(link);
+ }
+}
+
+/// Get register class for a type appearing in a legalized signature.
+pub fn regclass_for_abi_type(ty: Type) -> RegClass {
+ if ty.is_float() {
+ FPR
+ } else {
+ GPR
+ }
+}
+
+pub fn allocatable_registers(_func: &ir::Function, isa_flags: &settings::Flags) -> RegisterSet {
+ let mut regs = RegisterSet::new();
+ regs.take(GPR, GPR.unit(0)); // Hard-wired 0.
+    // %x1 is the link register, which is available for allocation.
+ regs.take(GPR, GPR.unit(2)); // Stack pointer.
+ regs.take(GPR, GPR.unit(3)); // Global pointer.
+ regs.take(GPR, GPR.unit(4)); // Thread pointer.
+ // TODO: %x8 is the frame pointer. Reserve it?
+
+ // Remove %x16 and up for RV32E.
+ if isa_flags.enable_e() {
+ for u in 16..32 {
+ regs.take(GPR, GPR.unit(u));
+ }
+ }
+
+ regs
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs
new file mode 100644
index 0000000000..a1d2b82e12
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs
@@ -0,0 +1,182 @@
+//! Emitting binary RISC-V machine code.
+
+use crate::binemit::{bad_encoding, CodeSink, Reloc};
+use crate::ir::{Function, Inst, InstructionData};
+use crate::isa::{RegUnit, StackBaseMask, StackRef, TargetIsa};
+use crate::predicates::is_signed_int;
+use crate::regalloc::RegDiversions;
+use core::u32;
+
+include!(concat!(env!("OUT_DIR"), "/binemit-riscv.rs"));
+
+/// R-type instructions.
+///
+/// 31 24 19 14 11 6
+/// funct7 rs2 rs1 funct3 rd opcode
+/// 25 20 15 12 7 0
+///
+/// Encoding bits: `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`.
+fn put_r<CS: CodeSink + ?Sized>(bits: u16, rs1: RegUnit, rs2: RegUnit, rd: RegUnit, sink: &mut CS) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let funct3 = (bits >> 5) & 0x7;
+ let funct7 = (bits >> 8) & 0x7f;
+ let rs1 = u32::from(rs1) & 0x1f;
+ let rs2 = u32::from(rs2) & 0x1f;
+ let rd = u32::from(rd) & 0x1f;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= rd << 7;
+ i |= funct3 << 12;
+ i |= rs1 << 15;
+ i |= rs2 << 20;
+ i |= funct7 << 25;
+
+ sink.put4(i);
+}
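+
+// Worked example of the field packing above (values hand-assembled, not read from the generated
+// encoding tables): `add x3, x1, x2` has funct7=0, funct3=0 and opcode 0b0110011.
+#[test]
+fn rtype_packing_sketch() {
+    let (funct7, rs2, rs1, funct3, rd, opcode) = (0u32, 2u32, 1u32, 0u32, 3u32, 0b011_0011u32);
+    let word = (funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | opcode;
+    assert_eq!(word, 0x0020_81b3);
+}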
+
+/// R-type instructions with a shift amount instead of rs2.
+///
+/// 31 25 19 14 11 6
+/// funct7 shamt rs1 funct3 rd opcode
+/// 25 20 15 12 7 0
+///
+/// The funct7 and shamt fields overlap at bit 25: in RV64, shamt is 6 bits wide and uses bit 25
+/// for shift amounts greater than 31.
+///
+/// Encoding bits: `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`.
+fn put_rshamt<CS: CodeSink + ?Sized>(
+ bits: u16,
+ rs1: RegUnit,
+ shamt: i64,
+ rd: RegUnit,
+ sink: &mut CS,
+) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let funct3 = (bits >> 5) & 0x7;
+ let funct7 = (bits >> 8) & 0x7f;
+ let rs1 = u32::from(rs1) & 0x1f;
+ let shamt = shamt as u32 & 0x3f;
+ let rd = u32::from(rd) & 0x1f;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= rd << 7;
+ i |= funct3 << 12;
+ i |= rs1 << 15;
+ i |= shamt << 20;
+ i |= funct7 << 25;
+
+ sink.put4(i);
+}
+
+/// I-type instructions.
+///
+/// 31 19 14 11 6
+/// imm rs1 funct3 rd opcode
+/// 20 15 12 7 0
+///
+/// Encoding bits: `opcode[6:2] | (funct3 << 5)`
+fn put_i<CS: CodeSink + ?Sized>(bits: u16, rs1: RegUnit, imm: i64, rd: RegUnit, sink: &mut CS) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let funct3 = (bits >> 5) & 0x7;
+ let rs1 = u32::from(rs1) & 0x1f;
+ let rd = u32::from(rd) & 0x1f;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= rd << 7;
+ i |= funct3 << 12;
+ i |= rs1 << 15;
+ i |= (imm << 20) as u32;
+
+ sink.put4(i);
+}
+
+/// U-type instructions.
+///
+/// 31 11 6
+/// imm rd opcode
+/// 12 7 0
+///
+/// Encoding bits: `opcode[6:2] | (funct3 << 5)`
+fn put_u<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rd: RegUnit, sink: &mut CS) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let rd = u32::from(rd) & 0x1f;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= rd << 7;
+ i |= imm as u32 & 0xfffff000;
+
+ sink.put4(i);
+}
+
+/// SB-type branch instructions.
+///
+/// 31 24 19 14 11 6
+/// imm rs2 rs1 funct3 imm opcode
+/// 25 20 15 12 7 0
+///
+/// Encoding bits: `opcode[6:2] | (funct3 << 5)`
+fn put_sb<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rs1: RegUnit, rs2: RegUnit, sink: &mut CS) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let funct3 = (bits >> 5) & 0x7;
+ let rs1 = u32::from(rs1) & 0x1f;
+ let rs2 = u32::from(rs2) & 0x1f;
+
+ debug_assert!(is_signed_int(imm, 13, 1), "SB out of range {:#x}", imm);
+ let imm = imm as u32;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= funct3 << 12;
+ i |= rs1 << 15;
+ i |= rs2 << 20;
+
+    // The displacement bits are scattered across the instruction word.
+ i |= ((imm >> 11) & 0x1) << 7;
+ i |= ((imm >> 1) & 0xf) << 8;
+ i |= ((imm >> 5) & 0x3f) << 25;
+ i |= ((imm >> 12) & 0x1) << 31;
+
+ sink.put4(i);
+}
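+
+// Worked example of the displacement scatter above (hand-assembled): `beq x1, x2, 8` places
+// imm[4:1] in bits 8..=11 and imm[11], imm[10:5], imm[12] in bits 7, 25..=30 and 31.
+#[test]
+fn sb_displacement_scatter_sketch() {
+    let (imm, rs1, rs2, funct3, opcode) = (8u32, 1u32, 2u32, 0u32, 0b110_0011u32);
+    let word = opcode
+        | (((imm >> 11) & 0x1) << 7)
+        | (((imm >> 1) & 0xf) << 8)
+        | (funct3 << 12)
+        | (rs1 << 15)
+        | (rs2 << 20)
+        | (((imm >> 5) & 0x3f) << 25)
+        | (((imm >> 12) & 0x1) << 31);
+    assert_eq!(word, 0x0020_8463);
+}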
+
+/// UJ-type jump instructions.
+///
+/// 31 11 6
+/// imm rd opcode
+/// 12 7 0
+///
+/// Encoding bits: `opcode[6:2]`
+fn put_uj<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rd: RegUnit, sink: &mut CS) {
+ let bits = u32::from(bits);
+ let opcode5 = bits & 0x1f;
+ let rd = u32::from(rd) & 0x1f;
+
+ debug_assert!(is_signed_int(imm, 21, 1), "UJ out of range {:#x}", imm);
+ let imm = imm as u32;
+
+ // 0-6: opcode
+ let mut i = 0x3;
+ i |= opcode5 << 2;
+ i |= rd << 7;
+
+    // The displacement bits are scattered across the instruction word.
+ i |= imm & 0xff000;
+ i |= ((imm >> 11) & 0x1) << 20;
+ i |= ((imm >> 1) & 0x3ff) << 21;
+ i |= ((imm >> 20) & 0x1) << 31;
+
+ sink.put4(i);
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs
new file mode 100644
index 0000000000..76184ad727
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs
@@ -0,0 +1,18 @@
+//! Encoding tables for RISC-V.
+
+use super::registers::*;
+use crate::ir;
+use crate::isa;
+use crate::isa::constraints::*;
+use crate::isa::enc_tables::*;
+use crate::isa::encoding::{base_size, RecipeSizing};
+use crate::predicates;
+
+// Include the generated encoding tables:
+// - `LEVEL1_RV32`
+// - `LEVEL1_RV64`
+// - `LEVEL2`
+// - `ENCLIST`
+// - `INFO`
+include!(concat!(env!("OUT_DIR"), "/encoding-riscv.rs"));
+include!(concat!(env!("OUT_DIR"), "/legalize-riscv.rs"));
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs
new file mode 100644
index 0000000000..e69a3a0e12
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs
@@ -0,0 +1,295 @@
+//! RISC-V Instruction Set Architecture.
+
+mod abi;
+mod binemit;
+mod enc_tables;
+mod registers;
+pub mod settings;
+
+use super::super::settings as shared_settings;
+#[cfg(feature = "testing_hooks")]
+use crate::binemit::CodeSink;
+use crate::binemit::{emit_function, MemoryCodeSink};
+use crate::ir;
+use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encodings};
+use crate::isa::Builder as IsaBuilder;
+use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
+use crate::regalloc;
+use alloc::borrow::Cow;
+use alloc::boxed::Box;
+use core::any::Any;
+use core::fmt;
+use target_lexicon::{PointerWidth, Triple};
+
+#[allow(dead_code)]
+struct Isa {
+ triple: Triple,
+ shared_flags: shared_settings::Flags,
+ isa_flags: settings::Flags,
+ cpumode: &'static [shared_enc_tables::Level1Entry<u16>],
+}
+
+/// Get an ISA builder for creating RISC-V targets.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+ IsaBuilder {
+ triple,
+ setup: settings::builder(),
+ constructor: isa_constructor,
+ }
+}
+
+fn isa_constructor(
+ triple: Triple,
+ shared_flags: shared_settings::Flags,
+ builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+ let level1 = match triple.pointer_width().unwrap() {
+        PointerWidth::U16 => panic!("16-bit RISC-V is not supported"),
+ PointerWidth::U32 => &enc_tables::LEVEL1_RV32[..],
+ PointerWidth::U64 => &enc_tables::LEVEL1_RV64[..],
+ };
+ Box::new(Isa {
+ triple,
+ isa_flags: settings::Flags::new(&shared_flags, builder),
+ shared_flags,
+ cpumode: level1,
+ })
+}
+
+impl TargetIsa for Isa {
+ fn name(&self) -> &'static str {
+ "riscv"
+ }
+
+ fn triple(&self) -> &Triple {
+ &self.triple
+ }
+
+ fn flags(&self) -> &shared_settings::Flags {
+ &self.shared_flags
+ }
+
+ fn register_info(&self) -> RegInfo {
+ registers::INFO.clone()
+ }
+
+ fn encoding_info(&self) -> EncInfo {
+ enc_tables::INFO.clone()
+ }
+
+ fn legal_encodings<'a>(
+ &'a self,
+ func: &'a ir::Function,
+ inst: &'a ir::InstructionData,
+ ctrl_typevar: ir::Type,
+ ) -> Encodings<'a> {
+ lookup_enclist(
+ ctrl_typevar,
+ inst,
+ func,
+ self.cpumode,
+ &enc_tables::LEVEL2[..],
+ &enc_tables::ENCLISTS[..],
+ &enc_tables::LEGALIZE_ACTIONS[..],
+ &enc_tables::RECIPE_PREDICATES[..],
+ &enc_tables::INST_PREDICATES[..],
+ self.isa_flags.predicate_view(),
+ )
+ }
+
+ fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
+ abi::legalize_signature(sig, &self.triple, &self.isa_flags, current)
+ }
+
+ fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass {
+ abi::regclass_for_abi_type(ty)
+ }
+
+ fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet {
+ abi::allocatable_registers(func, &self.isa_flags)
+ }
+
+ #[cfg(feature = "testing_hooks")]
+ fn emit_inst(
+ &self,
+ func: &ir::Function,
+ inst: ir::Inst,
+ divert: &mut regalloc::RegDiversions,
+ sink: &mut dyn CodeSink,
+ ) {
+ binemit::emit_inst(func, inst, divert, sink, self)
+ }
+
+ fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) {
+ emit_function(func, binemit::emit_inst, sink, self)
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC {
+ unimplemented!()
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
+ unimplemented!()
+ }
+
+ fn as_any(&self) -> &dyn Any {
+ self as &dyn Any
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::ir::{immediates, types};
+ use crate::ir::{Function, InstructionData, Opcode};
+ use crate::isa;
+ use crate::settings::{self, Configurable};
+ use alloc::string::{String, ToString};
+ use core::str::FromStr;
+ use target_lexicon::triple;
+
+ fn encstr(isa: &dyn isa::TargetIsa, enc: Result<isa::Encoding, isa::Legalize>) -> String {
+ match enc {
+ Ok(e) => isa.encoding_info().display(e).to_string(),
+ Err(_) => "no encoding".to_string(),
+ }
+ }
+
+ #[test]
+ fn test_64bitenc() {
+ let shared_builder = settings::builder();
+ let shared_flags = settings::Flags::new(shared_builder);
+ let isa = isa::lookup(triple!("riscv64"))
+ .unwrap()
+ .finish(shared_flags);
+
+ let mut func = Function::new();
+ let block = func.dfg.make_block();
+ let arg64 = func.dfg.append_block_param(block, types::I64);
+ let arg32 = func.dfg.append_block_param(block, types::I32);
+
+ // Try to encode iadd_imm.i64 v1, -10.
+ let inst64 = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg64,
+ imm: immediates::Imm64::new(-10),
+ };
+
+ // ADDI is I/0b00100
+ assert_eq!(
+ encstr(&*isa, isa.encode(&func, &inst64, types::I64)),
+ "Ii#04"
+ );
+
+ // Try to encode iadd_imm.i64 v1, -10000.
+ let inst64_large = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg64,
+ imm: immediates::Imm64::new(-10000),
+ };
+
+ // Immediate is out of range for ADDI.
+ assert!(isa.encode(&func, &inst64_large, types::I64).is_err());
+
+ // Create an iadd_imm.i32 which is encodable in RV64.
+ let inst32 = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg32,
+ imm: immediates::Imm64::new(10),
+ };
+
+ // ADDIW is I/0b00110
+ assert_eq!(
+ encstr(&*isa, isa.encode(&func, &inst32, types::I32)),
+ "Ii#06"
+ );
+ }
+
+ // Same as above, but for RV32.
+ #[test]
+ fn test_32bitenc() {
+ let shared_builder = settings::builder();
+ let shared_flags = settings::Flags::new(shared_builder);
+ let isa = isa::lookup(triple!("riscv32"))
+ .unwrap()
+ .finish(shared_flags);
+
+ let mut func = Function::new();
+ let block = func.dfg.make_block();
+ let arg64 = func.dfg.append_block_param(block, types::I64);
+ let arg32 = func.dfg.append_block_param(block, types::I32);
+
+ // Try to encode iadd_imm.i64 v1, -10.
+ let inst64 = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg64,
+ imm: immediates::Imm64::new(-10),
+ };
+
+        // In 32-bit mode, an i64 add should be narrowed first.
+ assert!(isa.encode(&func, &inst64, types::I64).is_err());
+
+ // Try to encode iadd_imm.i64 v1, -10000.
+ let inst64_large = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg64,
+ imm: immediates::Imm64::new(-10000),
+ };
+
+        // In 32-bit mode, an i64 add should be narrowed first.
+ assert!(isa.encode(&func, &inst64_large, types::I64).is_err());
+
+ // Create an iadd_imm.i32 which is encodable in RV32.
+ let inst32 = InstructionData::BinaryImm64 {
+ opcode: Opcode::IaddImm,
+ arg: arg32,
+ imm: immediates::Imm64::new(10),
+ };
+
+ // ADDI is I/0b00100
+ assert_eq!(
+ encstr(&*isa, isa.encode(&func, &inst32, types::I32)),
+ "Ii#04"
+ );
+
+ // Create an imul.i32 which is encodable in RV32, but only when use_m is true.
+ let mul32 = InstructionData::Binary {
+ opcode: Opcode::Imul,
+ args: [arg32, arg32],
+ };
+
+ assert!(isa.encode(&func, &mul32, types::I32).is_err());
+ }
+
+ #[test]
+ fn test_rv32m() {
+ let shared_builder = settings::builder();
+ let shared_flags = settings::Flags::new(shared_builder);
+
+        // Set the supports_m setting, which in turn enables the use_m predicate that unlocks
+        // encodings for imul.
+ let mut isa_builder = isa::lookup(triple!("riscv32")).unwrap();
+ isa_builder.enable("supports_m").unwrap();
+
+ let isa = isa_builder.finish(shared_flags);
+
+ let mut func = Function::new();
+ let block = func.dfg.make_block();
+ let arg32 = func.dfg.append_block_param(block, types::I32);
+
+ // Create an imul.i32 which is encodable in RV32M.
+ let mul32 = InstructionData::Binary {
+ opcode: Opcode::Imul,
+ args: [arg32, arg32],
+ };
+ assert_eq!(
+ encstr(&*isa, isa.encode(&func, &mul32, types::I32)),
+ "R#10c"
+ );
+ }
+}
+
+impl fmt::Display for Isa {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "{}\n{}", self.shared_flags, self.isa_flags)
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs
new file mode 100644
index 0000000000..9043b7f65f
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs
@@ -0,0 +1,50 @@
+//! RISC-V register descriptions.
+
+use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit};
+
+include!(concat!(env!("OUT_DIR"), "/registers-riscv.rs"));
+
+#[cfg(test)]
+mod tests {
+ use super::{FPR, GPR, INFO};
+ use crate::isa::RegUnit;
+ use alloc::string::{String, ToString};
+
+ #[test]
+ fn unit_encodings() {
+ assert_eq!(INFO.parse_regunit("x0"), Some(0));
+ assert_eq!(INFO.parse_regunit("x31"), Some(31));
+ assert_eq!(INFO.parse_regunit("f0"), Some(32));
+ assert_eq!(INFO.parse_regunit("f31"), Some(63));
+
+ assert_eq!(INFO.parse_regunit("x32"), None);
+ assert_eq!(INFO.parse_regunit("f32"), None);
+ }
+
+ #[test]
+ fn unit_names() {
+ fn uname(ru: RegUnit) -> String {
+ INFO.display_regunit(ru).to_string()
+ }
+
+ assert_eq!(uname(0), "%x0");
+ assert_eq!(uname(1), "%x1");
+ assert_eq!(uname(31), "%x31");
+ assert_eq!(uname(32), "%f0");
+ assert_eq!(uname(33), "%f1");
+ assert_eq!(uname(63), "%f31");
+ assert_eq!(uname(64), "%INVALID64");
+ }
+
+ #[test]
+ fn classes() {
+ assert!(GPR.contains(GPR.unit(0)));
+ assert!(GPR.contains(GPR.unit(31)));
+ assert!(!FPR.contains(GPR.unit(0)));
+ assert!(!FPR.contains(GPR.unit(31)));
+ assert!(!GPR.contains(FPR.unit(0)));
+ assert!(!GPR.contains(FPR.unit(31)));
+ assert!(FPR.contains(FPR.unit(0)));
+ assert!(FPR.contains(FPR.unit(31)));
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs
new file mode 100644
index 0000000000..40aa3bed2b
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs
@@ -0,0 +1,56 @@
+//! RISC-V Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/riscv/mod.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-riscv.rs"));
+
+#[cfg(test)]
+mod tests {
+ use super::{builder, Flags};
+ use crate::settings::{self, Configurable};
+ use alloc::string::ToString;
+
+ #[test]
+ fn display_default() {
+ let shared = settings::Flags::new(settings::builder());
+ let b = builder();
+ let f = Flags::new(&shared, b);
+ assert_eq!(
+ f.to_string(),
+ "[riscv]\n\
+ supports_m = false\n\
+ supports_a = false\n\
+ supports_f = false\n\
+ supports_d = false\n\
+ enable_m = true\n\
+ enable_e = false\n"
+ );
+ // Predicates are not part of the Display output.
+ assert_eq!(f.full_float(), false);
+ }
+
+ #[test]
+ fn predicates() {
+ let mut sb = settings::builder();
+ sb.set("enable_simd", "true").unwrap();
+ let shared = settings::Flags::new(sb);
+ let mut b = builder();
+ b.enable("supports_f").unwrap();
+ b.enable("supports_d").unwrap();
+ let f = Flags::new(&shared, b);
+ assert_eq!(f.full_float(), true);
+
+ let mut sb = settings::builder();
+ sb.set("enable_simd", "false").unwrap();
+ let shared = settings::Flags::new(sb);
+ let mut b = builder();
+ b.enable("supports_f").unwrap();
+ b.enable("supports_d").unwrap();
+ let f = Flags::new(&shared, b);
+ assert_eq!(f.full_float(), false);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/stack.rs b/third_party/rust/cranelift-codegen/src/isa/stack.rs
new file mode 100644
index 0000000000..ae093bed28
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/stack.rs
@@ -0,0 +1,95 @@
+//! Low-level details of stack accesses.
+//!
+//! The `ir::StackSlots` type deals with stack slots and stack frame layout. The `StackRef` type
+//! defined in this module expresses the low-level details of accessing a stack slot from an
+//! encoded instruction.
+
+use crate::ir::stackslot::{StackOffset, StackSlotKind, StackSlots};
+use crate::ir::StackSlot;
+
+/// A method for referencing a stack slot in the current stack frame.
+///
+/// Stack slots are addressed with a constant offset from a base register. The base can be the
+/// stack pointer, the frame pointer, or (in the future) a zone register pointing to an inner zone
+/// of a large stack frame.
+#[derive(Clone, Copy, Debug)]
+pub struct StackRef {
+ /// The base register to use for addressing.
+ pub base: StackBase,
+
+ /// Immediate offset from the base register to the first byte of the stack slot.
+ pub offset: StackOffset,
+}
+
+impl StackRef {
+ /// Get a reference to the stack slot `ss` using one of the base pointers in `mask`.
+ pub fn masked(ss: StackSlot, mask: StackBaseMask, frame: &StackSlots) -> Option<Self> {
+ // Try an SP-relative reference.
+ if mask.contains(StackBase::SP) {
+ return Some(Self::sp(ss, frame));
+ }
+
+ // No reference possible with this mask.
+ None
+ }
+
+ /// Get a reference to `ss` using the stack pointer as a base.
+ pub fn sp(ss: StackSlot, frame: &StackSlots) -> Self {
+ let size = frame
+ .layout_info
+ .expect("Stack layout must be computed before referencing stack slots")
+ .frame_size;
+ let slot = &frame[ss];
+ let offset = if slot.kind == StackSlotKind::OutgoingArg {
+ // Outgoing argument slots have offsets relative to our stack pointer.
+ slot.offset.unwrap()
+ } else {
+ // All other slots have offsets relative to our caller's stack frame.
+ // Offset where SP is pointing. (All ISAs have stacks growing downwards.)
+ let sp_offset = -(size as StackOffset);
+ slot.offset.unwrap() - sp_offset
+ };
+ Self {
+ base: StackBase::SP,
+ offset,
+ }
+ }
+}
+
+/// Generic base register for referencing stack slots.
+///
+/// Most ISAs have a stack pointer and an optional frame pointer, so provide generic names for
+/// those two base pointers.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum StackBase {
+ /// Use the stack pointer.
+ SP = 0,
+
+ /// Use the frame pointer (if one is present).
+ FP = 1,
+
+ /// Use an explicit zone pointer in a general-purpose register.
+ ///
+ /// This feature is not yet implemented.
+ Zone = 2,
+}
+
+/// Bit mask of supported stack bases.
+///
+/// Many instruction encodings can use different base registers while others only work with the
+/// stack pointer, say. A `StackBaseMask` is a bit mask of supported stack bases for a given
+/// instruction encoding.
+///
+/// This behaves like a set of `StackBase` variants.
+///
+/// The internal representation as a `u8` is public because stack base masks are used in constant
+/// tables generated from the meta-language encoding definitions.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct StackBaseMask(pub u8);
+
+impl StackBaseMask {
+ /// Check if this mask contains the `base` variant.
+ pub fn contains(self, base: StackBase) -> bool {
+ self.0 & (1 << base as usize) != 0
+ }
+}
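+
+// Minimal usage sketch: a mask with bits 0 and 1 set permits SP- and FP-relative addressing but
+// not the (unimplemented) zone base.
+#[test]
+fn stack_base_mask_sketch() {
+    let mask = StackBaseMask(0b011);
+    assert!(mask.contains(StackBase::SP));
+    assert!(mask.contains(StackBase::FP));
+    assert!(!mask.contains(StackBase::Zone));
+}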
diff --git a/third_party/rust/cranelift-codegen/src/isa/test_utils.rs b/third_party/rust/cranelift-codegen/src/isa/test_utils.rs
new file mode 100644
index 0000000000..01c500d6ca
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/test_utils.rs
@@ -0,0 +1,86 @@
+// This is unused when no platforms with the new backend are enabled.
+#![allow(dead_code)]
+
+use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc};
+use crate::ir::Value;
+use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode};
+use crate::isa::TargetIsa;
+
+use alloc::vec::Vec;
+use std::string::String;
+
+pub struct TestCodeSink {
+ bytes: Vec<u8>,
+}
+
+impl TestCodeSink {
+ /// Create a new TestCodeSink.
+ pub fn new() -> TestCodeSink {
+ TestCodeSink { bytes: vec![] }
+ }
+
+ /// Return the code emitted to this sink as a hex string.
+ pub fn stringify(&self) -> String {
+        // Not the fastest way to build a hex string, but it is fine for tests.
+ use std::fmt::Write;
+ let mut s = String::with_capacity(self.bytes.len() * 2);
+ for b in &self.bytes {
+ write!(&mut s, "{:02X}", b).unwrap();
+ }
+ s
+ }
+}
+
+impl CodeSink for TestCodeSink {
+ fn offset(&self) -> CodeOffset {
+ self.bytes.len() as CodeOffset
+ }
+
+ fn put1(&mut self, x: u8) {
+ self.bytes.push(x);
+ }
+
+ fn put2(&mut self, x: u16) {
+ self.bytes.push((x >> 0) as u8);
+ self.bytes.push((x >> 8) as u8);
+ }
+
+ fn put4(&mut self, mut x: u32) {
+ for _ in 0..4 {
+ self.bytes.push(x as u8);
+ x >>= 8;
+ }
+ }
+
+ fn put8(&mut self, mut x: u64) {
+ for _ in 0..8 {
+ self.bytes.push(x as u8);
+ x >>= 8;
+ }
+ }
+
+ fn reloc_external(
+ &mut self,
+ _srcloc: SourceLoc,
+ _rel: Reloc,
+ _name: &ExternalName,
+ _addend: Addend,
+ ) {
+ }
+
+ fn reloc_constant(&mut self, _rel: Reloc, _constant_offset: ConstantOffset) {}
+
+ fn reloc_jt(&mut self, _rel: Reloc, _jt: JumpTable) {}
+
+ fn trap(&mut self, _code: TrapCode, _srcloc: SourceLoc) {}
+
+ fn begin_jumptables(&mut self) {}
+
+ fn begin_rodata(&mut self) {}
+
+ fn end_codegen(&mut self) {}
+
+ fn add_stack_map(&mut self, _val_list: &[Value], _func: &Function, _isa: &dyn TargetIsa) {}
+
+ fn add_call_site(&mut self, _opcode: Opcode, _srcloc: SourceLoc) {}
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/unwind.rs
new file mode 100644
index 0000000000..a4c5f0b6b7
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/unwind.rs
@@ -0,0 +1,88 @@
+//! Represents information relating to function unwinding.
+#[cfg(feature = "enable-serde")]
+use serde::{Deserialize, Serialize};
+
+#[cfg(feature = "unwind")]
+pub mod systemv;
+
+#[cfg(feature = "unwind")]
+pub mod winx64;
+
+/// Represents unwind information for a single function.
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+#[non_exhaustive]
+pub enum UnwindInfo {
+ /// Windows x64 ABI unwind information.
+ #[cfg(feature = "unwind")]
+ WindowsX64(winx64::UnwindInfo),
+ /// System V ABI unwind information.
+ #[cfg(feature = "unwind")]
+ SystemV(systemv::UnwindInfo),
+}
+
+/// Intermediate representation for the unwind information
+/// generated by a backend.
+pub mod input {
+ use crate::binemit::CodeOffset;
+ use alloc::vec::Vec;
+ #[cfg(feature = "enable-serde")]
+ use serde::{Deserialize, Serialize};
+
+ /// Elementary operation in the unwind operations.
+ #[derive(Clone, Debug, PartialEq, Eq)]
+ #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+ pub enum UnwindCode<Reg> {
+ /// Defines that a register is saved at the specified offset.
+ SaveRegister {
+ /// The saved register.
+ reg: Reg,
+ /// The specified offset relative to the stack pointer.
+ stack_offset: u32,
+ },
+        /// Defines that a register holds the same value it had before the call.
+ RestoreRegister {
+ /// The restored register.
+ reg: Reg,
+ },
+ /// The stack pointer was adjusted to allocate the stack.
+ StackAlloc {
+ /// Size to allocate.
+ size: u32,
+ },
+ /// The stack pointer was adjusted to free the stack.
+ StackDealloc {
+ /// Size to deallocate.
+ size: u32,
+ },
+        /// An alternative register was assigned as the frame pointer base.
+ SetFramePointer {
+ /// The specified register.
+ reg: Reg,
+ },
+        /// Restores the frame pointer base to the default register.
+ RestoreFramePointer,
+ /// Saves the state.
+ RememberState,
+ /// Restores the state.
+ RestoreState,
+ }
+
+ /// Unwind information as generated by a backend.
+ #[derive(Clone, Debug, PartialEq, Eq)]
+ #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+ pub struct UnwindInfo<Reg> {
+ /// Size of the prologue.
+ pub prologue_size: CodeOffset,
+ /// Unwind codes for prologue.
+ pub prologue_unwind_codes: Vec<(CodeOffset, UnwindCode<Reg>)>,
+ /// Unwind codes for epilogues.
+ pub epilogues_unwind_codes: Vec<Vec<(CodeOffset, UnwindCode<Reg>)>>,
+ /// Entire function size.
+ pub function_size: CodeOffset,
+ /// Platform word size in bytes.
+ pub word_size: u8,
+ /// Initial stack pointer offset.
+ pub initial_sp_offset: u8,
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs
new file mode 100644
index 0000000000..dfb2ef5936
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs
@@ -0,0 +1,313 @@
+//! System V ABI unwind information.
+
+use crate::isa::unwind::input;
+use crate::result::{CodegenError, CodegenResult};
+use alloc::vec::Vec;
+use gimli::write::{Address, FrameDescriptionEntry};
+use thiserror::Error;
+
+#[cfg(feature = "enable-serde")]
+use serde::{Deserialize, Serialize};
+
+type Register = u16;
+
+/// Enumerate the errors possible in mapping Cranelift registers to their DWARF equivalent.
+#[allow(missing_docs)]
+#[derive(Error, Debug, PartialEq, Eq)]
+pub enum RegisterMappingError {
+ #[error("unable to find bank for register info")]
+ MissingBank,
+ #[error("register mapping is currently only implemented for x86_64")]
+ UnsupportedArchitecture,
+ #[error("unsupported register bank: {0}")]
+ UnsupportedRegisterBank(&'static str),
+}
+
+// This mirrors gimli's CallFrameInstruction, but is serializable
+// This excludes CfaExpression, Expression, ValExpression due to
+// https://github.com/gimli-rs/gimli/issues/513.
+// TODO: if gimli ever adds serialization support, remove this type
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+pub(crate) enum CallFrameInstruction {
+ Cfa(Register, i32),
+ CfaRegister(Register),
+ CfaOffset(i32),
+ Restore(Register),
+ Undefined(Register),
+ SameValue(Register),
+ Offset(Register, i32),
+ ValOffset(Register, i32),
+ Register(Register, Register),
+ RememberState,
+ RestoreState,
+ ArgsSize(u32),
+}
+
+impl From<gimli::write::CallFrameInstruction> for CallFrameInstruction {
+ fn from(cfi: gimli::write::CallFrameInstruction) -> Self {
+ use gimli::write::CallFrameInstruction;
+
+ match cfi {
+ CallFrameInstruction::Cfa(reg, offset) => Self::Cfa(reg.0, offset),
+ CallFrameInstruction::CfaRegister(reg) => Self::CfaRegister(reg.0),
+ CallFrameInstruction::CfaOffset(offset) => Self::CfaOffset(offset),
+ CallFrameInstruction::Restore(reg) => Self::Restore(reg.0),
+ CallFrameInstruction::Undefined(reg) => Self::Undefined(reg.0),
+ CallFrameInstruction::SameValue(reg) => Self::SameValue(reg.0),
+ CallFrameInstruction::Offset(reg, offset) => Self::Offset(reg.0, offset),
+ CallFrameInstruction::ValOffset(reg, offset) => Self::ValOffset(reg.0, offset),
+ CallFrameInstruction::Register(reg1, reg2) => Self::Register(reg1.0, reg2.0),
+ CallFrameInstruction::RememberState => Self::RememberState,
+ CallFrameInstruction::RestoreState => Self::RestoreState,
+ CallFrameInstruction::ArgsSize(size) => Self::ArgsSize(size),
+ _ => {
+ // Cranelift's unwind support does not generate `CallFrameInstruction`s with
+ // Expression at this moment, and it is not trivial to
+ // serialize such instructions.
+ panic!("CallFrameInstruction with Expression not supported");
+ }
+ }
+ }
+}
+
+impl Into<gimli::write::CallFrameInstruction> for CallFrameInstruction {
+ fn into(self) -> gimli::write::CallFrameInstruction {
+ use gimli::{write::CallFrameInstruction, Register};
+
+ match self {
+ Self::Cfa(reg, offset) => CallFrameInstruction::Cfa(Register(reg), offset),
+ Self::CfaRegister(reg) => CallFrameInstruction::CfaRegister(Register(reg)),
+ Self::CfaOffset(offset) => CallFrameInstruction::CfaOffset(offset),
+ Self::Restore(reg) => CallFrameInstruction::Restore(Register(reg)),
+ Self::Undefined(reg) => CallFrameInstruction::Undefined(Register(reg)),
+ Self::SameValue(reg) => CallFrameInstruction::SameValue(Register(reg)),
+ Self::Offset(reg, offset) => CallFrameInstruction::Offset(Register(reg), offset),
+ Self::ValOffset(reg, offset) => CallFrameInstruction::ValOffset(Register(reg), offset),
+ Self::Register(reg1, reg2) => {
+ CallFrameInstruction::Register(Register(reg1), Register(reg2))
+ }
+ Self::RememberState => CallFrameInstruction::RememberState,
+ Self::RestoreState => CallFrameInstruction::RestoreState,
+ Self::ArgsSize(size) => CallFrameInstruction::ArgsSize(size),
+ }
+ }
+}
+
+/// Maps UnwindInfo register to gimli's index space.
+pub(crate) trait RegisterMapper<Reg> {
+ /// Maps Reg.
+ fn map(&self, reg: Reg) -> Result<Register, RegisterMappingError>;
+ /// Gets stack pointer register.
+ fn sp(&self) -> Register;
+}
+
+/// Represents unwind information for a single System V ABI function.
+///
+/// This representation is not ISA specific.
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+pub struct UnwindInfo {
+ instructions: Vec<(u32, CallFrameInstruction)>,
+ len: u32,
+}
+
+impl UnwindInfo {
+ pub(crate) fn build<'b, Reg: PartialEq + Copy>(
+ unwind: input::UnwindInfo<Reg>,
+ map_reg: &'b dyn RegisterMapper<Reg>,
+ ) -> CodegenResult<Self> {
+ use input::UnwindCode;
+ let mut builder = InstructionBuilder::new(unwind.initial_sp_offset, map_reg);
+
+ for (offset, c) in unwind.prologue_unwind_codes.iter().chain(
+ unwind
+ .epilogues_unwind_codes
+ .iter()
+ .map(|c| c.iter())
+ .flatten(),
+ ) {
+ match c {
+ UnwindCode::SaveRegister { reg, stack_offset } => {
+ builder
+ .save_reg(*offset, *reg, *stack_offset)
+ .map_err(CodegenError::RegisterMappingError)?;
+ }
+ UnwindCode::StackAlloc { size } => {
+ builder.adjust_sp_down_imm(*offset, *size as i64);
+ }
+ UnwindCode::StackDealloc { size } => {
+ builder.adjust_sp_up_imm(*offset, *size as i64);
+ }
+ UnwindCode::RestoreRegister { reg } => {
+ builder
+ .restore_reg(*offset, *reg)
+ .map_err(CodegenError::RegisterMappingError)?;
+ }
+ UnwindCode::SetFramePointer { reg } => {
+ builder
+ .set_cfa_reg(*offset, *reg)
+ .map_err(CodegenError::RegisterMappingError)?;
+ }
+ UnwindCode::RestoreFramePointer => {
+ builder.restore_cfa(*offset);
+ }
+ UnwindCode::RememberState => {
+ builder.remember_state(*offset);
+ }
+ UnwindCode::RestoreState => {
+ builder.restore_state(*offset);
+ }
+ }
+ }
+
+ let instructions = builder.instructions;
+ let len = unwind.function_size;
+
+ Ok(Self { instructions, len })
+ }
+
+ /// Converts the unwind information into a `FrameDescriptionEntry`.
+ pub fn to_fde(&self, address: Address) -> gimli::write::FrameDescriptionEntry {
+ let mut fde = FrameDescriptionEntry::new(address, self.len);
+
+ for (offset, inst) in &self.instructions {
+ fde.add_instruction(*offset, inst.clone().into());
+ }
+
+ fde
+ }
+}
+
+struct InstructionBuilder<'a, Reg: PartialEq + Copy> {
+ sp_offset: i32,
+ frame_register: Option<Reg>,
+ saved_state: Option<(i32, Option<Reg>)>,
+ map_reg: &'a dyn RegisterMapper<Reg>,
+ instructions: Vec<(u32, CallFrameInstruction)>,
+}
+
+impl<'a, Reg: PartialEq + Copy> InstructionBuilder<'a, Reg> {
+ fn new(sp_offset: u8, map_reg: &'a (dyn RegisterMapper<Reg> + 'a)) -> Self {
+ Self {
+ sp_offset: sp_offset as i32, // CFA offset starts at the specified offset to account for the return address on stack
+ saved_state: None,
+ frame_register: None,
+ map_reg,
+ instructions: Vec::new(),
+ }
+ }
+
+ fn save_reg(
+ &mut self,
+ offset: u32,
+ reg: Reg,
+ stack_offset: u32,
+ ) -> Result<(), RegisterMappingError> {
+ // Pushes in the prologue are register saves, so record an offset of the save
+ self.instructions.push((
+ offset,
+ CallFrameInstruction::Offset(
+ self.map_reg.map(reg)?,
+ stack_offset as i32 - self.sp_offset,
+ ),
+ ));
+
+ Ok(())
+ }
+
+ fn adjust_sp_down_imm(&mut self, offset: u32, imm: i64) {
+ assert!(imm <= core::u32::MAX as i64);
+
+ self.sp_offset += imm as i32;
+
+ // Don't adjust the CFA if we're using a frame pointer
+ if self.frame_register.is_some() {
+ return;
+ }
+
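+        // For example, assuming the initial offset of 8 passed to `new` (a pushed return
+        // address), a 32-byte allocation at code offset 4 records `(4, CfaOffset(40))`.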
+ self.instructions
+ .push((offset, CallFrameInstruction::CfaOffset(self.sp_offset)));
+ }
+
+ fn adjust_sp_up_imm(&mut self, offset: u32, imm: i64) {
+ assert!(imm <= core::u32::MAX as i64);
+
+ self.sp_offset -= imm as i32;
+
+ // Don't adjust the CFA if we're using a frame pointer
+ if self.frame_register.is_some() {
+ return;
+ }
+
+ let cfa_inst_ofs = {
+            // Scan backwards for a CFA instruction at the same code offset to merge with.
+ let mut it = self.instructions.iter_mut();
+ loop {
+ match it.next_back() {
+ Some((i_offset, i)) if *i_offset == offset => {
+ if let CallFrameInstruction::Cfa(_, o) = i {
+ break Some(o);
+ }
+ }
+ _ => {
+ break None;
+ }
+ }
+ }
+ };
+
+ if let Some(o) = cfa_inst_ofs {
+ // Update previous CFA instruction.
+ *o = self.sp_offset;
+ } else {
+            // Otherwise, add a new CFA offset instruction.
+ self.instructions
+ .push((offset, CallFrameInstruction::CfaOffset(self.sp_offset)));
+ }
+ }
+
+ fn set_cfa_reg(&mut self, offset: u32, reg: Reg) -> Result<(), RegisterMappingError> {
+ self.instructions.push((
+ offset,
+ CallFrameInstruction::CfaRegister(self.map_reg.map(reg)?),
+ ));
+ self.frame_register = Some(reg);
+ Ok(())
+ }
+
+ fn restore_cfa(&mut self, offset: u32) {
+ // Restore SP and its offset.
+ self.instructions.push((
+ offset,
+ CallFrameInstruction::Cfa(self.map_reg.sp(), self.sp_offset),
+ ));
+ self.frame_register = None;
+ }
+
+ fn restore_reg(&mut self, offset: u32, reg: Reg) -> Result<(), RegisterMappingError> {
+ // Pops in the epilogue are register restores, so record a "same value" for the register
+ self.instructions.push((
+ offset,
+ CallFrameInstruction::SameValue(self.map_reg.map(reg)?),
+ ));
+
+ Ok(())
+ }
+
+ fn remember_state(&mut self, offset: u32) {
+ self.saved_state = Some((self.sp_offset, self.frame_register));
+
+ self.instructions
+ .push((offset, CallFrameInstruction::RememberState));
+ }
+
+ fn restore_state(&mut self, offset: u32) {
+ let (sp_offset, frame_register) = self.saved_state.take().unwrap();
+ self.sp_offset = sp_offset;
+ self.frame_register = frame_register;
+
+ self.instructions
+ .push((offset, CallFrameInstruction::RestoreState));
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs b/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs
new file mode 100644
index 0000000000..b3c21fc473
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs
@@ -0,0 +1,294 @@
+//! Windows x64 ABI unwind information.
+
+use crate::isa::{unwind::input, RegUnit};
+use crate::result::{CodegenError, CodegenResult};
+use alloc::vec::Vec;
+use byteorder::{ByteOrder, LittleEndian};
+use log::warn;
+#[cfg(feature = "enable-serde")]
+use serde::{Deserialize, Serialize};
+
+/// Maximum (inclusive) size of a "small" stack allocation
+const SMALL_ALLOC_MAX_SIZE: u32 = 128;
+/// Maximum (inclusive) size of a "large" stack allocation that can represented in 16-bits
+const LARGE_ALLOC_16BIT_MAX_SIZE: u32 = 524280;
+
+struct Writer<'a> {
+ buf: &'a mut [u8],
+ offset: usize,
+}
+
+impl<'a> Writer<'a> {
+ pub fn new(buf: &'a mut [u8]) -> Self {
+ Self { buf, offset: 0 }
+ }
+
+ fn write_u8(&mut self, v: u8) {
+ self.buf[self.offset] = v;
+ self.offset += 1;
+ }
+
+ fn write_u16<T: ByteOrder>(&mut self, v: u16) {
+ T::write_u16(&mut self.buf[self.offset..(self.offset + 2)], v);
+ self.offset += 2;
+ }
+
+ fn write_u32<T: ByteOrder>(&mut self, v: u32) {
+ T::write_u32(&mut self.buf[self.offset..(self.offset + 4)], v);
+ self.offset += 4;
+ }
+}
+
+/// The supported unwind codes for the x64 Windows ABI.
+///
+/// See: https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64
+/// Only what is needed to describe the prologues generated by the Cranelift x86 ISA is represented here.
+/// Note: the Cranelift x86 ISA RU enum matches the Windows unwind GPR encoding values.
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+pub(crate) enum UnwindCode {
+ PushRegister {
+ offset: u8,
+ reg: u8,
+ },
+ SaveXmm {
+ offset: u8,
+ reg: u8,
+ stack_offset: u32,
+ },
+ StackAlloc {
+ offset: u8,
+ size: u32,
+ },
+}
+
+impl UnwindCode {
+ fn emit(&self, writer: &mut Writer) {
+ enum UnwindOperation {
+ PushNonvolatileRegister = 0,
+ LargeStackAlloc = 1,
+ SmallStackAlloc = 2,
+ SaveXmm128 = 8,
+ SaveXmm128Far = 9,
+ }
+
+ match self {
+ Self::PushRegister { offset, reg } => {
+ writer.write_u8(*offset);
+ writer.write_u8((*reg << 4) | (UnwindOperation::PushNonvolatileRegister as u8));
+ }
+ Self::SaveXmm {
+ offset,
+ reg,
+ stack_offset,
+ } => {
+ writer.write_u8(*offset);
+ let scaled_stack_offset = stack_offset / 16;
+ if scaled_stack_offset <= core::u16::MAX as u32 {
+ writer.write_u8((*reg << 4) | (UnwindOperation::SaveXmm128 as u8));
+ writer.write_u16::<LittleEndian>(scaled_stack_offset as u16);
+ } else {
+ writer.write_u8((*reg << 4) | (UnwindOperation::SaveXmm128Far as u8));
+ writer.write_u16::<LittleEndian>(*stack_offset as u16);
+ writer.write_u16::<LittleEndian>((stack_offset >> 16) as u16);
+ }
+ }
+ Self::StackAlloc { offset, size } => {
+ // Stack allocations on Windows must be a multiple of 8 and be at least 1 slot
+ assert!(*size >= 8);
+ assert!((*size % 8) == 0);
+
+ writer.write_u8(*offset);
+ if *size <= SMALL_ALLOC_MAX_SIZE {
+ writer.write_u8(
+ ((((*size - 8) / 8) as u8) << 4) | UnwindOperation::SmallStackAlloc as u8,
+ );
+ } else if *size <= LARGE_ALLOC_16BIT_MAX_SIZE {
+ writer.write_u8(UnwindOperation::LargeStackAlloc as u8);
+ writer.write_u16::<LittleEndian>((*size / 8) as u16);
+ } else {
+ writer.write_u8((1 << 4) | (UnwindOperation::LargeStackAlloc as u8));
+ writer.write_u32::<LittleEndian>(*size);
+ }
+ }
+ };
+ }
+
+ fn node_count(&self) -> usize {
+ match self {
+ Self::StackAlloc { size, .. } => {
+ if *size <= SMALL_ALLOC_MAX_SIZE {
+ 1
+ } else if *size <= LARGE_ALLOC_16BIT_MAX_SIZE {
+ 2
+ } else {
+ 3
+ }
+ }
+ Self::SaveXmm { stack_offset, .. } => {
+ if *stack_offset <= core::u16::MAX as u32 {
+ 2
+ } else {
+ 3
+ }
+ }
+ _ => 1,
+ }
+ }
+}
+
+pub(crate) enum MappedRegister {
+ Int(u8),
+ Xmm(u8),
+}
+
+/// Maps UnwindInfo register to Windows x64 unwind data.
+pub(crate) trait RegisterMapper {
+ /// Maps RegUnit.
+ fn map(reg: RegUnit) -> MappedRegister;
+}
+
+/// Represents Windows x64 unwind information.
+///
+/// For information about Windows x64 unwind info, see:
+/// https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
+pub struct UnwindInfo {
+ pub(crate) flags: u8,
+ pub(crate) prologue_size: u8,
+ pub(crate) frame_register: Option<u8>,
+ pub(crate) frame_register_offset: u8,
+ pub(crate) unwind_codes: Vec<UnwindCode>,
+}
+
+impl UnwindInfo {
+ /// Gets the emit size of the unwind information, in bytes.
+ pub fn emit_size(&self) -> usize {
+ let node_count = self.node_count();
+
+        // This size calculation assumes there is no SEH handler and no chained unwind info
+ assert!(self.flags == 0);
+
+ // Size of fixed part of UNWIND_INFO is 4 bytes
+ // Then comes the UNWIND_CODE nodes (2 bytes each)
+ // Then comes 2 bytes of padding for the unwind codes if necessary
+ // Next would come the SEH data, but we assert above that the function doesn't have SEH data
+
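+        // For example (illustrative), a prologue with one register push (1 node) and one
+        // small stack allocation (1 node) has node_count = 2, giving 4 + 2 * 2 + 0 = 8 bytes.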
+ 4 + (node_count * 2) + if (node_count & 1) == 1 { 2 } else { 0 }
+ }
+
+ /// Emits the unwind information into the given mutable byte slice.
+ ///
+ /// This function will panic if the slice is not at least `emit_size` in length.
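+    ///
+    /// A minimal usage sketch (illustrative; `info` stands for an already-built `UnwindInfo`):
+    ///
+    /// ```ignore
+    /// let mut buf = vec![0u8; info.emit_size()];
+    /// info.emit(&mut buf);
+    /// ```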
+ pub fn emit(&self, buf: &mut [u8]) {
+ const UNWIND_INFO_VERSION: u8 = 1;
+
+ let node_count = self.node_count();
+        // The node count is emitted below as a single byte.
+        assert!(node_count <= 255);
+
+ let mut writer = Writer::new(buf);
+
+ writer.write_u8((self.flags << 3) | UNWIND_INFO_VERSION);
+ writer.write_u8(self.prologue_size);
+ writer.write_u8(node_count as u8);
+
+ if let Some(reg) = self.frame_register {
+ writer.write_u8((self.frame_register_offset << 4) | reg);
+ } else {
+ writer.write_u8(0);
+ }
+
+ // Unwind codes are written in reverse order (prologue offset descending)
+ for code in self.unwind_codes.iter().rev() {
+ code.emit(&mut writer);
+ }
+
+ // To keep a 32-bit alignment, emit 2 bytes of padding if there's an odd number of 16-bit nodes
+ if (node_count & 1) == 1 {
+ writer.write_u16::<LittleEndian>(0);
+ }
+
+ // Ensure the correct number of bytes was emitted
+ assert_eq!(writer.offset, self.emit_size());
+ }
+
+ fn node_count(&self) -> usize {
+ self.unwind_codes
+ .iter()
+ .fold(0, |nodes, c| nodes + c.node_count())
+ }
+
+ pub(crate) fn build<MR: RegisterMapper>(
+ unwind: input::UnwindInfo<RegUnit>,
+ ) -> CodegenResult<Self> {
+ use crate::isa::unwind::input::UnwindCode as InputUnwindCode;
+
+ let word_size: u32 = unwind.word_size.into();
+ let mut unwind_codes = Vec::new();
+ for (offset, c) in unwind.prologue_unwind_codes.iter() {
+ match c {
+ InputUnwindCode::SaveRegister { reg, stack_offset } => {
+ let reg = MR::map(*reg);
+ let offset = ensure_unwind_offset(*offset)?;
+ match reg {
+ MappedRegister::Int(reg) => {
+ // Attempt to convert sequence of the `InputUnwindCode`:
+ // `StackAlloc { size = word_size }`, `SaveRegister { stack_offset: 0 }`
+ // to the shorter `UnwindCode::PushRegister`.
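+                            //
+                            // For example, a prologue register push (e.g. `push rbp`) typically
+                            // arrives here as `StackAlloc { size: 8 }` followed by
+                            // `SaveRegister { stack_offset: 0 }` at the same offset, and is
+                            // folded into a single `PushRegister` node.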
+ let push_reg_sequence = if let Some(UnwindCode::StackAlloc {
+ offset: alloc_offset,
+ size,
+ }) = unwind_codes.last()
+ {
+ *size == word_size && offset == *alloc_offset && *stack_offset == 0
+ } else {
+ false
+ };
+ if push_reg_sequence {
+ *unwind_codes.last_mut().unwrap() =
+ UnwindCode::PushRegister { offset, reg };
+ } else {
+ // TODO add `UnwindCode::SaveRegister` to handle multiple register
+ // pushes with single `UnwindCode::StackAlloc`.
+ return Err(CodegenError::Unsupported(
+ "Unsupported UnwindCode::PushRegister sequence".into(),
+ ));
+ }
+ }
+ MappedRegister::Xmm(reg) => {
+ unwind_codes.push(UnwindCode::SaveXmm {
+ offset,
+ reg,
+ stack_offset: *stack_offset,
+ });
+ }
+ }
+ }
+ InputUnwindCode::StackAlloc { size } => {
+ unwind_codes.push(UnwindCode::StackAlloc {
+ offset: ensure_unwind_offset(*offset)?,
+ size: *size,
+ });
+ }
+ _ => {}
+ }
+ }
+
+ Ok(Self {
+ flags: 0, // this assumes cranelift functions have no SEH handlers
+ prologue_size: ensure_unwind_offset(unwind.prologue_size)?,
+ frame_register: None,
+ frame_register_offset: 0,
+ unwind_codes,
+ })
+ }
+}
+
+fn ensure_unwind_offset(offset: u32) -> CodegenResult<u8> {
+ if offset > 255 {
+ warn!("function prologues cannot exceed 255 bytes in size for Windows x64");
+ return Err(CodegenError::CodeTooLarge);
+ }
+ Ok(offset as u8)
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs
new file mode 100644
index 0000000000..f4c7624f36
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs
@@ -0,0 +1,794 @@
+//! Implementation of the standard x64 ABI.
+
+use crate::ir::types::*;
+use crate::ir::{self, types, MemFlags, TrapCode, Type};
+use crate::isa;
+use crate::isa::{x64::inst::*, CallConv};
+use crate::machinst::abi_impl::*;
+use crate::machinst::*;
+use crate::settings;
+use crate::{CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use args::*;
+use regalloc::{RealReg, Reg, RegClass, Set, Writable};
+use smallvec::{smallvec, SmallVec};
+use std::convert::TryFrom;
+
+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
+/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0;
+/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8;
+
+/// Try to fill a Baldrdash register, returning it if it was found.
+fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option<ABIArg> {
+ if call_conv.extends_baldrdash() {
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext => {
+ // This is SpiderMonkey's `WasmTlsReg`.
+ Some(ABIArg::Reg(
+ regs::r14().to_real_reg(),
+ types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::SignatureId => {
+ // This is SpiderMonkey's `WasmTableCallSigReg`.
+ Some(ABIArg::Reg(
+ regs::r10().to_real_reg(),
+ types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CalleeTLS => {
+ // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLEE_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CallerTLS => {
+ // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLER_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ _ => None,
+ }
+ } else {
+ None
+ }
+}
+
+/// Support for the x64 ABI from the callee side (within a function body).
+pub(crate) type X64ABICallee = ABICalleeImpl<X64ABIMachineSpec>;
+
+/// Support for the x64 ABI from the caller side (at a callsite).
+pub(crate) type X64ABICaller = ABICallerImpl<X64ABIMachineSpec>;
+
+/// Implementation of ABI primitives for x64.
+pub(crate) struct X64ABIMachineSpec;
+
+impl ABIMachineSpec for X64ABIMachineSpec {
+ type I = Inst;
+
+ fn word_bits() -> u32 {
+ 64
+ }
+
+ /// Return required stack alignment in bytes.
+ fn stack_align(_call_conv: isa::CallConv) -> u32 {
+ 16
+ }
+
+ fn compute_arg_locs(
+ call_conv: isa::CallConv,
+ params: &[ir::AbiParam],
+ args_or_rets: ArgsOrRets,
+ add_ret_area_ptr: bool,
+ ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+ let is_baldrdash = call_conv.extends_baldrdash();
+ let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
+
+ let mut next_gpr = 0;
+ let mut next_vreg = 0;
+ let mut next_stack: u64 = 0;
+ let mut ret = vec![];
+
+ if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls {
+ // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and
+ // caller TLS-register values, respectively.
+ next_stack = 16;
+ }
+
+ for i in 0..params.len() {
+ // Process returns backward, according to the SpiderMonkey ABI (which we
+ // adopt internally if `is_baldrdash` is set).
+ let param = match (args_or_rets, is_baldrdash) {
+ (ArgsOrRets::Args, _) => &params[i],
+ (ArgsOrRets::Rets, false) => &params[i],
+ (ArgsOrRets::Rets, true) => &params[params.len() - 1 - i],
+ };
+
+ // Validate "purpose".
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext
+ | &ir::ArgumentPurpose::Normal
+ | &ir::ArgumentPurpose::StackLimit
+ | &ir::ArgumentPurpose::SignatureId
+ | &ir::ArgumentPurpose::CalleeTLS
+ | &ir::ArgumentPurpose::CallerTLS => {}
+ _ => panic!(
+ "Unsupported argument purpose {:?} in signature: {:?}",
+ param.purpose, params
+ ),
+ }
+
+ let intreg = in_int_reg(param.value_type);
+ let vecreg = in_vec_reg(param.value_type);
+ debug_assert!(intreg || vecreg);
+ debug_assert!(!(intreg && vecreg));
+
+ let (next_reg, candidate) = if intreg {
+ let candidate = match args_or_rets {
+ ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr),
+ ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i),
+ };
+ debug_assert!(candidate
+ .map(|r| r.get_class() == RegClass::I64)
+ .unwrap_or(true));
+ (&mut next_gpr, candidate)
+ } else {
+ let candidate = match args_or_rets {
+ ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg),
+ ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i),
+ };
+ debug_assert!(candidate
+ .map(|r| r.get_class() == RegClass::V128)
+ .unwrap_or(true));
+ (&mut next_vreg, candidate)
+ };
+
+ if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
+ assert!(intreg);
+ ret.push(param);
+ } else if let Some(reg) = candidate {
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ *next_reg += 1;
+ } else {
+ // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
+ // stack alignment happens separately after all args.)
+ let size = (param.value_type.bits() / 8) as u64;
+ let size = std::cmp::max(size, 8);
+ // Align.
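+                // (For example, next_stack = 20 with an 8-byte slot rounds up to 24,
+                // since (20 + 7) & !7 == 24.)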
+ debug_assert!(size.is_power_of_two());
+ next_stack = (next_stack + size - 1) & !(size - 1);
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ next_stack += size;
+ }
+ }
+
+ if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
+ ret.reverse();
+ }
+
+ let extra_arg = if add_ret_area_ptr {
+ debug_assert!(args_or_rets == ArgsOrRets::Args);
+ if let Some(reg) = get_intreg_for_arg_systemv(&call_conv, next_gpr) {
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ types::I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ } else {
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ types::I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ next_stack += 8;
+ }
+ Some(ret.len() - 1)
+ } else {
+ None
+ };
+
+ next_stack = (next_stack + 15) & !15;
+
+ // To avoid overflow issues, limit the arg/return size to something reasonable.
+ if next_stack > STACK_ARG_RET_SIZE_LIMIT {
+ return Err(CodegenError::ImplLimitExceeded);
+ }
+
+ Ok((ret, next_stack as i64, extra_arg))
+ }
+
+ fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 {
+ if call_conv.extends_baldrdash() {
+ let num_words = flags.baldrdash_prologue_words() as i64;
+ debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words");
+ num_words * 8
+ } else {
+ 16 // frame pointer + return address.
+ }
+ }
+
+ fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I {
+ let ext_kind = match ty {
+ types::B1
+ | types::B8
+ | types::I8
+ | types::B16
+ | types::I16
+ | types::B32
+ | types::I32 => ExtKind::SignExtend,
+ types::B64 | types::I64 | types::R64 | types::F32 | types::F64 => ExtKind::None,
+ _ if ty.bytes() == 16 => ExtKind::None,
+ _ => panic!("load_stack({})", ty),
+ };
+ Inst::load(ty, mem, into_reg, ext_kind)
+ }
+
+ fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I {
+ Inst::store(ty, from_reg, mem)
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I {
+ Inst::gen_move(to_reg, from_reg, ty)
+ }
+
+ /// Generate an integer-extend operation.
+ fn gen_extend(
+ to_reg: Writable<Reg>,
+ from_reg: Reg,
+ is_signed: bool,
+ from_bits: u8,
+ to_bits: u8,
+ ) -> Self::I {
+ let ext_mode = ExtMode::new(from_bits as u16, to_bits as u16)
+ .expect(&format!("invalid extension: {} -> {}", from_bits, to_bits));
+ if is_signed {
+ Inst::movsx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg)
+ } else {
+ Inst::movzx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg)
+ }
+ }
+
+ fn gen_ret() -> Self::I {
+ Inst::ret()
+ }
+
+ fn gen_epilogue_placeholder() -> Self::I {
+ Inst::epilogue_placeholder()
+ }
+
+ fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]> {
+ let mut ret = SmallVec::new();
+ if from_reg != into_reg.to_reg() {
+ ret.push(Inst::gen_move(into_reg, from_reg, I64));
+ }
+ ret.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(imm),
+ into_reg,
+ ));
+ ret
+ }
+
+ fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]> {
+ smallvec![
+ Inst::cmp_rmi_r(/* bytes = */ 8, RegMemImm::reg(regs::rsp()), limit_reg),
+ Inst::TrapIf {
+ // NBE == "> unsigned"; args above are reversed; this tests limit_reg > rsp.
+ cc: CC::NBE,
+ trap_code: TrapCode::StackOverflow,
+ },
+ ]
+ }
+
+ fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Self::I {
+ let mem: SyntheticAmode = mem.into();
+ Inst::lea(mem, into_reg)
+ }
+
+ fn get_stacklimit_reg() -> Reg {
+ debug_assert!(
+ !is_callee_save_systemv(regs::r10().to_real_reg())
+ && !is_callee_save_baldrdash(regs::r10().to_real_reg())
+ );
+
+ // As per comment on trait definition, we must return a caller-save
+ // register here.
+ regs::r10()
+ }
+
+ fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
+ // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed.
+ assert_eq!(ty, I64);
+ let simm32 = offset as u32;
+ let mem = Amode::imm_reg(simm32, base);
+ Inst::load(ty, mem, into_reg, ExtKind::None)
+ }
+
+ fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I {
+ let simm32 = offset as u32;
+ let mem = Amode::imm_reg(simm32, base);
+ Inst::store(ty, from_reg, mem)
+ }
+
+ fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]> {
+ let (alu_op, amount) = if amount >= 0 {
+ (AluRmiROpcode::Add, amount)
+ } else {
+ (AluRmiROpcode::Sub, -amount)
+ };
+
+ let amount = amount as u32;
+
+ smallvec![Inst::alu_rmi_r(
+ true,
+ alu_op,
+ RegMemImm::imm(amount),
+ Writable::from_reg(regs::rsp()),
+ )]
+ }
+
+ fn gen_nominal_sp_adj(offset: i32) -> Self::I {
+ Inst::VirtualSPOffsetAdj {
+ offset: offset as i64,
+ }
+ }
+
+ fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]> {
+ let r_rsp = regs::rsp();
+ let r_rbp = regs::rbp();
+ let w_rbp = Writable::from_reg(r_rbp);
+ let mut insts = SmallVec::new();
+ // RSP before the call will be 0 % 16. So here, it is 8 % 16.
+ insts.push(Inst::push64(RegMemImm::reg(r_rbp)));
+ // RSP is now 0 % 16
+ insts.push(Inst::mov_r_r(true, r_rsp, w_rbp));
+ insts
+ }
+
+ fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]> {
+ let mut insts = SmallVec::new();
+ insts.push(Inst::mov_r_r(
+ true,
+ regs::rbp(),
+ Writable::from_reg(regs::rsp()),
+ ));
+ insts.push(Inst::pop64(Writable::from_reg(regs::rbp())));
+ insts
+ }
+
+ fn gen_clobber_save(
+ call_conv: isa::CallConv,
+ _: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> (u64, SmallVec<[Self::I; 16]>) {
+ let mut insts = SmallVec::new();
+ // Find all clobbered registers that are callee-save. These are only I64
+ // registers (all XMM registers are caller-save) so we can compute the
+ // total size of the needed stack space easily.
+ let clobbered = get_callee_saves(&call_conv, clobbers);
+ let clobbered_size = 8 * clobbered.len() as u32;
+ let stack_size = clobbered_size + fixed_frame_storage_size;
+ // Align to 16 bytes.
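+        // (For example, three saved GPRs with no fixed-frame storage occupy 24 bytes,
+        // which rounds up to 32.)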
+ let stack_size = (stack_size + 15) & !15;
+ // Adjust the stack pointer downward with one `sub rsp, IMM`
+ // instruction.
+ if stack_size > 0 {
+ insts.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Sub,
+ RegMemImm::imm(stack_size),
+ Writable::from_reg(regs::rsp()),
+ ));
+ }
+ // Store each clobbered register in order at offsets from RSP.
+ let mut cur_offset = 0;
+ for reg in &clobbered {
+ let r_reg = reg.to_reg();
+ match r_reg.get_class() {
+ RegClass::I64 => {
+ insts.push(Inst::mov_r_m(
+ /* bytes = */ 8,
+ r_reg.to_reg(),
+ Amode::imm_reg(cur_offset, regs::rsp()),
+ ));
+ cur_offset += 8;
+ }
+ // No XMM regs are callee-save, so we do not need to implement
+ // this.
+ _ => unimplemented!(),
+ }
+ }
+
+ (clobbered_size as u64, insts)
+ }
+
+ fn gen_clobber_restore(
+ call_conv: isa::CallConv,
+ flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ _fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> SmallVec<[Self::I; 16]> {
+ let mut insts = SmallVec::new();
+
+ let clobbered = get_callee_saves(&call_conv, clobbers);
+ let stack_size = 8 * clobbered.len() as u32;
+ let stack_size = (stack_size + 15) & !15;
+
+ // Restore regs by loading from offsets of RSP.
+ let mut cur_offset = 0;
+ for reg in &clobbered {
+ let rreg = reg.to_reg();
+ match rreg.get_class() {
+ RegClass::I64 => {
+ insts.push(Inst::mov64_m_r(
+ Amode::imm_reg(cur_offset, regs::rsp()),
+ Writable::from_reg(rreg.to_reg()),
+ ));
+ cur_offset += 8;
+ }
+ _ => unimplemented!(),
+ }
+ }
+ // Adjust RSP back upward.
+ if stack_size > 0 {
+ insts.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(stack_size),
+ Writable::from_reg(regs::rsp()),
+ ));
+ }
+
+ // If this is Baldrdash-2020, restore the callee (i.e., our) TLS
+ // register. We may have allocated it for something else and clobbered
+ // it, but the ABI expects us to leave the TLS register unchanged.
+ if call_conv == isa::CallConv::Baldrdash2020 {
+ let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags);
+ insts.push(Inst::mov64_m_r(
+ Amode::imm_reg(off as u32, regs::rbp()),
+ Writable::from_reg(regs::r14()),
+ ));
+ }
+
+ insts
+ }
+
+ /// Generate a call instruction/sequence.
+ fn gen_call(
+ dest: &CallDest,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: ir::Opcode,
+ tmp: Writable<Reg>,
+ _callee_conv: isa::CallConv,
+ _caller_conv: isa::CallConv,
+ ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]> {
+ let mut insts = SmallVec::new();
+ match dest {
+ &CallDest::ExtName(ref name, RelocDistance::Near) => {
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_known(name.clone(), uses, defs, opcode),
+ ));
+ }
+ &CallDest::ExtName(ref name, RelocDistance::Far) => {
+ insts.push((
+ InstIsSafepoint::No,
+ Inst::LoadExtName {
+ dst: tmp,
+ name: Box::new(name.clone()),
+ offset: 0,
+ },
+ ));
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_unknown(RegMem::reg(tmp.to_reg()), uses, defs, opcode),
+ ));
+ }
+ &CallDest::Reg(reg) => {
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_unknown(RegMem::reg(reg), uses, defs, opcode),
+ ));
+ }
+ }
+ insts
+ }
+
+ fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 {
+ // We allocate in terms of 8-byte slots.
+ match (rc, ty) {
+ (RegClass::I64, _) => 1,
+ (RegClass::V128, types::F32) | (RegClass::V128, types::F64) => 1,
+ (RegClass::V128, _) => 2,
+ _ => panic!("Unexpected register class!"),
+ }
+ }
+
+ fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64 {
+ s.virtual_sp_offset
+ }
+
+ fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64 {
+ s.nominal_sp_to_fp
+ }
+
+ fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> {
+ let mut caller_saved = vec![
+ // Systemv calling convention:
+ // - GPR: all except RBX, RBP, R12 to R15 (which are callee-saved).
+ Writable::from_reg(regs::rsi()),
+ Writable::from_reg(regs::rdi()),
+ Writable::from_reg(regs::rax()),
+ Writable::from_reg(regs::rcx()),
+ Writable::from_reg(regs::rdx()),
+ Writable::from_reg(regs::r8()),
+ Writable::from_reg(regs::r9()),
+ Writable::from_reg(regs::r10()),
+ Writable::from_reg(regs::r11()),
+ // - XMM: all the registers!
+ Writable::from_reg(regs::xmm0()),
+ Writable::from_reg(regs::xmm1()),
+ Writable::from_reg(regs::xmm2()),
+ Writable::from_reg(regs::xmm3()),
+ Writable::from_reg(regs::xmm4()),
+ Writable::from_reg(regs::xmm5()),
+ Writable::from_reg(regs::xmm6()),
+ Writable::from_reg(regs::xmm7()),
+ Writable::from_reg(regs::xmm8()),
+ Writable::from_reg(regs::xmm9()),
+ Writable::from_reg(regs::xmm10()),
+ Writable::from_reg(regs::xmm11()),
+ Writable::from_reg(regs::xmm12()),
+ Writable::from_reg(regs::xmm13()),
+ Writable::from_reg(regs::xmm14()),
+ Writable::from_reg(regs::xmm15()),
+ ];
+
+ if call_conv_of_callee.extends_baldrdash() {
+ caller_saved.push(Writable::from_reg(regs::r12()));
+ caller_saved.push(Writable::from_reg(regs::r13()));
+ // Not r14; implicitly preserved in the entry.
+ caller_saved.push(Writable::from_reg(regs::r15()));
+ caller_saved.push(Writable::from_reg(regs::rbx()));
+ }
+
+ caller_saved
+ }
+}
+
+impl From<StackAMode> for SyntheticAmode {
+ fn from(amode: StackAMode) -> Self {
+ // We enforce a 128 MB stack-frame size limit above, so these
+ // `expect()`s should never fail.
+ match amode {
+ StackAMode::FPOffset(off, _ty) => {
+ let off = i32::try_from(off)
+ .expect("Offset in FPOffset is greater than 2GB; should hit impl limit first");
+ let simm32 = off as u32;
+ SyntheticAmode::Real(Amode::ImmReg {
+ simm32,
+ base: regs::rbp(),
+ flags: MemFlags::trusted(),
+ })
+ }
+ StackAMode::NominalSPOffset(off, _ty) => {
+ let off = i32::try_from(off).expect(
+ "Offset in NominalSPOffset is greater than 2GB; should hit impl limit first",
+ );
+ let simm32 = off as u32;
+ SyntheticAmode::nominal_sp_offset(simm32)
+ }
+ StackAMode::SPOffset(off, _ty) => {
+ let off = i32::try_from(off)
+ .expect("Offset in SPOffset is greater than 2GB; should hit impl limit first");
+ let simm32 = off as u32;
+ SyntheticAmode::Real(Amode::ImmReg {
+ simm32,
+ base: regs::rsp(),
+ flags: MemFlags::trusted(),
+ })
+ }
+ }
+ }
+}
+
+fn in_int_reg(ty: types::Type) -> bool {
+ match ty {
+ types::I8
+ | types::I16
+ | types::I32
+ | types::I64
+ | types::B1
+ | types::B8
+ | types::B16
+ | types::B32
+ | types::B64
+ | types::R64 => true,
+        types::R32 => panic!("unexpected 32-bit refs on x64!"),
+ _ => false,
+ }
+}
+
+fn in_vec_reg(ty: types::Type) -> bool {
+ match ty {
+ types::F32 | types::F64 => true,
+ _ if ty.is_vector() => true,
+ _ => false,
+ }
+}
+
+fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast
+ | CallConv::Cold
+ | CallConv::SystemV
+ | CallConv::BaldrdashSystemV
+ | CallConv::Baldrdash2020 => {}
+ _ => panic!("int args only supported for SysV calling convention"),
+ };
+ match idx {
+ 0 => Some(regs::rdi()),
+ 1 => Some(regs::rsi()),
+ 2 => Some(regs::rdx()),
+ 3 => Some(regs::rcx()),
+ 4 => Some(regs::r8()),
+ 5 => Some(regs::r9()),
+ _ => None,
+ }
+}
+
+fn get_fltreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast
+ | CallConv::Cold
+ | CallConv::SystemV
+ | CallConv::BaldrdashSystemV
+ | CallConv::Baldrdash2020 => {}
+ _ => panic!("float args only supported for SysV calling convention"),
+ };
+ match idx {
+ 0 => Some(regs::xmm0()),
+ 1 => Some(regs::xmm1()),
+ 2 => Some(regs::xmm2()),
+ 3 => Some(regs::xmm3()),
+ 4 => Some(regs::xmm4()),
+ 5 => Some(regs::xmm5()),
+ 6 => Some(regs::xmm6()),
+ 7 => Some(regs::xmm7()),
+ _ => None,
+ }
+}
+
+fn get_intreg_for_retval_systemv(
+ call_conv: &CallConv,
+ intreg_idx: usize,
+ retval_idx: usize,
+) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx {
+ 0 => Some(regs::rax()),
+ 1 => Some(regs::rdx()),
+ _ => None,
+ },
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+ if intreg_idx == 0 && retval_idx == 0 {
+ Some(regs::rax())
+ } else {
+ None
+ }
+ }
+ CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+ }
+}
+
+fn get_fltreg_for_retval_systemv(
+ call_conv: &CallConv,
+ fltreg_idx: usize,
+ retval_idx: usize,
+) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx {
+ 0 => Some(regs::xmm0()),
+ 1 => Some(regs::xmm1()),
+ _ => None,
+ },
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+ if fltreg_idx == 0 && retval_idx == 0 {
+ Some(regs::xmm0())
+ } else {
+ None
+ }
+ }
+ CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+ }
+}
+
+fn is_callee_save_systemv(r: RealReg) -> bool {
+ use regs::*;
+ match r.get_class() {
+ RegClass::I64 => match r.get_hw_encoding() as u8 {
+ ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true,
+ _ => false,
+ },
+ RegClass::V128 => false,
+ _ => unimplemented!(),
+ }
+}
+
+fn is_callee_save_baldrdash(r: RealReg) -> bool {
+ use regs::*;
+ match r.get_class() {
+ RegClass::I64 => {
+ if r.get_hw_encoding() as u8 == ENC_R14 {
+ // r14 is the WasmTlsReg and is preserved implicitly.
+ false
+ } else {
+ // Defer to native for the other ones.
+ is_callee_save_systemv(r)
+ }
+ }
+ RegClass::V128 => false,
+ _ => unimplemented!(),
+ }
+}
+
+fn get_callee_saves(call_conv: &CallConv, regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
+ let mut regs: Vec<Writable<RealReg>> = match call_conv {
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => regs
+ .iter()
+ .cloned()
+ .filter(|r| is_callee_save_baldrdash(r.to_reg()))
+ .collect(),
+ CallConv::BaldrdashWindows => {
+ todo!("baldrdash windows");
+ }
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs
+ .iter()
+ .cloned()
+ .filter(|r| is_callee_save_systemv(r.to_reg()))
+ .collect(),
+ CallConv::WindowsFastcall => todo!("windows fastcall"),
+ CallConv::Probestack => todo!("probestack?"),
+ };
+ // Sort registers for deterministic code output. We can do an unstable sort because the
+ // registers will be unique (there are no dups).
+ regs.sort_unstable_by_key(|r| r.to_reg().get_index());
+ regs
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs
new file mode 100644
index 0000000000..6a8f65feb3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs
@@ -0,0 +1,1215 @@
+//! Instruction operand sub-components (aka "parts"): definitions and printing.
+
+use super::regs::{self, show_ireg_sized};
+use super::EmitState;
+use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::MemFlags;
+use crate::machinst::*;
+use regalloc::{
+ PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
+ RegUsageMapper, Writable,
+};
+use std::fmt;
+use std::string::String;
+
+/// A possible addressing mode (amode) that can be used in instructions.
+/// These denote a 64-bit value only.
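+///
+/// For example (illustrative; `base` and `index` stand for arbitrary 64-bit registers),
+/// `Amode::imm_reg_reg_shift(8, base, index, 3)` denotes the address
+/// `8 + base + (index << 3)`, printed as `8(base,index,8)`.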
+#[derive(Clone, Debug)]
+pub enum Amode {
+    /// sign-extend-32-to-64(Immediate) + Register
+ ImmReg {
+ simm32: u32,
+ base: Reg,
+ flags: MemFlags,
+ },
+
+ /// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift)
+ ImmRegRegShift {
+ simm32: u32,
+ base: Reg,
+ index: Reg,
+ shift: u8, /* 0 .. 3 only */
+ flags: MemFlags,
+ },
+
+ /// sign-extend-32-to-64(Immediate) + RIP (instruction pointer).
+    /// To wit: not supported in 32-bit mode.
+ RipRelative { target: MachLabel },
+}
+
+impl Amode {
+ pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self {
+ debug_assert!(base.get_class() == RegClass::I64);
+ Self::ImmReg {
+ simm32,
+ base,
+ flags: MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self {
+ debug_assert!(base.get_class() == RegClass::I64);
+ debug_assert!(index.get_class() == RegClass::I64);
+ debug_assert!(shift <= 3);
+ Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ flags: MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn rip_relative(target: MachLabel) -> Self {
+ Self::RipRelative { target }
+ }
+
+ pub(crate) fn with_flags(&self, flags: MemFlags) -> Self {
+ match self {
+ &Self::ImmReg { simm32, base, .. } => Self::ImmReg {
+ simm32,
+ base,
+ flags,
+ },
+ &Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ ..
+ } => Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ flags,
+ },
+ _ => panic!("Amode {:?} cannot take memflags", self),
+ }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ Amode::ImmReg { base, .. } => {
+ collector.add_use(*base);
+ }
+ Amode::ImmRegRegShift { base, index, .. } => {
+ collector.add_use(*base);
+ collector.add_use(*index);
+ }
+ Amode::RipRelative { .. } => {
+ // RIP isn't involved in regalloc.
+ }
+ }
+ }
+
+ pub(crate) fn get_flags(&self) -> MemFlags {
+ match self {
+ Amode::ImmReg { flags, .. } => *flags,
+ Amode::ImmRegRegShift { flags, .. } => *flags,
+ Amode::RipRelative { .. } => MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn can_trap(&self) -> bool {
+ !self.get_flags().notrap()
+ }
+}
+
+impl PrettyPrint for Amode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ Amode::ImmReg { simm32, base, .. } => {
+ format!("{}({})", *simm32 as i32, base.show_rru(mb_rru))
+ }
+ Amode::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ ..
+ } => format!(
+ "{}({},{},{})",
+ *simm32 as i32,
+ base.show_rru(mb_rru),
+ index.show_rru(mb_rru),
+ 1 << shift
+ ),
+ Amode::RipRelative { ref target } => format!("label{}(%rip)", target.get()),
+ }
+ }
+}
+
+/// A Memory Address. These denote a 64-bit value only.
+/// Used for ordinary addressing modes as well as for addressing modes used during compilation,
+/// when the final SP offset is not yet known.
+#[derive(Clone)]
+pub enum SyntheticAmode {
+ /// A real amode.
+ Real(Amode),
+
+ /// A (virtual) offset to the "nominal SP" value, which will be recomputed as we push and pop
+ /// within the function.
+ NominalSPOffset { simm32: u32 },
+}
+
+impl SyntheticAmode {
+ pub(crate) fn nominal_sp_offset(simm32: u32) -> Self {
+ SyntheticAmode::NominalSPOffset { simm32 }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ SyntheticAmode::Real(addr) => addr.get_regs_as_uses(collector),
+ SyntheticAmode::NominalSPOffset { .. } => {
+ // Nothing to do; the base is SP and isn't involved in regalloc.
+ }
+ }
+ }
+
+ pub(crate) fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ SyntheticAmode::Real(addr) => addr.map_uses(map),
+ SyntheticAmode::NominalSPOffset { .. } => {
+ // Nothing to do.
+ }
+ }
+ }
+
+ pub(crate) fn finalize(&self, state: &mut EmitState) -> Amode {
+ match self {
+ SyntheticAmode::Real(addr) => addr.clone(),
+ SyntheticAmode::NominalSPOffset { simm32 } => {
+ let off = *simm32 as i64 + state.virtual_sp_offset;
+ // TODO will require a sequence of add etc.
+ assert!(
+ off <= u32::max_value() as i64,
+ "amode finalize: add sequence NYI"
+ );
+ Amode::imm_reg(off as u32, regs::rsp())
+ }
+ }
+ }
+}
+
+impl Into<SyntheticAmode> for Amode {
+ fn into(self) -> SyntheticAmode {
+ SyntheticAmode::Real(self)
+ }
+}
+
+impl PrettyPrint for SyntheticAmode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ SyntheticAmode::Real(addr) => addr.show_rru(mb_rru),
+ SyntheticAmode::NominalSPOffset { simm32 } => {
+ format!("rsp({} + virtual offset)", *simm32 as i32)
+ }
+ }
+ }
+}
+
+/// An operand which is either an integer Register, a value in Memory or an Immediate. This can
+/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only
+/// the lower 8 or 16 bits of `simm32` are relevant. In the 64-bit case, the value denoted by
+/// `simm32` is its sign-extension out to 64 bits.
+#[derive(Clone)]
+pub enum RegMemImm {
+ Reg { reg: Reg },
+ Mem { addr: SyntheticAmode },
+ Imm { simm32: u32 },
+}
+
+impl RegMemImm {
+ pub(crate) fn reg(reg: Reg) -> Self {
+ debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128);
+ Self::Reg { reg }
+ }
+ pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
+ Self::Mem { addr: addr.into() }
+ }
+ pub(crate) fn imm(simm32: u32) -> Self {
+ Self::Imm { simm32 }
+ }
+
+ /// Asserts that in register mode, the reg class is the one that's expected.
+ pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) {
+ if let Self::Reg { reg } = self {
+ debug_assert_eq!(reg.get_class(), expected_reg_class);
+ }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ Self::Reg { reg } => collector.add_use(*reg),
+ Self::Mem { addr } => addr.get_regs_as_uses(collector),
+ Self::Imm { .. } => {}
+ }
+ }
+
+ pub(crate) fn to_reg(&self) -> Option<Reg> {
+ match self {
+ Self::Reg { reg } => Some(*reg),
+ _ => None,
+ }
+ }
+}
+
+impl PrettyPrint for RegMemImm {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.show_rru_sized(mb_rru, 8)
+ }
+}
+
+impl PrettyPrintSized for RegMemImm {
+ fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ match self {
+ Self::Reg { reg } => show_ireg_sized(*reg, mb_rru, size),
+ Self::Mem { addr } => addr.show_rru(mb_rru),
+ Self::Imm { simm32 } => format!("${}", *simm32 as i32),
+ }
+ }
+}
+
+/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16,
+/// 32, 64, or 128 bit value.
+#[derive(Clone)]
+pub enum RegMem {
+ Reg { reg: Reg },
+ Mem { addr: SyntheticAmode },
+}
+
+impl RegMem {
+ pub(crate) fn reg(reg: Reg) -> Self {
+ debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128);
+ Self::Reg { reg }
+ }
+ pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
+ Self::Mem { addr: addr.into() }
+ }
+ /// Asserts that in register mode, the reg class is the one that's expected.
+ pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) {
+ if let Self::Reg { reg } = self {
+ debug_assert_eq!(reg.get_class(), expected_reg_class);
+ }
+ }
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ RegMem::Reg { reg } => collector.add_use(*reg),
+ RegMem::Mem { addr, .. } => addr.get_regs_as_uses(collector),
+ }
+ }
+ pub(crate) fn to_reg(&self) -> Option<Reg> {
+ match self {
+ RegMem::Reg { reg } => Some(*reg),
+ _ => None,
+ }
+ }
+}
+
+impl From<Writable<Reg>> for RegMem {
+ fn from(r: Writable<Reg>) -> Self {
+ RegMem::reg(r.to_reg())
+ }
+}
+
+impl PrettyPrint for RegMem {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.show_rru_sized(mb_rru, 8)
+ }
+}
+
+impl PrettyPrintSized for RegMem {
+ fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ match self {
+ RegMem::Reg { reg } => show_ireg_sized(*reg, mb_rru, size),
+ RegMem::Mem { addr, .. } => addr.show_rru(mb_rru),
+ }
+ }
+}
+
+/// Some basic ALU operations. TODO: maybe add Adc, Sbb.
+#[derive(Copy, Clone, PartialEq)]
+pub enum AluRmiROpcode {
+ Add,
+ Sub,
+ And,
+ Or,
+ Xor,
+ /// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
+ Mul,
+}
+
+impl fmt::Debug for AluRmiROpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ AluRmiROpcode::Add => "add",
+ AluRmiROpcode::Sub => "sub",
+ AluRmiROpcode::And => "and",
+ AluRmiROpcode::Or => "or",
+ AluRmiROpcode::Xor => "xor",
+ AluRmiROpcode::Mul => "imul",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for AluRmiROpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+#[derive(Clone, PartialEq)]
+pub enum UnaryRmROpcode {
+ /// Bit-scan reverse.
+ Bsr,
+ /// Bit-scan forward.
+ Bsf,
+}
+
+impl fmt::Debug for UnaryRmROpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ match self {
+ UnaryRmROpcode::Bsr => write!(fmt, "bsr"),
+ UnaryRmROpcode::Bsf => write!(fmt, "bsf"),
+ }
+ }
+}
+
+impl fmt::Display for UnaryRmROpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+pub(crate) enum InstructionSet {
+ SSE,
+ SSE2,
+ SSSE3,
+ SSE41,
+ SSE42,
+}
+
+/// Some SSE operations requiring 2 operands r/m and r.
+#[derive(Clone, Copy, PartialEq)]
+pub enum SseOpcode {
+ Addps,
+ Addpd,
+ Addss,
+ Addsd,
+ Andps,
+ Andpd,
+ Andnps,
+ Andnpd,
+ Comiss,
+ Comisd,
+ Cmpps,
+ Cmppd,
+ Cmpss,
+ Cmpsd,
+ Cvtdq2ps,
+ Cvtsd2ss,
+ Cvtsd2si,
+ Cvtsi2ss,
+ Cvtsi2sd,
+ Cvtss2si,
+ Cvtss2sd,
+ Cvttps2dq,
+ Cvttss2si,
+ Cvttsd2si,
+ Divps,
+ Divpd,
+ Divss,
+ Divsd,
+ Insertps,
+ Maxps,
+ Maxpd,
+ Maxss,
+ Maxsd,
+ Minps,
+ Minpd,
+ Minss,
+ Minsd,
+ Movaps,
+ Movapd,
+ Movd,
+ Movdqa,
+ Movdqu,
+ Movlhps,
+ Movmskps,
+ Movmskpd,
+ Movq,
+ Movss,
+ Movsd,
+ Movups,
+ Movupd,
+ Mulps,
+ Mulpd,
+ Mulss,
+ Mulsd,
+ Orps,
+ Orpd,
+ Pabsb,
+ Pabsw,
+ Pabsd,
+ Packsswb,
+ Paddb,
+ Paddd,
+ Paddq,
+ Paddw,
+ Paddsb,
+ Paddsw,
+ Paddusb,
+ Paddusw,
+ Pand,
+ Pandn,
+ Pavgb,
+ Pavgw,
+ Pcmpeqb,
+ Pcmpeqw,
+ Pcmpeqd,
+ Pcmpeqq,
+ Pcmpgtb,
+ Pcmpgtw,
+ Pcmpgtd,
+ Pcmpgtq,
+ Pextrb,
+ Pextrw,
+ Pextrd,
+ Pinsrb,
+ Pinsrw,
+ Pinsrd,
+ Pmaxsb,
+ Pmaxsw,
+ Pmaxsd,
+ Pmaxub,
+ Pmaxuw,
+ Pmaxud,
+ Pminsb,
+ Pminsw,
+ Pminsd,
+ Pminub,
+ Pminuw,
+ Pminud,
+ Pmovmskb,
+ Pmulld,
+ Pmullw,
+ Pmuludq,
+ Por,
+ Pshufb,
+ Pshufd,
+ Psllw,
+ Pslld,
+ Psllq,
+ Psraw,
+ Psrad,
+ Psrlw,
+ Psrld,
+ Psrlq,
+ Psubb,
+ Psubd,
+ Psubq,
+ Psubw,
+ Psubsb,
+ Psubsw,
+ Psubusb,
+ Psubusw,
+ Ptest,
+ Pxor,
+ Rcpss,
+ Roundss,
+ Roundsd,
+ Rsqrtss,
+ Sqrtps,
+ Sqrtpd,
+ Sqrtss,
+ Sqrtsd,
+ Subps,
+ Subpd,
+ Subss,
+ Subsd,
+ Ucomiss,
+ Ucomisd,
+ Xorps,
+ Xorpd,
+}
+
+impl SseOpcode {
+ /// Which `InstructionSet` is the first supporting this opcode?
+ pub(crate) fn available_from(&self) -> InstructionSet {
+ use InstructionSet::*;
+ match self {
+ SseOpcode::Addps
+ | SseOpcode::Addss
+ | SseOpcode::Andps
+ | SseOpcode::Andnps
+ | SseOpcode::Comiss
+ | SseOpcode::Cmpps
+ | SseOpcode::Cmpss
+ | SseOpcode::Cvtsi2ss
+ | SseOpcode::Cvtss2si
+ | SseOpcode::Cvttss2si
+ | SseOpcode::Divps
+ | SseOpcode::Divss
+ | SseOpcode::Maxps
+ | SseOpcode::Maxss
+ | SseOpcode::Minps
+ | SseOpcode::Minss
+ | SseOpcode::Movaps
+ | SseOpcode::Movlhps
+ | SseOpcode::Movmskps
+ | SseOpcode::Movss
+ | SseOpcode::Movups
+ | SseOpcode::Mulps
+ | SseOpcode::Mulss
+ | SseOpcode::Orps
+ | SseOpcode::Rcpss
+ | SseOpcode::Rsqrtss
+ | SseOpcode::Sqrtps
+ | SseOpcode::Sqrtss
+ | SseOpcode::Subps
+ | SseOpcode::Subss
+ | SseOpcode::Ucomiss
+ | SseOpcode::Xorps => SSE,
+
+ SseOpcode::Addpd
+ | SseOpcode::Addsd
+ | SseOpcode::Andpd
+ | SseOpcode::Andnpd
+ | SseOpcode::Cmppd
+ | SseOpcode::Cmpsd
+ | SseOpcode::Comisd
+ | SseOpcode::Cvtdq2ps
+ | SseOpcode::Cvtsd2ss
+ | SseOpcode::Cvtsd2si
+ | SseOpcode::Cvtsi2sd
+ | SseOpcode::Cvtss2sd
+ | SseOpcode::Cvttps2dq
+ | SseOpcode::Cvttsd2si
+ | SseOpcode::Divpd
+ | SseOpcode::Divsd
+ | SseOpcode::Maxpd
+ | SseOpcode::Maxsd
+ | SseOpcode::Minpd
+ | SseOpcode::Minsd
+ | SseOpcode::Movapd
+ | SseOpcode::Movd
+ | SseOpcode::Movmskpd
+ | SseOpcode::Movq
+ | SseOpcode::Movsd
+ | SseOpcode::Movupd
+ | SseOpcode::Movdqa
+ | SseOpcode::Movdqu
+ | SseOpcode::Mulpd
+ | SseOpcode::Mulsd
+ | SseOpcode::Orpd
+ | SseOpcode::Packsswb
+ | SseOpcode::Paddb
+ | SseOpcode::Paddd
+ | SseOpcode::Paddq
+ | SseOpcode::Paddw
+ | SseOpcode::Paddsb
+ | SseOpcode::Paddsw
+ | SseOpcode::Paddusb
+ | SseOpcode::Paddusw
+ | SseOpcode::Pand
+ | SseOpcode::Pandn
+ | SseOpcode::Pavgb
+ | SseOpcode::Pavgw
+ | SseOpcode::Pcmpeqb
+ | SseOpcode::Pcmpeqw
+ | SseOpcode::Pcmpeqd
+ | SseOpcode::Pcmpgtb
+ | SseOpcode::Pcmpgtw
+ | SseOpcode::Pcmpgtd
+ | SseOpcode::Pextrw
+ | SseOpcode::Pinsrw
+ | SseOpcode::Pmaxsw
+ | SseOpcode::Pmaxub
+ | SseOpcode::Pminsw
+ | SseOpcode::Pminub
+ | SseOpcode::Pmovmskb
+ | SseOpcode::Pmullw
+ | SseOpcode::Pmuludq
+ | SseOpcode::Por
+ | SseOpcode::Pshufd
+ | SseOpcode::Psllw
+ | SseOpcode::Pslld
+ | SseOpcode::Psllq
+ | SseOpcode::Psraw
+ | SseOpcode::Psrad
+ | SseOpcode::Psrlw
+ | SseOpcode::Psrld
+ | SseOpcode::Psrlq
+ | SseOpcode::Psubb
+ | SseOpcode::Psubd
+ | SseOpcode::Psubq
+ | SseOpcode::Psubw
+ | SseOpcode::Psubsb
+ | SseOpcode::Psubsw
+ | SseOpcode::Psubusb
+ | SseOpcode::Psubusw
+ | SseOpcode::Pxor
+ | SseOpcode::Sqrtpd
+ | SseOpcode::Sqrtsd
+ | SseOpcode::Subpd
+ | SseOpcode::Subsd
+ | SseOpcode::Ucomisd
+ | SseOpcode::Xorpd => SSE2,
+
+ SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
+
+ SseOpcode::Insertps
+ | SseOpcode::Pcmpeqq
+ | SseOpcode::Pextrb
+ | SseOpcode::Pextrd
+ | SseOpcode::Pinsrb
+ | SseOpcode::Pinsrd
+ | SseOpcode::Pmaxsb
+ | SseOpcode::Pmaxsd
+ | SseOpcode::Pmaxuw
+ | SseOpcode::Pmaxud
+ | SseOpcode::Pminsb
+ | SseOpcode::Pminsd
+ | SseOpcode::Pminuw
+ | SseOpcode::Pminud
+ | SseOpcode::Pmulld
+ | SseOpcode::Ptest
+ | SseOpcode::Roundss
+ | SseOpcode::Roundsd => SSE41,
+
+ SseOpcode::Pcmpgtq => SSE42,
+ }
+ }
+
+ /// Returns the src operand size for an instruction.
+ pub(crate) fn src_size(&self) -> u8 {
+ match self {
+ SseOpcode::Movd => 4,
+ _ => 8,
+ }
+ }
+}
+
+impl fmt::Debug for SseOpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ SseOpcode::Addps => "addps",
+ SseOpcode::Addpd => "addpd",
+ SseOpcode::Addss => "addss",
+ SseOpcode::Addsd => "addsd",
+ SseOpcode::Andpd => "andpd",
+ SseOpcode::Andps => "andps",
+ SseOpcode::Andnps => "andnps",
+ SseOpcode::Andnpd => "andnpd",
+ SseOpcode::Cmpps => "cmpps",
+ SseOpcode::Cmppd => "cmppd",
+ SseOpcode::Cmpss => "cmpss",
+ SseOpcode::Cmpsd => "cmpsd",
+ SseOpcode::Comiss => "comiss",
+ SseOpcode::Comisd => "comisd",
+ SseOpcode::Cvtdq2ps => "cvtdq2ps",
+ SseOpcode::Cvtsd2ss => "cvtsd2ss",
+ SseOpcode::Cvtsd2si => "cvtsd2si",
+ SseOpcode::Cvtsi2ss => "cvtsi2ss",
+ SseOpcode::Cvtsi2sd => "cvtsi2sd",
+ SseOpcode::Cvtss2si => "cvtss2si",
+ SseOpcode::Cvtss2sd => "cvtss2sd",
+ SseOpcode::Cvttps2dq => "cvttps2dq",
+ SseOpcode::Cvttss2si => "cvttss2si",
+ SseOpcode::Cvttsd2si => "cvttsd2si",
+ SseOpcode::Divps => "divps",
+ SseOpcode::Divpd => "divpd",
+ SseOpcode::Divss => "divss",
+ SseOpcode::Divsd => "divsd",
+ SseOpcode::Insertps => "insertps",
+ SseOpcode::Maxps => "maxps",
+ SseOpcode::Maxpd => "maxpd",
+ SseOpcode::Maxss => "maxss",
+ SseOpcode::Maxsd => "maxsd",
+ SseOpcode::Minps => "minps",
+ SseOpcode::Minpd => "minpd",
+ SseOpcode::Minss => "minss",
+ SseOpcode::Minsd => "minsd",
+ SseOpcode::Movaps => "movaps",
+ SseOpcode::Movapd => "movapd",
+ SseOpcode::Movd => "movd",
+ SseOpcode::Movdqa => "movdqa",
+ SseOpcode::Movdqu => "movdqu",
+ SseOpcode::Movlhps => "movlhps",
+ SseOpcode::Movmskps => "movmskps",
+ SseOpcode::Movmskpd => "movmskpd",
+ SseOpcode::Movq => "movq",
+ SseOpcode::Movss => "movss",
+ SseOpcode::Movsd => "movsd",
+ SseOpcode::Movups => "movups",
+ SseOpcode::Movupd => "movupd",
+ SseOpcode::Mulps => "mulps",
+ SseOpcode::Mulpd => "mulpd",
+ SseOpcode::Mulss => "mulss",
+ SseOpcode::Mulsd => "mulsd",
+ SseOpcode::Orpd => "orpd",
+ SseOpcode::Orps => "orps",
+ SseOpcode::Pabsb => "pabsb",
+ SseOpcode::Pabsw => "pabsw",
+ SseOpcode::Pabsd => "pabsd",
+ SseOpcode::Packsswb => "packsswb",
+ SseOpcode::Paddb => "paddb",
+ SseOpcode::Paddd => "paddd",
+ SseOpcode::Paddq => "paddq",
+ SseOpcode::Paddw => "paddw",
+ SseOpcode::Paddsb => "paddsb",
+ SseOpcode::Paddsw => "paddsw",
+ SseOpcode::Paddusb => "paddusb",
+ SseOpcode::Paddusw => "paddusw",
+ SseOpcode::Pand => "pand",
+ SseOpcode::Pandn => "pandn",
+ SseOpcode::Pavgb => "pavgb",
+ SseOpcode::Pavgw => "pavgw",
+ SseOpcode::Pcmpeqb => "pcmpeqb",
+ SseOpcode::Pcmpeqw => "pcmpeqw",
+ SseOpcode::Pcmpeqd => "pcmpeqd",
+ SseOpcode::Pcmpeqq => "pcmpeqq",
+ SseOpcode::Pcmpgtb => "pcmpgtb",
+ SseOpcode::Pcmpgtw => "pcmpgtw",
+ SseOpcode::Pcmpgtd => "pcmpgtd",
+ SseOpcode::Pcmpgtq => "pcmpgtq",
+ SseOpcode::Pextrb => "pextrb",
+ SseOpcode::Pextrw => "pextrw",
+ SseOpcode::Pextrd => "pextrd",
+ SseOpcode::Pinsrb => "pinsrb",
+ SseOpcode::Pinsrw => "pinsrw",
+ SseOpcode::Pinsrd => "pinsrd",
+ SseOpcode::Pmaxsb => "pmaxsb",
+ SseOpcode::Pmaxsw => "pmaxsw",
+ SseOpcode::Pmaxsd => "pmaxsd",
+ SseOpcode::Pmaxub => "pmaxub",
+ SseOpcode::Pmaxuw => "pmaxuw",
+ SseOpcode::Pmaxud => "pmaxud",
+ SseOpcode::Pminsb => "pminsb",
+ SseOpcode::Pminsw => "pminsw",
+ SseOpcode::Pminsd => "pminsd",
+ SseOpcode::Pminub => "pminub",
+ SseOpcode::Pminuw => "pminuw",
+ SseOpcode::Pminud => "pminud",
+ SseOpcode::Pmovmskb => "pmovmskb",
+ SseOpcode::Pmulld => "pmulld",
+ SseOpcode::Pmullw => "pmullw",
+ SseOpcode::Pmuludq => "pmuludq",
+ SseOpcode::Por => "por",
+ SseOpcode::Pshufb => "pshufb",
+ SseOpcode::Pshufd => "pshufd",
+ SseOpcode::Psllw => "psllw",
+ SseOpcode::Pslld => "pslld",
+ SseOpcode::Psllq => "psllq",
+ SseOpcode::Psraw => "psraw",
+ SseOpcode::Psrad => "psrad",
+ SseOpcode::Psrlw => "psrlw",
+ SseOpcode::Psrld => "psrld",
+ SseOpcode::Psrlq => "psrlq",
+ SseOpcode::Psubb => "psubb",
+ SseOpcode::Psubd => "psubd",
+ SseOpcode::Psubq => "psubq",
+ SseOpcode::Psubw => "psubw",
+ SseOpcode::Psubsb => "psubsb",
+ SseOpcode::Psubsw => "psubsw",
+ SseOpcode::Psubusb => "psubusb",
+ SseOpcode::Psubusw => "psubusw",
+ SseOpcode::Ptest => "ptest",
+ SseOpcode::Pxor => "pxor",
+ SseOpcode::Rcpss => "rcpss",
+ SseOpcode::Roundss => "roundss",
+ SseOpcode::Roundsd => "roundsd",
+ SseOpcode::Rsqrtss => "rsqrtss",
+ SseOpcode::Sqrtps => "sqrtps",
+ SseOpcode::Sqrtpd => "sqrtpd",
+ SseOpcode::Sqrtss => "sqrtss",
+ SseOpcode::Sqrtsd => "sqrtsd",
+ SseOpcode::Subps => "subps",
+ SseOpcode::Subpd => "subpd",
+ SseOpcode::Subss => "subss",
+ SseOpcode::Subsd => "subsd",
+ SseOpcode::Ucomiss => "ucomiss",
+ SseOpcode::Ucomisd => "ucomisd",
+ SseOpcode::Xorps => "xorps",
+ SseOpcode::Xorpd => "xorpd",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for SseOpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// This defines the ways a value can be extended: either signed- or zero-extension, or none for
+/// types that are not extended. Contrast with [ExtMode], which defines the widths from and to which
+/// values can be extended.
+#[derive(Clone, PartialEq)]
+pub enum ExtKind {
+ None,
+ SignExtend,
+ ZeroExtend,
+}
+
+/// These indicate ways of extending (widening) a value, using the Intel
+/// naming: B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64
+#[derive(Clone, PartialEq)]
+pub enum ExtMode {
+ /// Byte -> Longword.
+ BL,
+ /// Byte -> Quadword.
+ BQ,
+ /// Word -> Longword.
+ WL,
+ /// Word -> Quadword.
+ WQ,
+ /// Longword -> Quadword.
+ LQ,
+}
+
+impl ExtMode {
+ /// Calculate the `ExtMode` from passed bit lengths of the from/to types.
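+    ///
+    /// For example, `ExtMode::new(8, 64)` is `Some(ExtMode::BQ)`, while `ExtMode::new(32, 32)`
+    /// is `None` because no extension is needed.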
+ pub(crate) fn new(from_bits: u16, to_bits: u16) -> Option<ExtMode> {
+ match (from_bits, to_bits) {
+ (1, 8) | (1, 16) | (1, 32) | (8, 16) | (8, 32) => Some(ExtMode::BL),
+ (1, 64) | (8, 64) => Some(ExtMode::BQ),
+ (16, 32) => Some(ExtMode::WL),
+ (16, 64) => Some(ExtMode::WQ),
+ (32, 64) => Some(ExtMode::LQ),
+ _ => None,
+ }
+ }
+
+ /// Return the source register size in bytes.
+ pub(crate) fn src_size(&self) -> u8 {
+ match self {
+ ExtMode::BL | ExtMode::BQ => 1,
+ ExtMode::WL | ExtMode::WQ => 2,
+ ExtMode::LQ => 4,
+ }
+ }
+
+ /// Return the destination register size in bytes.
+ pub(crate) fn dst_size(&self) -> u8 {
+ match self {
+ ExtMode::BL | ExtMode::WL => 4,
+ ExtMode::BQ | ExtMode::WQ | ExtMode::LQ => 8,
+ }
+ }
+}
+
+impl fmt::Debug for ExtMode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ ExtMode::BL => "bl",
+ ExtMode::BQ => "bq",
+ ExtMode::WL => "wl",
+ ExtMode::WQ => "wq",
+ ExtMode::LQ => "lq",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for ExtMode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// These indicate the form of a scalar shift or rotate: left shift, logical (unsigned) right
+/// shift, arithmetic (signed) right shift, or a rotate in either direction.
+#[derive(Clone)]
+pub enum ShiftKind {
+ ShiftLeft,
+ /// Inserts zeros in the most significant bits.
+ ShiftRightLogical,
+ /// Replicates the sign bit in the most significant bits.
+ ShiftRightArithmetic,
+ RotateLeft,
+ RotateRight,
+}
+
+impl fmt::Debug for ShiftKind {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ ShiftKind::ShiftLeft => "shl",
+ ShiftKind::ShiftRightLogical => "shr",
+ ShiftKind::ShiftRightArithmetic => "sar",
+ ShiftKind::RotateLeft => "rol",
+ ShiftKind::RotateRight => "ror",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for ShiftKind {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// Indicates the kind of division or remainder instruction: signed or unsigned, and whether the
+/// quotient or the remainder is produced.
+#[derive(Clone)]
+pub enum DivOrRemKind {
+ SignedDiv,
+ UnsignedDiv,
+ SignedRem,
+ UnsignedRem,
+}
+
+impl DivOrRemKind {
+ pub(crate) fn is_signed(&self) -> bool {
+ match self {
+ DivOrRemKind::SignedDiv | DivOrRemKind::SignedRem => true,
+ _ => false,
+ }
+ }
+
+ pub(crate) fn is_div(&self) -> bool {
+ match self {
+ DivOrRemKind::SignedDiv | DivOrRemKind::UnsignedDiv => true,
+ _ => false,
+ }
+ }
+}
+
+/// These indicate condition code tests. Not all are represented since not all are useful in
+/// compiler-generated code.
+#[derive(Copy, Clone)]
+#[repr(u8)]
+pub enum CC {
+ /// overflow
+ O = 0,
+ /// no overflow
+ NO = 1,
+
+ /// < unsigned
+ B = 2,
+ /// >= unsigned
+ NB = 3,
+
+ /// zero
+ Z = 4,
+ /// not-zero
+ NZ = 5,
+
+ /// <= unsigned
+ BE = 6,
+ /// > unsigned
+ NBE = 7,
+
+ /// negative
+ S = 8,
+ /// not-negative
+ NS = 9,
+
+ /// < signed
+ L = 12,
+ /// >= signed
+ NL = 13,
+
+ /// <= signed
+ LE = 14,
+ /// > signed
+ NLE = 15,
+
+ /// parity
+ P = 10,
+
+ /// not parity
+ NP = 11,
+}
+
+impl CC {
+ pub(crate) fn from_intcc(intcc: IntCC) -> Self {
+ match intcc {
+ IntCC::Equal => CC::Z,
+ IntCC::NotEqual => CC::NZ,
+ IntCC::SignedGreaterThanOrEqual => CC::NL,
+ IntCC::SignedGreaterThan => CC::NLE,
+ IntCC::SignedLessThanOrEqual => CC::LE,
+ IntCC::SignedLessThan => CC::L,
+ IntCC::UnsignedGreaterThanOrEqual => CC::NB,
+ IntCC::UnsignedGreaterThan => CC::NBE,
+ IntCC::UnsignedLessThanOrEqual => CC::BE,
+ IntCC::UnsignedLessThan => CC::B,
+ IntCC::Overflow => CC::O,
+ IntCC::NotOverflow => CC::NO,
+ }
+ }
+
+ pub(crate) fn invert(&self) -> Self {
+ match self {
+ CC::O => CC::NO,
+ CC::NO => CC::O,
+
+ CC::B => CC::NB,
+ CC::NB => CC::B,
+
+ CC::Z => CC::NZ,
+ CC::NZ => CC::Z,
+
+ CC::BE => CC::NBE,
+ CC::NBE => CC::BE,
+
+ CC::S => CC::NS,
+ CC::NS => CC::S,
+
+ CC::L => CC::NL,
+ CC::NL => CC::L,
+
+ CC::LE => CC::NLE,
+ CC::NLE => CC::LE,
+
+ CC::P => CC::NP,
+ CC::NP => CC::P,
+ }
+ }
+
+ pub(crate) fn from_floatcc(floatcc: FloatCC) -> Self {
+ match floatcc {
+ FloatCC::Ordered => CC::NP,
+ FloatCC::Unordered => CC::P,
+ // Alias for NE
+ FloatCC::OrderedNotEqual => CC::NZ,
+ // Alias for E
+ FloatCC::UnorderedOrEqual => CC::Z,
+ // Alias for A
+ FloatCC::GreaterThan => CC::NBE,
+ // Alias for AE
+ FloatCC::GreaterThanOrEqual => CC::NB,
+ FloatCC::UnorderedOrLessThan => CC::B,
+ FloatCC::UnorderedOrLessThanOrEqual => CC::BE,
+ FloatCC::Equal
+ | FloatCC::NotEqual
+ | FloatCC::LessThan
+ | FloatCC::LessThanOrEqual
+ | FloatCC::UnorderedOrGreaterThan
+ | FloatCC::UnorderedOrGreaterThanOrEqual => panic!(
+ "{:?} can't be lowered to a CC code; treat as special case.",
+ floatcc
+ ),
+ }
+ }
+
+ pub(crate) fn get_enc(self) -> u8 {
+ self as u8
+ }
+}
+
+impl fmt::Debug for CC {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ CC::O => "o",
+ CC::NO => "no",
+ CC::B => "b",
+ CC::NB => "nb",
+ CC::Z => "z",
+ CC::NZ => "nz",
+ CC::BE => "be",
+ CC::NBE => "nbe",
+ CC::S => "s",
+ CC::NS => "ns",
+ CC::L => "l",
+ CC::NL => "nl",
+ CC::LE => "le",
+ CC::NLE => "nle",
+ CC::P => "p",
+ CC::NP => "np",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for CC {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// Encodes the ways that floats can be compared. This is used as an immediate in float comparisons
+/// such as `cmpps`; it is distinguished from other float comparisons (e.g. `ucomiss`), which set
+/// EFLAGS rather than taking an [FcmpImm] immediate.
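+///
+/// For example, `FcmpImm::from(FloatCC::LessThan).encode()` yields `0x01`, the immediate that
+/// requests a less-than comparison.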
+pub(crate) enum FcmpImm {
+ Equal = 0x00,
+ LessThan = 0x01,
+ LessThanOrEqual = 0x02,
+ Unordered = 0x03,
+ NotEqual = 0x04,
+ UnorderedOrGreaterThanOrEqual = 0x05,
+ UnorderedOrGreaterThan = 0x06,
+ Ordered = 0x07,
+}
+
+impl FcmpImm {
+ pub(crate) fn encode(self) -> u8 {
+ self as u8
+ }
+}
+
+impl From<FloatCC> for FcmpImm {
+ fn from(cond: FloatCC) -> Self {
+ match cond {
+ FloatCC::Equal => FcmpImm::Equal,
+ FloatCC::LessThan => FcmpImm::LessThan,
+ FloatCC::LessThanOrEqual => FcmpImm::LessThanOrEqual,
+ FloatCC::Unordered => FcmpImm::Unordered,
+ FloatCC::NotEqual => FcmpImm::NotEqual,
+ FloatCC::UnorderedOrGreaterThanOrEqual => FcmpImm::UnorderedOrGreaterThanOrEqual,
+ FloatCC::UnorderedOrGreaterThan => FcmpImm::UnorderedOrGreaterThan,
+ FloatCC::Ordered => FcmpImm::Ordered,
+ _ => panic!("unable to create comparison predicate for {}", cond),
+ }
+ }
+}
+
+/// An operand's size in bits.
+#[derive(Clone, Copy, PartialEq)]
+pub enum OperandSize {
+ Size32,
+ Size64,
+}
+
+impl OperandSize {
+ pub(crate) fn from_bytes(num_bytes: u32) -> Self {
+ match num_bytes {
+ 1 | 2 | 4 => OperandSize::Size32,
+ 8 => OperandSize::Size64,
+ _ => unreachable!(),
+ }
+ }
+
+ pub(crate) fn to_bytes(&self) -> u8 {
+ match self {
+ Self::Size32 => 4,
+ Self::Size64 => 8,
+ }
+ }
+
+ pub(crate) fn to_bits(&self) -> u8 {
+ match self {
+ Self::Size32 => 32,
+ Self::Size64 => 64,
+ }
+ }
+}
+
+/// An x64 memory fence kind.
+#[derive(Clone)]
+#[allow(dead_code)]
+pub enum FenceKind {
+ /// `mfence` instruction ("Memory Fence")
+ MFence,
+ /// `lfence` instruction ("Load Fence")
+ LFence,
+ /// `sfence` instruction ("Store Fence")
+ SFence,
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs
new file mode 100644
index 0000000000..dd4125a2da
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs
@@ -0,0 +1,2819 @@
+use crate::binemit::{Addend, Reloc};
+use crate::ir::immediates::{Ieee32, Ieee64};
+use crate::ir::TrapCode;
+use crate::isa::x64::inst::args::*;
+use crate::isa::x64::inst::*;
+use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
+use core::convert::TryInto;
+use log::debug;
+use regalloc::{Reg, RegClass, Writable};
+
+fn low8_will_sign_extend_to_64(x: u32) -> bool {
+ let xs = (x as i32) as i64;
+ xs == ((xs << 56) >> 56)
+}
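+
+// For example, low8_will_sign_extend_to_64(0x7f) and low8_will_sign_extend_to_64(0xffff_ff80)
+// are true (the 8-bit values 127 and -128), while low8_will_sign_extend_to_64(0x80) is false.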
+
+fn low8_will_sign_extend_to_32(x: u32) -> bool {
+ let xs = x as i32;
+ xs == ((xs << 24) >> 24)
+}
+
+//=============================================================================
+// Instructions and subcomponents: emission
+
+// For all of the routines that take both a memory-or-reg operand (sometimes
+// called "E" in the Intel documentation) and a reg-only operand ("G" in
+// Intelese), the order is always G first, then E.
+//
+// "enc" in the following means "hardware register encoding number".
+
+#[inline(always)]
+fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 {
+ debug_assert!(m0d < 4);
+ debug_assert!(enc_reg_g < 8);
+ debug_assert!(rm_e < 8);
+ ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7)
+}
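+
+// For example, encode_modrm(0b11, 2, 5) is 0b11_010_101 == 0xd5: register-direct mode with
+// reg field 2 and r/m field 5.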
+
+#[inline(always)]
+fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 {
+ debug_assert!(shift < 4);
+ debug_assert!(enc_index < 8);
+ debug_assert!(enc_base < 8);
+ ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7)
+}
+
+/// Get the encoding number of a GPR.
+#[inline(always)]
+fn int_reg_enc(reg: Reg) -> u8 {
+ debug_assert!(reg.is_real());
+ debug_assert_eq!(reg.get_class(), RegClass::I64);
+ reg.get_hw_encoding()
+}
+
+/// Get the encoding number of any register.
+#[inline(always)]
+fn reg_enc(reg: Reg) -> u8 {
+ debug_assert!(reg.is_real());
+ reg.get_hw_encoding()
+}
+
+/// A small bit field to record a REX prefix specification:
+/// - bit 0 set to 1 indicates REX.W must be 0 (cleared).
+/// - bit 1 set to 1 indicates the REX prefix must always be emitted.
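+///
+/// For example, an 8-bit operation that names one of SIL/DIL/SPL/BPL must use `clear_w()`
+/// together with `always_emit()`, since those byte registers are only addressable when a REX
+/// prefix (even an otherwise-redundant 0x40) is present.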
+#[repr(transparent)]
+#[derive(Clone, Copy)]
+struct RexFlags(u8);
+
+impl RexFlags {
+ /// By default, set the W field, and don't always emit.
+ #[inline(always)]
+ fn set_w() -> Self {
+ Self(0)
+ }
+    /// Creates a new `RexFlags` value for which the REX.W bit will be cleared.
+ #[inline(always)]
+ fn clear_w() -> Self {
+ Self(1)
+ }
+
+ #[inline(always)]
+ fn always_emit(&mut self) -> &mut Self {
+ self.0 = self.0 | 2;
+ self
+ }
+
+ #[inline(always)]
+ fn must_clear_w(&self) -> bool {
+ (self.0 & 1) != 0
+ }
+ #[inline(always)]
+ fn must_always_emit(&self) -> bool {
+ (self.0 & 2) != 0
+ }
+
+ #[inline(always)]
+ fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) {
+ let w = if self.must_clear_w() { 0 } else { 1 };
+ let r = (enc_g >> 3) & 1;
+ let x = 0;
+ let b = (enc_e >> 3) & 1;
+ let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+ if rex != 0x40 || self.must_always_emit() {
+ sink.put1(rex);
+ }
+ }
+
+ #[inline(always)]
+ fn emit_three_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_index: u8, enc_base: u8) {
+ let w = if self.must_clear_w() { 0 } else { 1 };
+ let r = (enc_g >> 3) & 1;
+ let x = (enc_index >> 3) & 1;
+ let b = (enc_base >> 3) & 1;
+ let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+ if rex != 0x40 || self.must_always_emit() {
+ sink.put1(rex);
+ }
+ }
+}
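+// As an illustration of RexFlags: set_w() with enc_g = 0 (rax) and enc_e = 1 (rcx) emits 0x48
+// (REX.W only); clear_w() with enc_e = 9 (r9) emits 0x41 (REX.B only); and clear_w() with both
+// encodings below 8 computes the redundant 0x40 and emits nothing unless always_emit() was
+// requested.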
+
+/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
+/// covers only the small set of possibilities that we actually need.
+enum LegacyPrefixes {
+ /// No prefix bytes
+ None,
+ /// Operand Size Override -- here, denoting "16-bit operation"
+ _66,
+ /// The Lock prefix
+ _F0,
+ /// Operand size override and Lock
+ _66F0,
+    /// REPNE, but no specific meaning here -- it's just an opcode extension.
+    _F2,
+    /// REP/REPE, but no specific meaning here -- it's just an opcode extension.
+    _F3,
+}
+
+impl LegacyPrefixes {
+ #[inline(always)]
+ fn emit(&self, sink: &mut MachBuffer<Inst>) {
+ match self {
+ LegacyPrefixes::_66 => sink.put1(0x66),
+ LegacyPrefixes::_F0 => sink.put1(0xF0),
+ LegacyPrefixes::_66F0 => {
+ // I don't think the order matters, but in any case, this is the same order that
+ // the GNU assembler uses.
+ sink.put1(0x66);
+ sink.put1(0xF0);
+ }
+ LegacyPrefixes::_F2 => sink.put1(0xF2),
+ LegacyPrefixes::_F3 => sink.put1(0xF3),
+ LegacyPrefixes::None => (),
+ }
+ }
+}
+
+/// This is the core 'emit' function for instructions that reference memory.
+///
+/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`,
+/// create and emit:
+/// - first the legacy prefixes, if any
+/// - then the REX prefix, if needed
+/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`),
+/// - then the MOD/RM byte,
+/// - then optionally, a SIB byte,
+/// - and finally optionally an immediate that will be derived from the `mem_e` operand.
+///
+/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is
+/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed
+/// instructions will require their own emitter functions.
+///
+/// This will also work for 32-bit x86 instructions, assuming no REX prefix is provided.
+///
+/// The opcodes are written big-endian for the convenience of callers. For example, if the opcode
+/// bytes to be emitted are, in this order, F3 0F 27, then the caller should pass `opcodes` ==
+/// 0xF3_0F_27 and `num_opcodes` == 3.
+///
+/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`.
+/// `rex` can specify special handling for the REX prefix. By default, the REX prefix will
+/// indicate a 64-bit operation and will be omitted if it is redundant (0x40). Note that for a
+/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to
+/// indicate a 64-bit operation.
+fn emit_std_enc_mem(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ mut num_opcodes: usize,
+ enc_g: u8,
+ mem_e: &Amode,
+ rex: RexFlags,
+) {
+ // General comment for this function: the registers in `mem_e` must be
+ // 64-bit integer registers, because they are part of an address
+ // expression. But `enc_g` can be derived from a register of any class.
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && mem_e.can_trap() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ prefixes.emit(sink);
+
+ match mem_e {
+ Amode::ImmReg { simm32, base, .. } => {
+ // First, the REX byte.
+ let enc_e = int_reg_enc(*base);
+ rex.emit_two_op(sink, enc_g, enc_e);
+
+ // Now the opcode(s). These include any other prefixes the caller
+ // hands to us.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // Now the mod/rm and associated immediates. This is
+ // significantly complicated due to the multiple special cases.
+ if *simm32 == 0
+ && enc_e != regs::ENC_RSP
+ && enc_e != regs::ENC_RBP
+ && enc_e != regs::ENC_R12
+ && enc_e != regs::ENC_R13
+ {
+ // FIXME JRS 2020Feb11: those four tests can surely be
+ // replaced by a single mask-and-compare check. We should do
+ // that because this routine is likely to be hot.
+ sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7));
+ } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) {
+ sink.put1(encode_modrm(0, enc_g & 7, 4));
+ sink.put1(0x24);
+ } else if low8_will_sign_extend_to_32(*simm32)
+ && enc_e != regs::ENC_RSP
+ && enc_e != regs::ENC_R12
+ {
+ sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
+ sink.put1((simm32 & 0xFF) as u8);
+ } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
+ sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
+ sink.put4(*simm32);
+ } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
+ && low8_will_sign_extend_to_32(*simm32)
+ {
+ // REX.B distinguishes RSP from R12
+ sink.put1(encode_modrm(1, enc_g & 7, 4));
+ sink.put1(0x24);
+ sink.put1((simm32 & 0xFF) as u8);
+ } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
+                // .. awaiting a test case that exercises the RSP variant of this path.
+ // REX.B distinguishes RSP from R12
+ sink.put1(encode_modrm(2, enc_g & 7, 4));
+ sink.put1(0x24);
+ sink.put4(*simm32);
+ } else {
+ unreachable!("ImmReg");
+ }
+ }
+
+ Amode::ImmRegRegShift {
+ simm32,
+ base: reg_base,
+ index: reg_index,
+ shift,
+ ..
+ } => {
+ let enc_base = int_reg_enc(*reg_base);
+ let enc_index = int_reg_enc(*reg_index);
+
+ // The rex byte.
+ rex.emit_three_op(sink, enc_g, enc_index, enc_base);
+
+ // All other prefixes and opcodes.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // modrm, SIB, immediates.
+ if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP {
+ sink.put1(encode_modrm(1, enc_g & 7, 4));
+ sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+ sink.put1(*simm32 as u8);
+ } else if enc_index != regs::ENC_RSP {
+ sink.put1(encode_modrm(2, enc_g & 7, 4));
+ sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+ sink.put4(*simm32);
+ } else {
+ panic!("ImmRegRegShift");
+ }
+ }
+
+ Amode::RipRelative { ref target } => {
+ // First, the REX byte, with REX.B = 0.
+ rex.emit_two_op(sink, enc_g, 0);
+
+ // Now the opcode(s). These include any other prefixes the caller
+ // hands to us.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // RIP-relative is mod=00, rm=101.
+ sink.put1(encode_modrm(0, enc_g & 7, 0b101));
+
+ let offset = sink.cur_offset();
+ sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
+ sink.put4(0);
+ }
+ }
+}
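+// As a worked example of emit_std_enc_mem: for `mov rax, [rbx + 0x10]` -- REX.W set, opcodes ==
+// 0x8B, enc_g == 0 (rax), and mem_e == Amode::ImmReg { simm32: 0x10, base: rbx } -- it emits
+// 48 8B 43 10: the REX.W prefix, the opcode, ModRM with mod=01/reg=000/rm=011, and the 8-bit
+// displacement (chosen because 0x10 sign-extends from 8 bits).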
+
+/// This is the core 'emit' function for instructions that do not reference memory.
+///
+/// This is conceptually the same as emit_std_enc_mem above, except it is for the case where the E
+/// operand is a register rather than memory. Hence it is much simpler.
+fn emit_std_enc_enc(
+ sink: &mut MachBuffer<Inst>,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ mut num_opcodes: usize,
+ enc_g: u8,
+ enc_e: u8,
+ rex: RexFlags,
+) {
+ // EncG and EncE can be derived from registers of any class, and they
+ // don't even have to be from the same class. For example, for an
+ // integer-to-FP conversion insn, one might be RegClass::I64 and the other
+ // RegClass::V128.
+
+ // The legacy prefixes.
+ prefixes.emit(sink);
+
+ // The rex byte.
+ rex.emit_two_op(sink, enc_g, enc_e);
+
+ // All other prefixes and opcodes.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // Now the mod/rm byte. The instruction we're generating doesn't access
+ // memory, so there is no SIB byte or immediate -- we're done.
+ sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7));
+}
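+// As a worked example of emit_std_enc_enc: `imul rax, rcx` -- opcodes == 0x0FAF, num_opcodes == 2,
+// enc_g == 0 (rax), enc_e == 1 (rcx), REX.W set -- emits 48 0F AF C1, where 0xC1 is ModRM with
+// mod=11/reg=000/rm=001.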
+
+// These are merely wrappers for the above two functions that facilitate passing
+// actual `Reg`s rather than their encodings.
+
+fn emit_std_reg_mem(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ num_opcodes: usize,
+ reg_g: Reg,
+ mem_e: &Amode,
+ rex: RexFlags,
+) {
+ let enc_g = reg_enc(reg_g);
+ emit_std_enc_mem(
+ sink,
+ state,
+ prefixes,
+ opcodes,
+ num_opcodes,
+ enc_g,
+ mem_e,
+ rex,
+ );
+}
+
+fn emit_std_reg_reg(
+ sink: &mut MachBuffer<Inst>,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ num_opcodes: usize,
+ reg_g: Reg,
+ reg_e: Reg,
+ rex: RexFlags,
+) {
+ let enc_g = reg_enc(reg_g);
+ let enc_e = reg_enc(reg_e);
+ emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex);
+}
+
+/// Write a suitable number of bits from an imm64 to the sink. Note that a `size` of 8 still
+/// emits only a 32-bit immediate, which the processor sign-extends to 64 bits.
+fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
+ match size {
+ 8 | 4 => sink.put4(simm32),
+ 2 => sink.put2(simm32 as u16),
+ 1 => sink.put1(simm32 as u8),
+ _ => unreachable!(),
+ }
+}
+
+/// A small helper to generate a signed conversion instruction.
+fn emit_signed_cvt(
+ sink: &mut MachBuffer<Inst>,
+ info: &EmitInfo,
+ state: &mut EmitState,
+ src: Reg,
+ dst: Writable<Reg>,
+ to_f64: bool,
+) {
+ // Handle an unsigned int, which is the "easy" case: a signed conversion will do the
+ // right thing.
+ let op = if to_f64 {
+ SseOpcode::Cvtsi2sd
+ } else {
+ SseOpcode::Cvtsi2ss
+ };
+ let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
+ inst.emit(sink, info, state);
+}
+
+/// Emits a one-way conditional jump if CC is set (true).
+fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+ sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ sink.put4(0x0);
+}
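+// For instance, one_way_jmp with cc == CC::Z (hardware encoding 4) emits 0F 84 followed by a
+// zeroed 32-bit displacement, which the MachBuffer fixes up once the label's offset is known.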
+
+/// Emits a relocation, attaching the current source location as well.
+fn emit_reloc(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ kind: Reloc,
+ name: &ExternalName,
+ addend: Addend,
+) {
+ let srcloc = state.cur_srcloc();
+ sink.add_reloc(srcloc, kind, name, addend);
+}
+
+/// The top-level emit function.
+///
+/// Important! Do not add improved (shortened) encoding cases to existing
+/// instructions without also adding tests for those improved encodings. That
+/// is a dangerous game that leads to hard-to-track-down errors in the emitted
+/// code.
+///
+/// For all instructions, make sure to have test coverage for all of the
+/// following situations. Do this by creating the cross product resulting from
+/// applying the following rules to each operand:
+///
+/// (1) for any insn that mentions a register: one test using a register from
+/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
+/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
+/// This helps detect incorrect REX prefix construction.
+///
+/// (2) for any insn that mentions a byte register: one test for each of the
+/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
+/// [r8b .. r11b] and [r12b .. r15b]. This checks that
+/// apparently-redundant REX prefixes are retained when required.
+///
+/// (3) for any insn that contains an immediate field, check the following
+/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
+/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
+/// instructions that require a 32-bit immediate have a short-form encoding
+/// when the imm is in simm8 range.
+///
+/// Rules (1), (2) and (3) don't apply for registers within address expressions
+/// (`Addr`s). Those are already pretty well tested, and the registers in them
+/// don't have any effect on the containing instruction (apart from possibly
+/// requiring REX prefix bits).
+///
+/// When choosing registers for a test, avoid using registers with the same
+/// offset within a given group. For example, don't use rax and r8, since they
+/// both have the lowest 3 bits as 000, and so the test won't detect errors
+/// where those 3-bit register sub-fields are confused by the emitter. Instead
+/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
+/// and bpl since they have the same offset in their group; use instead (eg) cl
+/// and sil.
+///
+/// For all instructions, also add a test that uses only low-half registers
+/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
+/// prefixes are correctly omitted. This low-half restriction must apply to
+/// _all_ registers in the insn, even those in address expressions.
+///
+/// Following these rules creates large numbers of test cases, but it's the
+/// only way to make the emitter reliable.
+///
+/// Known possible improvements:
+///
+/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
+/// care?)
+pub(crate) fn emit(
+ inst: &Inst,
+ sink: &mut MachBuffer<Inst>,
+ info: &EmitInfo,
+ state: &mut EmitState,
+) {
+ if let Some(iset_requirement) = inst.isa_requirement() {
+ match iset_requirement {
+ // Cranelift assumes SSE2 at least.
+ InstructionSet::SSE | InstructionSet::SSE2 => {}
+ InstructionSet::SSSE3 => assert!(info.isa_flags.has_ssse3()),
+ InstructionSet::SSE41 => assert!(info.isa_flags.has_sse41()),
+ InstructionSet::SSE42 => assert!(info.isa_flags.has_sse42()),
+ }
+ }
+
+ match inst {
+ Inst::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst: reg_g,
+ } => {
+ let rex = if *is_64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+
+ if *op == AluRmiROpcode::Mul {
+ // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
+ // we have to special-case it.
+ match src {
+ RegMemImm::Reg { reg: reg_e } => {
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ 0x0FAF,
+ 2,
+ reg_g.to_reg(),
+ *reg_e,
+ rex,
+ );
+ }
+
+ RegMemImm::Mem { addr } => {
+ let amode = addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x0FAF,
+ 2,
+ reg_g.to_reg(),
+ &amode,
+ rex,
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+ let opcode = if use_imm8 { 0x6B } else { 0x69 };
+ // Yes, really, reg_g twice.
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 1,
+ reg_g.to_reg(),
+ reg_g.to_reg(),
+ rex,
+ );
+ emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32);
+ }
+ }
+ } else {
+ let (opcode_r, opcode_m, subopcode_i) = match op {
+ AluRmiROpcode::Add => (0x01, 0x03, 0),
+ AluRmiROpcode::Sub => (0x29, 0x2B, 5),
+ AluRmiROpcode::And => (0x21, 0x23, 4),
+ AluRmiROpcode::Or => (0x09, 0x0B, 1),
+ AluRmiROpcode::Xor => (0x31, 0x33, 6),
+ AluRmiROpcode::Mul => panic!("unreachable"),
+ };
+
+ match src {
+ RegMemImm::Reg { reg: reg_e } => {
+ // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
+ // duality). Do this too, so as to be able to compare generated machine
+ // code easily.
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcode_r,
+ 1,
+ *reg_e,
+ reg_g.to_reg(),
+ rex,
+ );
+ // NB: if this is ever extended to handle byte size ops, be sure to retain
+ // redundant REX prefixes.
+ }
+
+ RegMemImm::Mem { addr } => {
+ // Here we revert to the "normal" G-E ordering.
+ let amode = addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcode_m,
+ 1,
+ reg_g.to_reg(),
+ &amode,
+ rex,
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+ let opcode = if use_imm8 { 0x83 } else { 0x81 };
+ // And also here we use the "normal" G-E ordering.
+ let enc_g = int_reg_enc(reg_g.to_reg());
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 1,
+ subopcode_i,
+ enc_g,
+ rex,
+ );
+ emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32);
+ }
+ }
+ }
+ }
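+        // Example for the AluRmiR case above: a 64-bit `add` of rcx into rax takes the reg-reg
+        // path with the swapped encoding and emits 48 01 C8 (REX.W, opcode 0x01, ModRM with
+        // mod=11/reg=rcx/rm=rax).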
+
+ Inst::UnaryRmR { size, op, src, dst } => {
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!(),
+ };
+
+ let (opcode, num_opcodes) = match op {
+ UnaryRmROpcode::Bsr => (0x0fbd, 2),
+ UnaryRmROpcode::Bsf => (0x0fbc, 2),
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => emit_std_reg_reg(
+ sink,
+ prefix,
+ opcode,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ ),
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ num_opcodes,
+ dst.to_reg(),
+ &amode,
+ rex_flags,
+ );
+ }
+ }
+ }
+
+ Inst::Not { size, src } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let subopcode = 2;
+ let src = int_reg_enc(src.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+
+ Inst::Neg { size, src } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let subopcode = 3;
+ let src = int_reg_enc(src.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let loc = state.cur_srcloc();
+ sink.add_trap(loc, TrapCode::IntegerDivisionByZero);
+
+ let subopcode = if *signed { 7 } else { 6 };
+ match divisor {
+ RegMem::Reg { reg } => {
+ let src = int_reg_enc(*reg);
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_enc_mem(sink, state, prefix, opcode, 1, subopcode, &amode, rex_flags);
+ }
+ }
+ }
+
+ Inst::MulHi { size, signed, rhs } => {
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!(),
+ };
+
+ let subopcode = if *signed { 5 } else { 4 };
+ match rhs {
+ RegMem::Reg { reg } => {
+ let src = int_reg_enc(*reg);
+ emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
+ }
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_enc_mem(sink, state, prefix, 0xF7, 1, subopcode, &amode, rex_flags);
+ }
+ }
+ }
+
+        Inst::SignExtendData { size } => match size {
+            1 => {
+                // cbw: sign-extend %al into %ax.
+                sink.put1(0x66);
+                sink.put1(0x98);
+            }
+            2 => {
+                // cwd: sign-extend %ax into %dx:%ax.
+                sink.put1(0x66);
+                sink.put1(0x99);
+            }
+            // cdq: sign-extend %eax into %edx:%eax.
+            4 => sink.put1(0x99),
+            8 => {
+                // cqo: sign-extend %rax into %rdx:%rax.
+                sink.put1(0x48);
+                sink.put1(0x99);
+            }
+            _ => unreachable!(),
+        },
+
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ tmp,
+ } => {
+ // Generates the following code sequence:
+ //
+ // ;; check divide by zero:
+ // cmp 0 %divisor
+ // jnz $after_trap
+ // ud2
+ // $after_trap:
+ //
+ // ;; for signed modulo/div:
+ // cmp -1 %divisor
+ // jnz $do_op
+ // ;; for signed modulo, result is 0
+ // mov #0, %rdx
+ // j $done
+ // ;; for signed div, check for integer overflow against INT_MIN of the right size
+ // cmp INT_MIN, %rax
+ // jnz $do_op
+ // ud2
+ //
+ // $do_op:
+ // ;; if signed
+ // cdq ;; sign-extend from rax into rdx
+ // ;; else
+ // mov #0, %rdx
+ // idiv %divisor
+ //
+ // $done:
+ debug_assert!(info.flags().avoid_div_traps());
+
+ // Check if the divisor is zero, first.
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0), divisor.to_reg());
+ inst.emit(sink, info, state);
+
+ let inst = Inst::trap_if(CC::Z, TrapCode::IntegerDivisionByZero);
+ inst.emit(sink, info, state);
+
+ let (do_op, done_label) = if kind.is_signed() {
+ // Now check if the divisor is -1.
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0xffffffff), divisor.to_reg());
+ inst.emit(sink, info, state);
+
+ let do_op = sink.get_label();
+
+ // If not equal, jump to do-op.
+ one_way_jmp(sink, CC::NZ, do_op);
+
+ // Here, divisor == -1.
+ if !kind.is_div() {
+ // x % -1 = 0; put the result into the destination, $rdx.
+ let done_label = sink.get_label();
+
+ let inst = Inst::imm(
+ OperandSize::from_bytes(*size as u32),
+ 0,
+ Writable::from_reg(regs::rdx()),
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done_label);
+ inst.emit(sink, info, state);
+
+ (Some(do_op), Some(done_label))
+ } else {
+ // Check for integer overflow.
+ if *size == 8 {
+ let tmp = tmp.expect("temporary for i64 sdiv");
+
+ let inst = Inst::imm(OperandSize::Size64, 0x8000000000000000, tmp);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(8, RegMemImm::reg(tmp.to_reg()), regs::rax());
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0x80000000), regs::rax());
+ inst.emit(sink, info, state);
+ }
+
+ // If not equal, jump over the trap.
+ let inst = Inst::trap_if(CC::Z, TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+
+ (Some(do_op), None)
+ }
+ } else {
+ (None, None)
+ };
+
+ if let Some(do_op) = do_op {
+ sink.bind_label(do_op);
+ }
+
+ assert!(
+ *size > 1,
+ "CheckedDivOrRemSeq for i8 is not yet implemented"
+ );
+
+ // Fill in the high parts:
+ if kind.is_signed() {
+ // sign-extend the sign-bit of rax into rdx, for signed opcodes.
+ let inst = Inst::sign_extend_data(*size);
+ inst.emit(sink, info, state);
+ } else {
+ // zero for unsigned opcodes.
+ let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx()));
+ inst.emit(sink, info, state);
+ }
+
+ let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor.to_reg()));
+ inst.emit(sink, info, state);
+
+ // Lowering takes care of moving the result back into the right register, see comment
+ // there.
+
+ if let Some(done) = done_label {
+ sink.bind_label(done);
+ }
+ }
+
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ } => {
+ let enc_dst = int_reg_enc(dst.to_reg());
+ if *dst_is_64 {
+ if low32_will_sign_extend_to_64(*simm64) {
+ // Sign-extended move imm32.
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xC7,
+ 1,
+ /* subopcode */ 0,
+ enc_dst,
+ RexFlags::set_w(),
+ );
+ sink.put4(*simm64 as u32);
+ } else {
+ sink.put1(0x48 | ((enc_dst >> 3) & 1));
+ sink.put1(0xB8 | (enc_dst & 7));
+ sink.put8(*simm64);
+ }
+ } else {
+ if ((enc_dst >> 3) & 1) == 1 {
+ sink.put1(0x41);
+ }
+ sink.put1(0xB8 | (enc_dst & 7));
+ sink.put4(*simm64 as u32);
+ }
+ }
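+        // Example for the Imm case above: a 64-bit move of the constant 1 into rax takes the
+        // sign-extended imm32 form and emits 48 C7 C0 01 00 00 00, whereas a constant that does
+        // not fit in a sign-extended 32 bits falls back to the movabs form, 48 B8 followed by the
+        // full 8-byte immediate.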
+
+ Inst::MovRR { is_64, src, dst } => {
+ let rex = if *is_64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+ emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex);
+ }
+
+ Inst::MovzxRmR { ext_mode, src, dst } => {
+ let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
+ ExtMode::BL => {
+ // MOVZBL is (REX.W==0) 0F B6 /r
+ (0x0FB6, 2, RexFlags::clear_w())
+ }
+ ExtMode::BQ => {
+ // MOVZBQ is (REX.W==1) 0F B6 /r
+ // I'm not sure why the Intel manual offers different
+ // encodings for MOVZBQ than for MOVZBL. AIUI they should
+ // achieve the same, since MOVZBL is just going to zero out
+ // the upper half of the destination anyway.
+ (0x0FB6, 2, RexFlags::set_w())
+ }
+ ExtMode::WL => {
+ // MOVZWL is (REX.W==0) 0F B7 /r
+ (0x0FB7, 2, RexFlags::clear_w())
+ }
+ ExtMode::WQ => {
+ // MOVZWQ is (REX.W==1) 0F B7 /r
+ (0x0FB7, 2, RexFlags::set_w())
+ }
+ ExtMode::LQ => {
+ // This is just a standard 32 bit load, and we rely on the
+ // default zero-extension rule to perform the extension.
+ // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
+ // don't do here, since it's the same encoding size.
+ // MOV r/m32, r32 is (REX.W==0) 8B /r
+ (0x8B, 1, RexFlags::clear_w())
+ }
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => {
+ match ext_mode {
+ ExtMode::BL | ExtMode::BQ => {
+ // A redundant REX prefix must be emitted for certain register inputs.
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ }
+ _ => {}
+ }
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ )
+ }
+
+ RegMem::Mem { addr: src } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ src,
+ rex_flags,
+ )
+ }
+ }
+ }
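+        // Example for the MovzxRmR case above: `movzbl %sil, %eax` needs the apparently redundant
+        // REX byte forced by always_emit(), giving 40 0F B6 C6; without that prefix the same
+        // bytes 0F B6 C6 would instead encode `movzbl %dh, %eax`.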
+
+ Inst::Mov64MR { src, dst } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x8B,
+ 1,
+ dst.to_reg(),
+ src,
+ RexFlags::set_w(),
+ )
+ }
+
+ Inst::LoadEffectiveAddress { addr, dst } => {
+ let amode = addr.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x8D,
+ 1,
+ dst.to_reg(),
+ &amode,
+ RexFlags::set_w(),
+ );
+ }
+
+ Inst::MovsxRmR { ext_mode, src, dst } => {
+ let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
+ ExtMode::BL => {
+ // MOVSBL is (REX.W==0) 0F BE /r
+ (0x0FBE, 2, RexFlags::clear_w())
+ }
+ ExtMode::BQ => {
+ // MOVSBQ is (REX.W==1) 0F BE /r
+ (0x0FBE, 2, RexFlags::set_w())
+ }
+ ExtMode::WL => {
+ // MOVSWL is (REX.W==0) 0F BF /r
+ (0x0FBF, 2, RexFlags::clear_w())
+ }
+ ExtMode::WQ => {
+ // MOVSWQ is (REX.W==1) 0F BF /r
+ (0x0FBF, 2, RexFlags::set_w())
+ }
+ ExtMode::LQ => {
+ // MOVSLQ is (REX.W==1) 63 /r
+ (0x63, 1, RexFlags::set_w())
+ }
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => {
+ match ext_mode {
+ ExtMode::BL | ExtMode::BQ => {
+ // A redundant REX prefix must be emitted for certain register inputs.
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ }
+ _ => {}
+ }
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ )
+ }
+
+ RegMem::Mem { addr: src } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ src,
+ rex_flags,
+ )
+ }
+ }
+ }
+
+ Inst::MovRM { size, src, dst } => {
+ let dst = &dst.finalize(state);
+
+ match size {
+ 1 => {
+ // This is one of the few places where the presence of a
+ // redundant REX prefix changes the meaning of the
+ // instruction.
+ let mut rex = RexFlags::clear_w();
+
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex.always_emit();
+ };
+
+ // MOV r8, r/m8 is (REX.W==0) 88 /r
+ emit_std_reg_mem(sink, state, LegacyPrefixes::None, 0x88, 1, *src, dst, rex)
+ }
+
+ 2 => {
+ // MOV r16, r/m16 is 66 (REX.W==0) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::_66,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ )
+ }
+
+ 4 => {
+ // MOV r32, r/m32 is (REX.W==0) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ )
+ }
+
+ 8 => {
+ // MOV r64, r/m64 is (REX.W==1) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::set_w(),
+ )
+ }
+
+ _ => panic!("x64::Inst::Mov_R_M::emit: unreachable"),
+ }
+ }
+
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ } => {
+ let enc_dst = int_reg_enc(dst.to_reg());
+ let subopcode = match kind {
+ ShiftKind::RotateLeft => 0,
+ ShiftKind::RotateRight => 1,
+ ShiftKind::ShiftLeft => 4,
+ ShiftKind::ShiftRightLogical => 5,
+ ShiftKind::ShiftRightArithmetic => 7,
+ };
+
+ match num_bits {
+ None => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode
+ // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode
+ // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
+ // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
+ }
+
+ Some(num_bits) => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode
+ // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode
+ // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
+ // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
+ // When the shift amount is 1, there's an even shorter encoding, but we don't
+ // bother with that nicety here.
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
+ sink.put1(*num_bits);
+ }
+ }
+ }
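+        // Example for the ShiftR case above: `shl rax, 3` uses the immediate form and emits
+        // 48 C1 E0 03 (REX.W, opcode 0xC1, ModRM with mod=11/reg=4 (shl)/rm=rax, then the 8-bit
+        // shift amount), while the variable-count form (shift by %cl) uses 0xD3 and has no
+        // trailing immediate.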
+
+ Inst::XmmRmiReg { opcode, src, dst } => {
+ let rex = RexFlags::clear_w();
+ let prefix = LegacyPrefixes::_66;
+ if let RegMemImm::Imm { simm32 } = src {
+ let (opcode_bytes, reg_digit) = match opcode {
+ SseOpcode::Psllw => (0x0F71, 6),
+ SseOpcode::Pslld => (0x0F72, 6),
+ SseOpcode::Psllq => (0x0F73, 6),
+ SseOpcode::Psraw => (0x0F71, 4),
+ SseOpcode::Psrad => (0x0F72, 4),
+ SseOpcode::Psrlw => (0x0F71, 2),
+ SseOpcode::Psrld => (0x0F72, 2),
+ SseOpcode::Psrlq => (0x0F73, 2),
+ _ => panic!("invalid opcode: {}", opcode),
+ };
+ let dst_enc = reg_enc(dst.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex);
+ let imm = (*simm32)
+ .try_into()
+ .expect("the immediate must be convertible to a u8");
+ sink.put1(imm);
+ } else {
+ let opcode_bytes = match opcode {
+ SseOpcode::Psllw => 0x0FF1,
+ SseOpcode::Pslld => 0x0FF2,
+ SseOpcode::Psllq => 0x0FF3,
+ SseOpcode::Psraw => 0x0FE1,
+ SseOpcode::Psrad => 0x0FE2,
+ SseOpcode::Psrlw => 0x0FD1,
+ SseOpcode::Psrld => 0x0FD2,
+ SseOpcode::Psrlq => 0x0FD3,
+ _ => panic!("invalid opcode: {}", opcode),
+ };
+
+ match src {
+ RegMemImm::Reg { reg } => {
+ emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst.to_reg(), *reg, rex);
+ }
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode_bytes,
+ 2,
+ dst.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ RegMemImm::Imm { .. } => unreachable!(),
+ }
+ };
+ }
+
+ Inst::CmpRmiR {
+ size,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let mut prefix = LegacyPrefixes::None;
+ if *size == 2 {
+ prefix = LegacyPrefixes::_66;
+ }
+
+ let mut rex = match size {
+ 8 => RexFlags::set_w(),
+ 4 | 2 => RexFlags::clear_w(),
+ 1 => {
+ let mut rex = RexFlags::clear_w();
+ // Here, a redundant REX prefix changes the meaning of the instruction.
+ let enc_g = int_reg_enc(*reg_g);
+ if enc_g >= 4 && enc_g <= 7 {
+ rex.always_emit();
+ }
+ rex
+ }
+ _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"),
+ };
+
+ match src_e {
+ RegMemImm::Reg { reg: reg_e } => {
+ if *size == 1 {
+ // Check whether the E register forces the use of a redundant REX.
+ let enc_e = int_reg_enc(*reg_e);
+ if enc_e >= 4 && enc_e <= 7 {
+ rex.always_emit();
+ }
+ }
+
+ // Use the swapped operands encoding, to stay consistent with the output of
+ // gcc/llvm.
+ let opcode = if *size == 1 { 0x38 } else { 0x39 };
+ emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex);
+ }
+
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ // Whereas here we revert to the "normal" G-E ordering.
+ let opcode = if *size == 1 { 0x3A } else { 0x3B };
+ emit_std_reg_mem(sink, state, prefix, opcode, 1, *reg_g, addr, rex);
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ // FIXME JRS 2020Feb11: there are shorter encodings for
+ // cmp $imm, rax/eax/ax/al.
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+
+ // And also here we use the "normal" G-E ordering.
+ let opcode = if *size == 1 {
+ 0x80
+ } else if use_imm8 {
+ 0x83
+ } else {
+ 0x81
+ };
+
+ let enc_g = int_reg_enc(*reg_g);
+ emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex);
+ emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32);
+ }
+ }
+ }
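+        // Example for the CmpRmiR case above: a 64-bit compare of rax against rcx in the reg-reg
+        // form uses the swapped encoding and emits 48 39 C8 (REX.W, opcode 0x39, ModRM with
+        // mod=11/reg=rcx/rm=rax), matching gcc/llvm output for `cmp %rcx, %rax`.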
+
+ Inst::Setcc { cc, dst } => {
+ let opcode = 0x0f90 + cc.get_enc() as u32;
+ let mut rex_flags = RexFlags::clear_w();
+ rex_flags.always_emit();
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 2,
+ 0,
+ reg_enc(dst.to_reg()),
+ rex_flags,
+ );
+ }
+
+ Inst::Cmove {
+ size,
+ cc,
+ src,
+ dst: reg_g,
+ } => {
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("invalid size spec for cmove"),
+ };
+ let opcode = 0x0F40 + cc.get_enc() as u32;
+ match src {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex_flags);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ 2,
+ reg_g.to_reg(),
+ addr,
+ rex_flags,
+ );
+ }
+ }
+ }
+
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ } => {
+            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
+            // this doesn't clobber flags. Make sure not to clobber them here.
+ let next = sink.get_label();
+
+ // Jump if cc is *not* set.
+ one_way_jmp(sink, cc.invert(), next);
+
+ let op = if *is_64 {
+ SseOpcode::Movsd
+ } else {
+ SseOpcode::Movss
+ };
+ let inst = Inst::xmm_unary_rm_r(op, src.clone(), *dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(next);
+ }
+
+ Inst::Push64 { src } => {
+ match src {
+ RegMemImm::Reg { reg } => {
+ let enc_reg = int_reg_enc(*reg);
+ let rex = 0x40 | ((enc_reg >> 3) & 1);
+ if rex != 0x40 {
+ sink.put1(rex);
+ }
+ sink.put1(0x50 | (enc_reg & 7));
+ }
+
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 6, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ if low8_will_sign_extend_to_64(*simm32) {
+ sink.put1(0x6A);
+ sink.put1(*simm32 as u8);
+ } else {
+ sink.put1(0x68);
+ sink.put4(*simm32);
+ }
+ }
+ }
+ }
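+        // Example for the Push64 case above: `push %rbx` is the single byte 53, `push %r12`
+        // needs the REX.B prefix and is 41 54, a small immediate such as 5 is 6A 05, and a larger
+        // immediate such as 0x12345 is 68 45 23 01 00.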
+
+ Inst::Pop64 { dst } => {
+ let enc_dst = int_reg_enc(dst.to_reg());
+ if enc_dst >= 8 {
+ // 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant here.
+ sink.put1(0x41);
+ }
+ sink.put1(0x58 + (enc_dst & 7));
+ }
+
+ Inst::CallKnown { dest, opcode, .. } => {
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
+ }
+ sink.put1(0xE8);
+ // The addend adjusts for the difference between the end of the instruction and the
+ // beginning of the immediate field.
+ emit_reloc(sink, state, Reloc::X86CallPCRel4, &dest, -4);
+ sink.put4(0);
+ if opcode.is_call() {
+ let loc = state.cur_srcloc();
+ sink.add_call_site(loc, *opcode);
+ }
+ }
+
+ Inst::CallUnknown { dest, opcode, .. } => {
+ let start_offset = sink.cur_offset();
+ match dest {
+ RegMem::Reg { reg } => {
+ let reg_enc = int_reg_enc(*reg);
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 2, /*subopcode*/
+ reg_enc,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 2, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+ }
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s);
+ }
+ if opcode.is_call() {
+ let loc = state.cur_srcloc();
+ sink.add_call_site(loc, *opcode);
+ }
+ }
+
+ Inst::Ret {} => sink.put1(0xC3),
+
+ Inst::JmpKnown { dst } => {
+ let br_start = sink.cur_offset();
+ let br_disp_off = br_start + 1;
+ let br_end = br_start + 5;
+
+ sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32);
+ sink.add_uncond_branch(br_start, br_end, *dst);
+
+ sink.put1(0xE9);
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpIf { cc, taken } => {
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+
+ sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
+ // Since this is not a terminator, don't enroll in the branch inversion mechanism.
+
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ } => {
+ // If taken.
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+ let cond_end = cond_start + 6;
+
+ sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
+ let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
+ sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]);
+
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ // Placeholder for the label value.
+ sink.put4(0x0);
+
+ // If not taken.
+ let uncond_start = sink.cur_offset();
+ let uncond_disp_off = uncond_start + 1;
+ let uncond_end = uncond_start + 5;
+
+ sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
+ sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
+
+ sink.put1(0xE9);
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpUnknown { target } => {
+ match target {
+ RegMem::Reg { reg } => {
+ let reg_enc = int_reg_enc(*reg);
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 4, /*subopcode*/
+ reg_enc,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 4, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+ }
+ }
+
+ Inst::JmpTableSeq {
+ idx,
+ tmp1,
+ tmp2,
+ ref targets,
+ default_target,
+ ..
+ } => {
+ // This sequence is *one* instruction in the vcode, and is expanded only here at
+ // emission time, because we cannot allow the regalloc to insert spills/reloads in
+ // the middle; we depend on hardcoded PC-rel addressing below.
+ //
+ // We don't have to worry about emitting islands, because the only label-use type has a
+ // maximum range of 2 GB. If we later consider using shorter-range label references,
+ // this will need to be revisited.
+
+            // Save the index in a tmp (the live range of idx only goes to the start of this
+            // sequence; tmp1 or tmp2 may overwrite it).
+
+ // We generate the following sequence:
+ // ;; generated by lowering: cmp #jmp_table_size, %idx
+ // jnb $default_target
+ // movl %idx, %tmp2
+ // lea start_of_jump_table_offset(%rip), %tmp1
+ // movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
+ // addq %tmp2, %tmp1
+ // j *%tmp1
+ // $start_of_jump_table:
+ // -- jump table entries
+ one_way_jmp(sink, CC::NB, *default_target); // idx unsigned >= jmp table size
+
+ // Copy the index (and make sure to clear the high 32-bits lane of tmp2).
+ let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(*idx), *tmp2);
+ inst.emit(sink, info, state);
+
+ // Load base address of jump table.
+ let start_of_jumptable = sink.get_label();
+ let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), *tmp1);
+ inst.emit(sink, info, state);
+
+ // Load value out of the jump table. It's a relative offset to the target block, so it
+ // might be negative; use a sign-extension.
+ let inst = Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg_reg_shift(0, tmp1.to_reg(), tmp2.to_reg(), 2)),
+ *tmp2,
+ );
+ inst.emit(sink, info, state);
+
+ // Add base of jump table to jump-table-sourced block offset.
+ let inst = Inst::alu_rmi_r(
+ true, /* is_64 */
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ *tmp1,
+ );
+ inst.emit(sink, info, state);
+
+ // Branch to computed address.
+ let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg()));
+ inst.emit(sink, info, state);
+
+ // Emit jump table (table of 32-bit offsets).
+ sink.bind_label(start_of_jumptable);
+ let jt_off = sink.cur_offset();
+ for &target in targets.iter() {
+ let word_off = sink.cur_offset();
+ // off_into_table is an addend here embedded in the label to be later patched at
+ // the end of codegen. The offset is initially relative to this jump table entry;
+ // with the extra addend, it'll be relative to the jump table's start, after
+ // patching.
+ let off_into_table = word_off - jt_off;
+ sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
+ sink.put4(off_into_table);
+ }
+ }
+
+ Inst::TrapIf { cc, trap_code } => {
+ let else_label = sink.get_label();
+
+ // Jump over if the invert of CC is set (i.e. CC is not set).
+ one_way_jmp(sink, cc.invert(), else_label);
+
+ // Trap!
+ let inst = Inst::trap(*trap_code);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(else_label);
+ }
+
+ Inst::XmmUnaryRmR {
+ op,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let rex = RexFlags::clear_w();
+
+ let (prefix, opcode, num_opcodes) = match op {
+ SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
+ SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
+ SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
+ SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2),
+ SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2),
+ SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
+ SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
+ SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2),
+ SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2),
+ SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3),
+ SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3),
+ SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3),
+ SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2),
+ SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
+ SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
+ SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(
+ sink,
+ prefix,
+ opcode,
+ num_opcodes,
+ reg_g.to_reg(),
+ *reg_e,
+ rex,
+ );
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ num_opcodes,
+ reg_g.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ };
+ }
+
+ Inst::XmmRmR {
+ op,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let rex = RexFlags::clear_w();
+ let (prefix, opcode, length) = match op {
+ SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2),
+ SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2),
+ SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2),
+ SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2),
+ SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2),
+ SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
+ SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
+ SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
+ SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
+ SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
+ SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
+ SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
+ SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
+ SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
+ SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
+ SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
+ SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
+ SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
+ SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
+ SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
+ SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
+ SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
+ SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
+ SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
+ SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
+ SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
+ SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
+ SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
+ SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
+ SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
+ SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
+ SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
+ SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
+ SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2),
+ SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2),
+ SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
+ SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
+ SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
+ SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
+ SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
+ SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
+ SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
+ SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
+ SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
+ SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
+ SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3),
+ SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2),
+ SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
+ SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
+ SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
+ SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
+ SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
+ SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
+ SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2),
+ SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3),
+ SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3),
+ SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3),
+ SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2),
+ SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3),
+ SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
+ SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
+ SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
+ SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
+ SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
+ SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
+ SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2),
+ SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
+ SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
+ SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
+ SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
+ SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2),
+ SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2),
+ SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2),
+ SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
+ SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
+ SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
+ SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
+ SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
+ SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
+ SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
+ SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
+ SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ length,
+ reg_g.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ }
+ }
+
+ Inst::XmmMinMaxSeq {
+ size,
+ is_min,
+ lhs,
+ rhs_dst,
+ } => {
+ // Generates the following sequence:
+ // cmpss/cmpsd %lhs, %rhs_dst
+ // jnz do_min_max
+ // jp propagate_nan
+ //
+ // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
+ // {and,or}{ss,sd} %lhs, %rhs_dst
+ // j done
+ //
+ // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
+ // ;; NaN value is returned), we add both inputs.
+ // propagate_nan:
+ // add{ss,sd} %lhs, %rhs_dst
+ // j done
+ //
+ // do_min_max:
+ // {min,max}{ss,sd} %lhs, %rhs_dst
+ //
+ // done:
+ let done = sink.get_label();
+ let propagate_nan = sink.get_label();
+ let do_min_max = sink.get_label();
+
+ let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
+ OperandSize::Size32 => (
+ SseOpcode::Addss,
+ SseOpcode::Ucomiss,
+ SseOpcode::Andps,
+ SseOpcode::Orps,
+ if *is_min {
+ SseOpcode::Minss
+ } else {
+ SseOpcode::Maxss
+ },
+ ),
+ OperandSize::Size64 => (
+ SseOpcode::Addsd,
+ SseOpcode::Ucomisd,
+ SseOpcode::Andpd,
+ SseOpcode::Orpd,
+ if *is_min {
+ SseOpcode::Minsd
+ } else {
+ SseOpcode::Maxsd
+ },
+ ),
+ };
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(*lhs), rhs_dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NZ, do_min_max);
+ one_way_jmp(sink, CC::P, propagate_nan);
+
+ // Ordered and equal. The operands are bit-identical unless they are zero
+ // and negative zero. These instructions merge the sign bits in that
+ // case, and are no-ops otherwise.
+ let op = if *is_min { or_op } else { and_op };
+ let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ // x86's min/max are not symmetric; if either operand is a NaN, they return the
+ // read-only operand: perform an addition between the two operands, which has the
+ // desired NaN propagation effects.
+ sink.bind_label(propagate_nan);
+ let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::P, done);
+
+ sink.bind_label(do_min_max);
+ let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(done);
+ }
+
+ Inst::XmmRmRImm {
+ op,
+ src,
+ dst,
+ imm,
+ is64,
+ } => {
+ let (prefix, opcode, len) = match op {
+ SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
+ SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
+ SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
+ SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
+ SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
+ SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
+ SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
+ SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
+ SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3),
+ SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2),
+ SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3),
+ SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+ let rex = if *is64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+ let regs_swapped = match *op {
+ // These opcodes (and not the SSE2 version of PEXTRW) flip the operand
+ // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field.
+ SseOpcode::Pextrb | SseOpcode::Pextrd => true,
+ // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg,
+ // `src` in ModRM's r/m field.
+ _ => false,
+ };
+ match src {
+ RegMem::Reg { reg } => {
+ if regs_swapped {
+ emit_std_reg_reg(sink, prefix, opcode, len, *reg, dst.to_reg(), rex);
+ } else {
+ emit_std_reg_reg(sink, prefix, opcode, len, dst.to_reg(), *reg, rex);
+ }
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ assert!(
+ !regs_swapped,
+ "No existing way to encode a mem argument in the ModRM r/m field."
+ );
+ emit_std_reg_mem(sink, state, prefix, opcode, len, dst.to_reg(), addr, rex);
+ }
+ }
+ sink.put1(*imm);
+ }
+
+ Inst::XmmLoadConst { src, dst, ty } => {
+ let load_offset = Amode::rip_relative(sink.get_label_for_constant(*src));
+ let load = Inst::load(*ty, load_offset, *dst, ExtKind::None);
+ load.emit(sink, info, state);
+ }
+
+ Inst::XmmUninitializedValue { .. } => {
+ // This instruction format only exists to declare a register as a `def`; no code is
+ // emitted.
+ }
+
+ Inst::XmmMovRM { op, src, dst } => {
+ let (prefix, opcode) = match op {
+ SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
+ SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
+ SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F),
+ SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
+ SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
+ SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11),
+ SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+ let dst = &dst.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ 2,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ );
+ }
+
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ } => {
+ let (prefix, opcode, dst_first) = match op {
+ SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
+ SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
+ // Movd and movq use the same opcode; the presence of the REX prefix (set below)
+ // actually determines which is used.
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
+ SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
+ SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
+ SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
+ _ => panic!("unexpected opcode {:?}", op),
+ };
+ let rex = match dst_size {
+ OperandSize::Size32 => RexFlags::clear_w(),
+ OperandSize::Size64 => RexFlags::set_w(),
+ };
+
+ let (src, dst) = if dst_first {
+ (dst.to_reg(), *src)
+ } else {
+ (*src, dst.to_reg())
+ };
+
+ emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex);
+ }
+
+ Inst::GprToXmm {
+ op,
+ src: src_e,
+ dst: reg_g,
+ src_size,
+ } => {
+ let (prefix, opcode) = match op {
+ // Movd and movq use the same opcode; the presence of the REX prefix (set below)
+ // actually determines which is used.
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
+ SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
+ SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
+ _ => panic!("unexpected opcode {:?}", op),
+ };
+ let rex = match *src_size {
+ OperandSize::Size32 => RexFlags::clear_w(),
+ OperandSize::Size64 => RexFlags::set_w(),
+ };
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcode, 2, reg_g.to_reg(), addr, rex);
+ }
+ }
+ }
+
+ Inst::XmmCmpRmR { op, src, dst } => {
+ let rex = RexFlags::clear_w();
+ let (prefix, opcode, len) = match op {
+ SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
+ SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
+ SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
+ _ => unimplemented!("Emit xmm cmp rm r"),
+ };
+
+ match src {
+ RegMem::Reg { reg } => {
+ emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcode, len, *dst, addr, rex);
+ }
+ }
+ }
+
+ Inst::CvtUint64ToFloatSeq {
+ to_f64,
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ } => {
+ // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
+ // different sequence.
+ //
+ // Emit the following sequence:
+ //
+ // cmp 0, %src
+ // jl handle_negative
+ //
+ // ;; handle positive, which can't overflow
+ // cvtsi2sd/cvtsi2ss %src, %dst
+ // j done
+ //
+ // ;; handle negative: see below for an explanation of what it's doing.
+ // handle_negative:
+ // mov %src, %tmp_gpr1
+ // shr $1, %tmp_gpr1
+ // mov %src, %tmp_gpr2
+ // and $1, %tmp_gpr2
+ // or %tmp_gpr1, %tmp_gpr2
+ // cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
+ // addsd/addss %dst, %dst
+ //
+ // done:
+
+ assert_ne!(src, tmp_gpr1);
+ assert_ne!(src, tmp_gpr2);
+ assert_ne!(tmp_gpr1, tmp_gpr2);
+
+ let handle_negative = sink.get_label();
+ let done = sink.get_label();
+
+ // If x seen as a signed int64 is not negative, a signed-conversion will do the right
+ // thing.
+            // TODO: use `test src, src` here.
+ let inst = Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::L, handle_negative);
+
+ // Handle a positive int64, which is the "easy" case: a signed conversion will do the
+ // right thing.
+ emit_signed_cvt(sink, info, state, src.to_reg(), *dst, *to_f64);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(handle_negative);
+
+ // Divide x by two to get it in range for the signed conversion, keep the LSB, and
+ // scale it back up on the FP side.
+ let inst = Inst::gen_move(*tmp_gpr1, src.to_reg(), types::I64);
+ inst.emit(sink, info, state);
+
+ // tmp_gpr1 := src >> 1
+ let inst = Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(1), *tmp_gpr1);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::gen_move(*tmp_gpr2, src.to_reg(), types::I64);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true, /* 64bits */
+ AluRmiROpcode::And,
+ RegMemImm::imm(1),
+ *tmp_gpr2,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true, /* 64bits */
+ AluRmiROpcode::Or,
+ RegMemImm::reg(tmp_gpr1.to_reg()),
+ *tmp_gpr2,
+ );
+ inst.emit(sink, info, state);
+
+ emit_signed_cvt(sink, info, state, tmp_gpr2.to_reg(), *dst, *to_f64);
+
+ let add_op = if *to_f64 {
+ SseOpcode::Addsd
+ } else {
+ SseOpcode::Addss
+ };
+ let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(done);
+ }
+
+ Inst::CvtFloatToSintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ } => {
+ // Emits the following common sequence:
+ //
+ // cvttss2si/cvttsd2si %src, %dst
+            //   cmp 1, %dst
+ // jno done
+ //
+ // Then, for saturating conversions:
+ //
+            //   ;; check for NaN
+            //   ucomiss/ucomisd %src, %src
+            //   jnp not_nan
+            //   xor %dst, %dst
+            //   j done
+            //
+            //   ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
+            //   ;; already in %dst.
+            //   not_nan:
+            //   xorpd %tmp_xmm, %tmp_xmm
+            //   ucomiss/ucomisd %src, %tmp_xmm
+            //   jnb done
+            //   mov $INT_MAX, %dst
+            //
+            //   done:
+ //
+ // Then, for non-saturating conversions:
+ //
+ // ;; check for NaN
+            //   ucomiss/ucomisd %src, %src
+ // jnp not_nan
+ // ud2 trap BadConversionToInteger
+ //
+ // ;; check if INT_MIN was the correct result, against a magic constant:
+ // not_nan:
+            //   mov $magic, %tmp_gpr
+ // movq/movd %tmp_gpr, %tmp_xmm
+            //   ucomiss/ucomisd %tmp_xmm, %src
+            //   jnb/jnbe check_positive
+ // ud2 trap IntegerOverflow
+ //
+ // ;; if positive, it was a real overflow
+ // check_positive:
+ // xorpd %tmp_xmm, %tmp_xmm
+            //   ucomiss/ucomisd %src, %tmp_xmm
+ // jnb done
+ // ud2 trap IntegerOverflow
+ //
+ // done:
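+            //
+            // The initial `cmp 1, %dst` / `jno` pair works because cvttss2si/cvttsd2si write
+            // INT_MIN (the x86 "integer indefinite" value for the destination width) whenever
+            // the conversion is invalid (overflow or NaN), and subtracting 1 from %dst
+            // overflows the signed range only when %dst is INT_MIN; so OF clear means the
+            // truncation definitely succeeded.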
+
+ let src = src.to_reg();
+
+ let (cast_op, cmp_op, trunc_op) = match src_size {
+ OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si),
+ OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si),
+ };
+
+ let done = sink.get_label();
+ let not_nan = sink.get_label();
+
+ // The truncation.
+ let inst = Inst::xmm_to_gpr(trunc_op, src, *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+            // Compare against 1: in case of overflow, the dst operand holds INT_MIN.
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(1), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NO, done); // no overflow => done
+
+ // Check for NaN.
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src);
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
+
+ if *is_saturating {
+ // For NaN, emit 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(not_nan);
+
+ // If the input was positive, saturate to INT_MAX.
+
+ // Zero out tmp_xmm.
+ let inst =
+ Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
+ inst.emit(sink, info, state);
+
+ // Jump if >= to done.
+ one_way_jmp(sink, CC::NB, done);
+
+ // Otherwise, put INT_MAX.
+ if *dst_size == OperandSize::Size64 {
+ let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, *dst);
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, *dst);
+ inst.emit(sink, info, state);
+ }
+ } else {
+ let check_positive = sink.get_label();
+
+ let inst = Inst::trap(TrapCode::BadConversionToInteger);
+ inst.emit(sink, info, state);
+
+ // Check if INT_MIN was the correct result: determine the smallest floating point
+ // number that would convert to INT_MIN, put it in a temporary register, and compare
+ // against the src register.
+                // If the src register is less than (or, in some cases, less than or equal to)
+                // the threshold, trap!
+
+ sink.bind_label(not_nan);
+
+ let mut no_overflow_cc = CC::NB; // >=
+ let output_bits = dst_size.to_bits();
+ match *src_size {
+ OperandSize::Size32 => {
+ let cst = Ieee32::pow2(output_bits - 1).neg().bits();
+ let inst = Inst::imm(OperandSize::Size32, cst as u64, *tmp_gpr);
+ inst.emit(sink, info, state);
+ }
+ OperandSize::Size64 => {
+ // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
+ // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
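+                        // For example, with a 32-bit destination, an f64 input of
+                        // -2147483648.5 truncates to i32::MIN and must not trap, even though
+                        // it is below -2^31; hence the strict (>) comparison against the
+                        // threshold in that case.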
+ let cst = if output_bits < 64 {
+ no_overflow_cc = CC::NBE; // >
+ Ieee64::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee64::pow2(output_bits - 1).neg()
+ };
+ let inst = Inst::imm(OperandSize::Size64, cst.bits(), *tmp_gpr);
+ inst.emit(sink, info, state);
+ }
+ }
+
+ let inst =
+ Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src);
+ inst.emit(sink, info, state);
+
+ // jump over trap if src >= or > threshold
+ one_way_jmp(sink, no_overflow_cc, check_positive);
+
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+
+ // If positive, it was a real overflow.
+
+ sink.bind_label(check_positive);
+
+ // Zero out the tmp_xmm register.
+ let inst =
+ Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src
+
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(done);
+ }
+
+ Inst::CvtFloatToUintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ } => {
+ // The only difference in behavior between saturating and non-saturating is how we
+ // handle errors. Emits the following sequence:
+ //
+            // mov 2**(int_width - 1), %tmp_gpr
+ // movq/movd %tmp_gpr, %tmp_xmm
+            // ucomiss/ucomisd %tmp_xmm, %src
+ // jnb is_large
+ //
+ // ;; check for NaN inputs
+ // jnp not_nan
+ // -- non-saturating: ud2 trap BadConversionToInteger
+ // -- saturating: xor %dst, %dst; j done
+ //
+ // not_nan:
+ // cvttss2si/cvttsd2si %src, %dst
+ // cmp 0, %dst
+ // jnl done
+ // -- non-saturating: ud2 trap IntegerOverflow
+ // -- saturating: xor %dst, %dst; j done
+ //
+ // is_large:
+ // subss/subsd %tmp_xmm, %src ; <-- we clobber %src here
+            // cvttss2si/cvttsd2si %src, %dst
+ // cmp 0, %dst
+ // jnl next_is_large
+ // -- non-saturating: ud2 trap IntegerOverflow
+            // -- saturating: mov $UINT_MAX, %dst; j done
+ //
+ // next_is_large:
+            // add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
+ //
+ // done:
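+            //
+            // The is_large path relies on the fact that, for inputs in [2**(w-1), 2**w), the
+            // floating-point subtraction of 2**(w-1) is exact, so the signed truncation of the
+            // reduced value plus 2**(w-1) recovers the unsigned result. A scalar sketch, for
+            // illustration only (w = 64, f64 input x >= 2**63):
+            //
+            //     let dst = ((x - 9223372036854775808.0) as i64 as u64).wrapping_add(1 << 63);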
+
+ assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!");
+
+ let (sub_op, cast_op, cmp_op, trunc_op) = if *src_size == OperandSize::Size64 {
+ (
+ SseOpcode::Subsd,
+ SseOpcode::Movq,
+ SseOpcode::Ucomisd,
+ SseOpcode::Cvttsd2si,
+ )
+ } else {
+ (
+ SseOpcode::Subss,
+ SseOpcode::Movd,
+ SseOpcode::Ucomiss,
+ SseOpcode::Cvttss2si,
+ )
+ };
+
+ let done = sink.get_label();
+
+ let cst = if *src_size == OperandSize::Size64 {
+ Ieee64::pow2(dst_size.to_bits() - 1).bits()
+ } else {
+ Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64
+ };
+
+ let inst = Inst::imm(*src_size, cst, *tmp_gpr);
+ inst.emit(sink, info, state);
+
+ let inst =
+ Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src.to_reg());
+ inst.emit(sink, info, state);
+
+ let handle_large = sink.get_label();
+ one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
+
+ let not_nan = sink.get_label();
+ one_way_jmp(sink, CC::NP, not_nan); // jump over trap if not NaN
+
+ if *is_saturating {
+ // Emit 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ // Trap.
+ let inst = Inst::trap(TrapCode::BadConversionToInteger);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(not_nan);
+
+            // Actual truncation for small inputs: if the result is negative, the truncation
+            // overflowed.
+
+ let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
+
+ if *is_saturating {
+                // The input was "small" (< 2**(width - 1)), so the only way to get an integer
+                // overflow is that the input was negative: saturate to the min value, i.e. 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ // Trap.
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ // Now handle large inputs.
+
+ sink.bind_label(handle_large);
+
+ let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ let next_is_large = sink.get_label();
+ one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
+
+ if *is_saturating {
+                // The input was "large" (>= 2**(width - 1)), so the only way to get an integer
+                // overflow is that the input was too large: saturate to the max value.
+ let inst = Inst::imm(
+ OperandSize::Size64,
+ if *dst_size == OperandSize::Size64 {
+ u64::max_value()
+ } else {
+ u32::max_value() as u64
+ },
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(next_is_large);
+
+ if *dst_size == OperandSize::Size64 {
+ let inst = Inst::imm(OperandSize::Size64, 1 << 63, *tmp_gpr);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp_gpr.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+ } else {
+ let inst =
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(1 << 31), *dst);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(done);
+ }
+
+ Inst::LoadExtName { dst, name, offset } => {
+            // The full address can be encoded in the instruction as a 64-bit immediate, with a relocation.
+ // Generates: movabsq $name, %dst
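+            // Encoding: a REX.W byte (0x48, with REX.B folded in for r8..r15), then 0xB8+rd,
+            // then an 8-byte immediate that the Abs8 relocation below fills in with the
+            // symbol's address when the relocation is resolved.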
+ let enc_dst = int_reg_enc(dst.to_reg());
+ sink.put1(0x48 | ((enc_dst >> 3) & 1));
+ sink.put1(0xB8 | (enc_dst & 7));
+ emit_reloc(sink, state, Reloc::Abs8, name, *offset);
+ if info.flags().emit_all_ones_funcaddrs() {
+ sink.put8(u64::max_value());
+ } else {
+ sink.put8(0);
+ }
+ }
+
+ Inst::LockCmpxchg { ty, src, dst } => {
+ // lock cmpxchg{b,w,l,q} %src, (dst)
+ // Note that 0xF0 is the Lock prefix.
+ let (prefix, rex, opcodes) = match *ty {
+ types::I8 => {
+ let mut rex_flags = RexFlags::clear_w();
+ let enc_src = int_reg_enc(*src);
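+                    // Byte registers with encodings 4..=7 (spl, bpl, sil, dil) are only
+                    // addressable with a REX prefix; without one, those encodings select
+                    // ah/ch/dh/bh instead.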
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ (LegacyPrefixes::_F0, rex_flags, 0x0FB0)
+ }
+ types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1),
+ types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1),
+ types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1),
+ _ => unreachable!(),
+ };
+ let amode = dst.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcodes, 2, *src, &amode, rex);
+ }
+
+ Inst::AtomicRmwSeq { ty, op } => {
+ // Emit this:
+ //
+ // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value
+ // again:
+ // movq %rax, %r11 // rax = old value, r11 = old value
+ // `op`q %r10, %r11 // rax = old value, r11 = new value
+ // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value
+ // jnz again // If this is taken, rax will have a "revised" old value
+ //
+ // Operand conventions:
+ // IN: %r9 (addr), %r10 (2nd arg for `op`)
+ // OUT: %rax (old value), %r11 (trashed), %rflags (trashed)
+ //
+ // In the case where the operation is 'xchg', the "`op`q" instruction is instead
+ // movq %r10, %r11
+            // so that we simply write the "2nd arg for `op`" into the destination.
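+            //
+            // Roughly the equivalent of the following compare-and-swap loop (names are
+            // illustrative only):
+            //
+            //     let mut old = *addr;
+            //     loop {
+            //         let new = op(old, arg);
+            //         match cmpxchg(addr, old, new) {
+            //             Ok(_) => break,        // stored `new`
+            //             Err(cur) => old = cur, // cmpxchg already left `cur` in %rax
+            //         }
+            //     }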
+ let rax = regs::rax();
+ let r9 = regs::r9();
+ let r10 = regs::r10();
+ let r11 = regs::r11();
+ let rax_w = Writable::from_reg(rax);
+ let r11_w = Writable::from_reg(r11);
+ let amode = Amode::imm_reg(0, r9);
+ let again_label = sink.get_label();
+
+ // mov{zbq,zwq,zlq,q} (%r9), %rax
+ // No need to call `add_trap` here, since the `i1` emit will do that.
+ let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend);
+ i1.emit(sink, info, state);
+
+ // again:
+ sink.bind_label(again_label);
+
+ // movq %rax, %r11
+ let i2 = Inst::mov_r_r(true, rax, r11_w);
+ i2.emit(sink, info, state);
+
+ // opq %r10, %r11
+ let r10_rmi = RegMemImm::reg(r10);
+ let i3 = if *op == inst_common::AtomicRmwOp::Xchg {
+ Inst::mov_r_r(true, r10, r11_w)
+ } else {
+ let alu_op = match op {
+ inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add,
+ inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub,
+ inst_common::AtomicRmwOp::And => AluRmiROpcode::And,
+ inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or,
+ inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor,
+ inst_common::AtomicRmwOp::Xchg => unreachable!(),
+ };
+ Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w)
+ };
+ i3.emit(sink, info, state);
+
+ // lock cmpxchg{b,w,l,q} %r11, (%r9)
+ // No need to call `add_trap` here, since the `i4` emit will do that.
+ let i4 = Inst::LockCmpxchg {
+ ty: *ty,
+ src: r11,
+ dst: amode.into(),
+ };
+ i4.emit(sink, info, state);
+
+ // jnz again
+ one_way_jmp(sink, CC::NZ, again_label);
+ }
+
+ Inst::Fence { kind } => {
+ sink.put1(0x0F);
+ sink.put1(0xAE);
+ match kind {
+ FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0
+ FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8
+ FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8
+ }
+ }
+
+ Inst::Hlt => {
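+            // 0xCC is the single-byte int3 (breakpoint) encoding.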
+ sink.put1(0xcc);
+ }
+
+ Inst::Ud2 { trap_code } => {
+ let cur_srcloc = state.cur_srcloc();
+ sink.add_trap(cur_srcloc, *trap_code);
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s);
+ }
+ sink.put1(0x0f);
+ sink.put1(0x0b);
+ }
+
+ Inst::VirtualSPOffsetAdj { offset } => {
+ debug!(
+ "virtual sp offset adjusted by {} -> {}",
+ offset,
+ state.virtual_sp_offset + offset
+ );
+ state.virtual_sp_offset += offset;
+ }
+
+ Inst::Nop { len } => {
+ // These encodings can all be found in Intel's architecture manual, at the NOP
+ // instruction description.
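+            // Lengths above 9 bytes are emitted as a series of NOPs of at most 9 bytes each;
+            // the multi-byte forms below follow the manual's recommended multi-byte NOP
+            // sequences, which are generally cheaper to decode than a run of single 0x90s.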
+ let mut len = *len;
+ while len != 0 {
+ let emitted = u8::min(len, 9);
+ match emitted {
+ 0 => {}
+ 1 => sink.put1(0x90), // NOP
+ 2 => {
+ // 66 NOP
+ sink.put1(0x66);
+ sink.put1(0x90);
+ }
+ 3 => {
+ // NOP [EAX]
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x00);
+ }
+ 4 => {
+ // NOP 0(EAX), with 0 a 1-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x40);
+ sink.put1(0x00);
+ }
+ 5 => {
+ // NOP [EAX, EAX, 1]
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x44);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 6 => {
+ // 66 NOP [EAX, EAX, 1]
+ sink.put1(0x66);
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x44);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 7 => {
+                        // NOP 0[EAX], but 0 is a 4-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x80);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 8 => {
+                        // NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x84);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 9 => {
+                        // 66 NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
+ sink.put1(0x66);
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x84);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ _ => unreachable!(),
+ }
+ len -= emitted;
+ }
+ }
+
+ Inst::EpiloguePlaceholder => {
+ // Generate no code.
+ }
+ }
+
+ state.clear_post_insn();
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs
new file mode 100644
index 0000000000..06092d498a
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs
@@ -0,0 +1,3593 @@
+//! Tests for the emitter
+//!
+//! See comments at the top of `fn x64_emit` for advice on how to create reliable test cases.
+//!
+//! to see stdout: cargo test -- --nocapture
+//!
+//! for this specific case, as of 24 Aug 2020:
+//!
+//! cd to the top of your wasmtime tree, then:
+//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \
+//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \
+//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \
+//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit
+
+use super::*;
+use crate::isa::test_utils;
+use crate::isa::x64;
+use alloc::vec::Vec;
+
+#[test]
+fn test_x64_emit() {
+ let rax = regs::rax();
+ let rbx = regs::rbx();
+ let rcx = regs::rcx();
+ let rdx = regs::rdx();
+ let rsi = regs::rsi();
+ let rdi = regs::rdi();
+ let rsp = regs::rsp();
+ let rbp = regs::rbp();
+ let r8 = regs::r8();
+ let r9 = regs::r9();
+ let r10 = regs::r10();
+ let r11 = regs::r11();
+ let r12 = regs::r12();
+ let r13 = regs::r13();
+ let r14 = regs::r14();
+ let r15 = regs::r15();
+
+ let xmm0 = regs::xmm0();
+ let xmm1 = regs::xmm1();
+ let xmm2 = regs::xmm2();
+ let xmm3 = regs::xmm3();
+ let xmm4 = regs::xmm4();
+ let xmm5 = regs::xmm5();
+ let xmm6 = regs::xmm6();
+ let xmm7 = regs::xmm7();
+ let xmm8 = regs::xmm8();
+ let xmm9 = regs::xmm9();
+ let xmm10 = regs::xmm10();
+ let xmm11 = regs::xmm11();
+ let xmm12 = regs::xmm12();
+ let xmm13 = regs::xmm13();
+ let xmm14 = regs::xmm14();
+ let xmm15 = regs::xmm15();
+
+ // And Writable<> versions of the same:
+ let w_rax = Writable::<Reg>::from_reg(rax);
+ let w_rbx = Writable::<Reg>::from_reg(rbx);
+ let w_rcx = Writable::<Reg>::from_reg(rcx);
+ let w_rdx = Writable::<Reg>::from_reg(rdx);
+ let w_rsi = Writable::<Reg>::from_reg(rsi);
+ let w_rdi = Writable::<Reg>::from_reg(rdi);
+ let _w_rsp = Writable::<Reg>::from_reg(rsp);
+ let _w_rbp = Writable::<Reg>::from_reg(rbp);
+ let w_r8 = Writable::<Reg>::from_reg(r8);
+ let w_r9 = Writable::<Reg>::from_reg(r9);
+ let _w_r10 = Writable::<Reg>::from_reg(r10);
+ let w_r11 = Writable::<Reg>::from_reg(r11);
+ let w_r12 = Writable::<Reg>::from_reg(r12);
+ let w_r13 = Writable::<Reg>::from_reg(r13);
+ let w_r14 = Writable::<Reg>::from_reg(r14);
+ let w_r15 = Writable::<Reg>::from_reg(r15);
+
+ let w_xmm0 = Writable::<Reg>::from_reg(xmm0);
+ let w_xmm1 = Writable::<Reg>::from_reg(xmm1);
+ let w_xmm2 = Writable::<Reg>::from_reg(xmm2);
+ let w_xmm3 = Writable::<Reg>::from_reg(xmm3);
+ let w_xmm4 = Writable::<Reg>::from_reg(xmm4);
+ let w_xmm5 = Writable::<Reg>::from_reg(xmm5);
+ let w_xmm6 = Writable::<Reg>::from_reg(xmm6);
+ let w_xmm7 = Writable::<Reg>::from_reg(xmm7);
+ let w_xmm8 = Writable::<Reg>::from_reg(xmm8);
+ let w_xmm9 = Writable::<Reg>::from_reg(xmm9);
+ let w_xmm10 = Writable::<Reg>::from_reg(xmm10);
+ let w_xmm11 = Writable::<Reg>::from_reg(xmm11);
+ let w_xmm12 = Writable::<Reg>::from_reg(xmm12);
+ let w_xmm13 = Writable::<Reg>::from_reg(xmm13);
+ let w_xmm14 = Writable::<Reg>::from_reg(xmm14);
+ let w_xmm15 = Writable::<Reg>::from_reg(xmm15);
+
+ let mut insns = Vec::<(Inst, &str, &str)>::new();
+
+ // ========================================================
+ // Cases aimed at checking Addr-esses: IR (Imm + Reg)
+ //
+ // These are just a bunch of loads with all supported (by the emitter)
+ // permutations of address formats.
+ //
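+    // Note how %rbp/%r13 and %rsp/%r12 bases are encoded specially: a base of rbp or r13
+    // with "no displacement" still needs an explicit zero disp8 (mod=00, r/m=101 would
+    // otherwise mean RIP-relative addressing), and a base of rsp or r12 always needs a SIB
+    // byte (r/m=100 selects SIB). That is why their expected byte strings below are longer.
+    //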
+ // Addr_IR, offset zero
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rax), w_rdi),
+ "488B38",
+ "movq 0(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rbx), w_rdi),
+ "488B3B",
+ "movq 0(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rcx), w_rdi),
+ "488B39",
+ "movq 0(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rdx), w_rdi),
+ "488B3A",
+ "movq 0(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rbp), w_rdi),
+ "488B7D00",
+ "movq 0(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rsp), w_rdi),
+ "488B3C24",
+ "movq 0(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rsi), w_rdi),
+ "488B3E",
+ "movq 0(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rdi), w_rdi),
+ "488B3F",
+ "movq 0(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r8), w_rdi),
+ "498B38",
+ "movq 0(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r9), w_rdi),
+ "498B39",
+ "movq 0(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r10), w_rdi),
+ "498B3A",
+ "movq 0(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r11), w_rdi),
+ "498B3B",
+ "movq 0(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r12), w_rdi),
+ "498B3C24",
+ "movq 0(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r13), w_rdi),
+ "498B7D00",
+ "movq 0(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r14), w_rdi),
+ "498B3E",
+ "movq 0(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r15), w_rdi),
+ "498B3F",
+ "movq 0(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset max simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rax), w_rdi),
+ "488B787F",
+ "movq 127(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rbx), w_rdi),
+ "488B7B7F",
+ "movq 127(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rcx), w_rdi),
+ "488B797F",
+ "movq 127(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rdx), w_rdi),
+ "488B7A7F",
+ "movq 127(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rbp), w_rdi),
+ "488B7D7F",
+ "movq 127(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rsp), w_rdi),
+ "488B7C247F",
+ "movq 127(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rsi), w_rdi),
+ "488B7E7F",
+ "movq 127(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rdi), w_rdi),
+ "488B7F7F",
+ "movq 127(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r8), w_rdi),
+ "498B787F",
+ "movq 127(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r9), w_rdi),
+ "498B797F",
+ "movq 127(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r10), w_rdi),
+ "498B7A7F",
+ "movq 127(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r11), w_rdi),
+ "498B7B7F",
+ "movq 127(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r12), w_rdi),
+ "498B7C247F",
+ "movq 127(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r13), w_rdi),
+ "498B7D7F",
+ "movq 127(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r14), w_rdi),
+ "498B7E7F",
+ "movq 127(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r15), w_rdi),
+ "498B7F7F",
+ "movq 127(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset min simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rax), w_rdi),
+ "488B7880",
+ "movq -128(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbx), w_rdi),
+ "488B7B80",
+ "movq -128(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rcx), w_rdi),
+ "488B7980",
+ "movq -128(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdx), w_rdi),
+ "488B7A80",
+ "movq -128(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbp), w_rdi),
+ "488B7D80",
+ "movq -128(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsp), w_rdi),
+ "488B7C2480",
+ "movq -128(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsi), w_rdi),
+ "488B7E80",
+ "movq -128(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdi), w_rdi),
+ "488B7F80",
+ "movq -128(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r8), w_rdi),
+ "498B7880",
+ "movq -128(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r9), w_rdi),
+ "498B7980",
+ "movq -128(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r10), w_rdi),
+ "498B7A80",
+ "movq -128(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r11), w_rdi),
+ "498B7B80",
+ "movq -128(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r12), w_rdi),
+ "498B7C2480",
+ "movq -128(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r13), w_rdi),
+ "498B7D80",
+ "movq -128(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r14), w_rdi),
+ "498B7E80",
+ "movq -128(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r15), w_rdi),
+ "498B7F80",
+ "movq -128(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset smallest positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rax), w_rdi),
+ "488BB880000000",
+ "movq 128(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rbx), w_rdi),
+ "488BBB80000000",
+ "movq 128(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rcx), w_rdi),
+ "488BB980000000",
+ "movq 128(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rdx), w_rdi),
+ "488BBA80000000",
+ "movq 128(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rbp), w_rdi),
+ "488BBD80000000",
+ "movq 128(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rsp), w_rdi),
+ "488BBC2480000000",
+ "movq 128(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rsi), w_rdi),
+ "488BBE80000000",
+ "movq 128(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rdi), w_rdi),
+ "488BBF80000000",
+ "movq 128(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r8), w_rdi),
+ "498BB880000000",
+ "movq 128(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r9), w_rdi),
+ "498BB980000000",
+ "movq 128(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r10), w_rdi),
+ "498BBA80000000",
+ "movq 128(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r11), w_rdi),
+ "498BBB80000000",
+ "movq 128(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r12), w_rdi),
+ "498BBC2480000000",
+ "movq 128(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r13), w_rdi),
+ "498BBD80000000",
+ "movq 128(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r14), w_rdi),
+ "498BBE80000000",
+ "movq 128(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r15), w_rdi),
+ "498BBF80000000",
+ "movq 128(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset smallest negative simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rax), w_rdi),
+ "488BB87FFFFFFF",
+ "movq -129(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbx), w_rdi),
+ "488BBB7FFFFFFF",
+ "movq -129(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rcx), w_rdi),
+ "488BB97FFFFFFF",
+ "movq -129(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdx), w_rdi),
+ "488BBA7FFFFFFF",
+ "movq -129(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbp), w_rdi),
+ "488BBD7FFFFFFF",
+ "movq -129(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsp), w_rdi),
+ "488BBC247FFFFFFF",
+ "movq -129(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsi), w_rdi),
+ "488BBE7FFFFFFF",
+ "movq -129(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdi), w_rdi),
+ "488BBF7FFFFFFF",
+ "movq -129(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r8), w_rdi),
+ "498BB87FFFFFFF",
+ "movq -129(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r9), w_rdi),
+ "498BB97FFFFFFF",
+ "movq -129(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r10), w_rdi),
+ "498BBA7FFFFFFF",
+ "movq -129(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r11), w_rdi),
+ "498BBB7FFFFFFF",
+ "movq -129(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r12), w_rdi),
+ "498BBC247FFFFFFF",
+ "movq -129(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r13), w_rdi),
+ "498BBD7FFFFFFF",
+ "movq -129(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r14), w_rdi),
+ "498BBE7FFFFFFF",
+ "movq -129(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r15), w_rdi),
+ "498BBF7FFFFFFF",
+ "movq -129(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset large positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rax), w_rdi),
+ "488BB877207317",
+ "movq 393420919(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbx), w_rdi),
+ "488BBB77207317",
+ "movq 393420919(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rcx), w_rdi),
+ "488BB977207317",
+ "movq 393420919(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdx), w_rdi),
+ "488BBA77207317",
+ "movq 393420919(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbp), w_rdi),
+ "488BBD77207317",
+ "movq 393420919(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsp), w_rdi),
+ "488BBC2477207317",
+ "movq 393420919(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsi), w_rdi),
+ "488BBE77207317",
+ "movq 393420919(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdi), w_rdi),
+ "488BBF77207317",
+ "movq 393420919(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r8), w_rdi),
+ "498BB877207317",
+ "movq 393420919(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r9), w_rdi),
+ "498BB977207317",
+ "movq 393420919(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r10), w_rdi),
+ "498BBA77207317",
+ "movq 393420919(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r11), w_rdi),
+ "498BBB77207317",
+ "movq 393420919(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r12), w_rdi),
+ "498BBC2477207317",
+ "movq 393420919(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r13), w_rdi),
+ "498BBD77207317",
+ "movq 393420919(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r14), w_rdi),
+ "498BBE77207317",
+ "movq 393420919(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r15), w_rdi),
+ "498BBF77207317",
+ "movq 393420919(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset large negative simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rax), w_rdi),
+ "488BB8D9A6BECE",
+ "movq -826366247(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbx), w_rdi),
+ "488BBBD9A6BECE",
+ "movq -826366247(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rcx), w_rdi),
+ "488BB9D9A6BECE",
+ "movq -826366247(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdx), w_rdi),
+ "488BBAD9A6BECE",
+ "movq -826366247(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbp), w_rdi),
+ "488BBDD9A6BECE",
+ "movq -826366247(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsp), w_rdi),
+ "488BBC24D9A6BECE",
+ "movq -826366247(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsi), w_rdi),
+ "488BBED9A6BECE",
+ "movq -826366247(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdi), w_rdi),
+ "488BBFD9A6BECE",
+ "movq -826366247(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r8), w_rdi),
+ "498BB8D9A6BECE",
+ "movq -826366247(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r9), w_rdi),
+ "498BB9D9A6BECE",
+ "movq -826366247(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r10), w_rdi),
+ "498BBAD9A6BECE",
+ "movq -826366247(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r11), w_rdi),
+ "498BBBD9A6BECE",
+ "movq -826366247(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r12), w_rdi),
+ "498BBC24D9A6BECE",
+ "movq -826366247(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r13), w_rdi),
+ "498BBDD9A6BECE",
+ "movq -826366247(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r14), w_rdi),
+ "498BBED9A6BECE",
+ "movq -826366247(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r15), w_rdi),
+ "498BBFD9A6BECE",
+ "movq -826366247(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Cases aimed at checking Addr-esses: IRRS (Imm + Reg + (Reg << Shift))
+ // Note these don't check the case where the index reg is RSP, since we
+ // don't encode any of those.
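+    // (%rsp can never be an index register: in a SIB byte, index=0b100 means "no index".)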
+ //
+ // Addr_IRRS, offset max simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rax, 0), w_r11),
+ "4C8B5C007F",
+ "movq 127(%rax,%rax,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rax, 1), w_r11),
+ "4C8B5C477F",
+ "movq 127(%rdi,%rax,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rax, 2), w_r11),
+ "4D8B5C807F",
+ "movq 127(%r8,%rax,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rax, 3), w_r11),
+ "4D8B5CC77F",
+ "movq 127(%r15,%rax,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rdi, 3), w_r11),
+ "4C8B5CF87F",
+ "movq 127(%rax,%rdi,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rdi, 2), w_r11),
+ "4C8B5CBF7F",
+ "movq 127(%rdi,%rdi,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rdi, 1), w_r11),
+ "4D8B5C787F",
+ "movq 127(%r8,%rdi,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rdi, 0), w_r11),
+ "4D8B5C3F7F",
+ "movq 127(%r15,%rdi,1), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset min simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r8, 2), w_r11),
+ "4E8B5C8080",
+ "movq -128(%rax,%r8,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r8, 3), w_r11),
+ "4E8B5CC780",
+ "movq -128(%rdi,%r8,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r8, 0), w_r11),
+ "4F8B5C0080",
+ "movq -128(%r8,%r8,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r8, 1), w_r11),
+ "4F8B5C4780",
+ "movq -128(%r15,%r8,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r15, 1), w_r11),
+ "4E8B5C7880",
+ "movq -128(%rax,%r15,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r15, 0), w_r11),
+ "4E8B5C3F80",
+ "movq -128(%rdi,%r15,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r15, 3), w_r11),
+ "4F8B5CF880",
+ "movq -128(%r8,%r15,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r15, 2), w_r11),
+ "4F8B5CBF80",
+ "movq -128(%r15,%r15,4), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset large positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rax, 0), w_r11),
+ "4C8B9C00BE25664F",
+ "movq 1332094398(%rax,%rax,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rax, 1), w_r11),
+ "4C8B9C47BE25664F",
+ "movq 1332094398(%rdi,%rax,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rax, 2), w_r11),
+ "4D8B9C80BE25664F",
+ "movq 1332094398(%r8,%rax,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rax, 3), w_r11),
+ "4D8B9CC7BE25664F",
+ "movq 1332094398(%r15,%rax,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rdi, 3), w_r11),
+ "4C8B9CF8BE25664F",
+ "movq 1332094398(%rax,%rdi,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rdi, 2), w_r11),
+ "4C8B9CBFBE25664F",
+ "movq 1332094398(%rdi,%rdi,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rdi, 1), w_r11),
+ "4D8B9C78BE25664F",
+ "movq 1332094398(%r8,%rdi,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rdi, 0), w_r11),
+ "4D8B9C3FBE25664F",
+ "movq 1332094398(%r15,%rdi,1), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset large negative simm32
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r8, 2),
+ w_r11,
+ ),
+ "4E8B9C8070E9B2D9",
+ "movq -642586256(%rax,%r8,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r8, 3),
+ w_r11,
+ ),
+ "4E8B9CC770E9B2D9",
+ "movq -642586256(%rdi,%r8,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r8, 0),
+ w_r11,
+ ),
+ "4F8B9C0070E9B2D9",
+ "movq -642586256(%r8,%r8,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r8, 1),
+ w_r11,
+ ),
+ "4F8B9C4770E9B2D9",
+ "movq -642586256(%r15,%r8,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r15, 1),
+ w_r11,
+ ),
+ "4E8B9C7870E9B2D9",
+ "movq -642586256(%rax,%r15,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r15, 0),
+ w_r11,
+ ),
+ "4E8B9C3F70E9B2D9",
+ "movq -642586256(%rdi,%r15,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r15, 3),
+ w_r11,
+ ),
+ "4F8B9CF870E9B2D9",
+ "movq -642586256(%r8,%r15,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r15, 2),
+ w_r11,
+ ),
+ "4F8B9CBF70E9B2D9",
+ "movq -642586256(%r15,%r15,4), %r11",
+ ));
+
+ // End of test cases for Addr
+ // ========================================================
+
+ // ========================================================
+ // General tests for each insn. Don't forget to follow the
+ // guidelines commented just prior to `fn x64_emit`.
+ //
+ // Alu_RMI_R
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::reg(r15), w_rdx),
+ "4C01FA",
+ "addq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_r8),
+ "4101C8",
+ "addl %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_rsi),
+ "01CE",
+ "addl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rdx,
+ ),
+ "48035763",
+ "addq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_r8,
+ ),
+ "44034763",
+ "addl 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rsi,
+ ),
+ "037763",
+ "addl 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_rdx,
+ ),
+ "4883C281",
+ "addq $-127, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_rdx,
+ ),
+ "4881C27FFFFFFF",
+ "addq $-129, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rdx),
+ "4881C2EAF48F04",
+ "addq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_r8,
+ ),
+ "4183C081",
+ "addl $-127, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_r8,
+ ),
+ "4181C07FFFFFFF",
+ "addl $-129, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-76543210i32 as u32),
+ w_r8,
+ ),
+ "4181C0160B70FB",
+ "addl $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_rsi,
+ ),
+ "83C681",
+ "addl $-127, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_rsi,
+ ),
+ "81C67FFFFFFF",
+ "addl $-129, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rsi),
+ "81C6EAF48F04",
+ "addl $76543210, %esi",
+ ));
+    // Only one case each for Sub/And/Or/Xor; this is pretty feeble coverage.
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Sub, RegMemImm::reg(r15), w_rdx),
+ "4C29FA",
+ "subq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::And, RegMemImm::reg(r15), w_rdx),
+ "4C21FA",
+ "andq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Or, RegMemImm::reg(r15), w_rdx),
+ "4C09FA",
+ "orq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx),
+ "4C31FA",
+ "xorq %r15, %rdx",
+ ));
+ // Test all mul cases, though
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::reg(r15), w_rdx),
+ "490FAFD7",
+ "imulq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_r8),
+ "440FAFC1",
+ "imull %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_rsi),
+ "0FAFF1",
+ "imull %ecx, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rdx,
+ ),
+ "480FAF5763",
+ "imulq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_r8,
+ ),
+ "440FAF4763",
+ "imull 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rsi,
+ ),
+ "0FAF7763",
+ "imull 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_rdx,
+ ),
+ "486BD281",
+ "imulq $-127, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_rdx,
+ ),
+ "4869D27FFFFFFF",
+ "imulq $-129, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rdx),
+ "4869D2EAF48F04",
+ "imulq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_r8,
+ ),
+ "456BC081",
+ "imull $-127, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_r8,
+ ),
+ "4569C07FFFFFFF",
+ "imull $-129, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-76543210i32 as u32),
+ w_r8,
+ ),
+ "4569C0160B70FB",
+ "imull $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_rsi,
+ ),
+ "6BF681",
+ "imull $-127, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_rsi,
+ ),
+ "69F67FFFFFFF",
+ "imull $-129, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rsi),
+ "69F6EAF48F04",
+ "imull $76543210, %esi",
+ ));
+
+ // ========================================================
+ // UnaryRmR
+
+ insns.push((
+ Inst::unary_rm_r(4, UnaryRmROpcode::Bsr, RegMem::reg(rsi), w_rdi),
+ "0FBDFE",
+ "bsrl %esi, %edi",
+ ));
+ insns.push((
+ Inst::unary_rm_r(8, UnaryRmROpcode::Bsr, RegMem::reg(r15), w_rax),
+ "490FBDC7",
+ "bsrq %r15, %rax",
+ ));
+
+ // ========================================================
+ // Not
+ insns.push((
+ Inst::not(4, Writable::from_reg(regs::rsi())),
+ "F7D6",
+ "notl %esi",
+ ));
+ insns.push((
+ Inst::not(8, Writable::from_reg(regs::r15())),
+ "49F7D7",
+ "notq %r15",
+ ));
+ insns.push((
+ Inst::not(4, Writable::from_reg(regs::r14())),
+ "41F7D6",
+ "notl %r14d",
+ ));
+ insns.push((
+ Inst::not(2, Writable::from_reg(regs::rdi())),
+ "66F7D7",
+ "notw %di",
+ ));
+
+ // ========================================================
+ // Neg
+ insns.push((
+ Inst::neg(4, Writable::from_reg(regs::rsi())),
+ "F7DE",
+ "negl %esi",
+ ));
+ insns.push((
+ Inst::neg(8, Writable::from_reg(regs::r15())),
+ "49F7DF",
+ "negq %r15",
+ ));
+ insns.push((
+ Inst::neg(4, Writable::from_reg(regs::r14())),
+ "41F7DE",
+ "negl %r14d",
+ ));
+ insns.push((
+ Inst::neg(2, Writable::from_reg(regs::rdi())),
+ "66F7DF",
+ "negw %di",
+ ));
+
+ // ========================================================
+ // Div
+ insns.push((
+ Inst::div(4, true /*signed*/, RegMem::reg(regs::rsi())),
+ "F7FE",
+ "idiv %esi",
+ ));
+ insns.push((
+ Inst::div(8, true /*signed*/, RegMem::reg(regs::r15())),
+ "49F7FF",
+ "idiv %r15",
+ ));
+ insns.push((
+ Inst::div(4, false /*signed*/, RegMem::reg(regs::r14())),
+ "41F7F6",
+ "div %r14d",
+ ));
+ insns.push((
+ Inst::div(8, false /*signed*/, RegMem::reg(regs::rdi())),
+ "48F7F7",
+ "div %rdi",
+ ));
+
+ // ========================================================
+ // MulHi
+ insns.push((
+ Inst::mul_hi(4, true /*signed*/, RegMem::reg(regs::rsi())),
+ "F7EE",
+ "imul %esi",
+ ));
+ insns.push((
+ Inst::mul_hi(8, true /*signed*/, RegMem::reg(regs::r15())),
+ "49F7EF",
+ "imul %r15",
+ ));
+ insns.push((
+ Inst::mul_hi(4, false /*signed*/, RegMem::reg(regs::r14())),
+ "41F7E6",
+ "mul %r14d",
+ ));
+ insns.push((
+ Inst::mul_hi(8, false /*signed*/, RegMem::reg(regs::rdi())),
+ "48F7E7",
+ "mul %rdi",
+ ));
+
+ // ========================================================
+ // cbw
+ insns.push((Inst::sign_extend_data(1), "6698", "cbw"));
+
+ // ========================================================
+ // cdq family: SignExtendRaxRdx
+ insns.push((Inst::sign_extend_data(2), "6699", "cwd"));
+ insns.push((Inst::sign_extend_data(4), "99", "cdq"));
+ insns.push((Inst::sign_extend_data(8), "4899", "cqo"));
+
+ // ========================================================
+ // Imm_R
+ //
+ insns.push((
+ Inst::imm(OperandSize::Size32, 1234567, w_r14),
+ "41BE87D61200",
+ "movl $1234567, %r14d",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, -126i64 as u64, w_r14),
+ "41BE82FFFFFF",
+ "movl $-126, %r14d",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, 1234567898765, w_r14),
+ "49BE8D26FB711F010000",
+ "movabsq $1234567898765, %r14",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, -126i64 as u64, w_r14),
+ "49C7C682FFFFFF",
+ "movabsq $-126, %r14",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, 1234567, w_rcx),
+ "B987D61200",
+ "movl $1234567, %ecx",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, -126i64 as u64, w_rcx),
+ "B982FFFFFF",
+ "movl $-126, %ecx",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, 1234567898765, w_rsi),
+ "48BE8D26FB711F010000",
+ "movabsq $1234567898765, %rsi",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, -126i64 as u64, w_rbx),
+ "48C7C382FFFFFF",
+ "movabsq $-126, %rbx",
+ ));
+
+ // ========================================================
+ // Mov_R_R
+ insns.push((
+ Inst::mov_r_r(false, rbx, w_rsi),
+ "89DE",
+ "movl %ebx, %esi",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, rbx, w_r9),
+ "4189D9",
+ "movl %ebx, %r9d",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, r11, w_rsi),
+ "4489DE",
+ "movl %r11d, %esi",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, r12, w_r9),
+ "4589E1",
+ "movl %r12d, %r9d",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, rbx, w_rsi),
+ "4889DE",
+ "movq %rbx, %rsi",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, rbx, w_r9),
+ "4989D9",
+ "movq %rbx, %r9",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, r11, w_rsi),
+ "4C89DE",
+ "movq %r11, %rsi",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, r12, w_r9),
+ "4D89E1",
+ "movq %r12, %r9",
+ ));
+
+ // ========================================================
+ // MovZX_RM_R
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi),
+ "400FB6FF",
+ "movzbl %dil, %edi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rax), w_rsi),
+ "0FB6F0",
+ "movzbl %al, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(r15), w_rsi),
+ "410FB6F7",
+ "movzbl %r15b, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FB671F9",
+ "movzbl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FB658F9",
+ "movzbl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FB64AF9",
+ "movzbl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FB653F9",
+ "movzbl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(rax), w_rsi),
+ "480FB6F0",
+ "movzbq %al, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(r10), w_rsi),
+ "490FB6F2",
+ "movzbq %r10b, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FB671F9",
+ "movzbq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FB658F9",
+ "movzbq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FB64AF9",
+ "movzbq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FB653F9",
+ "movzbq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi),
+ "0FB7F1",
+ "movzwl %cx, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(r10), w_rsi),
+ "410FB7F2",
+ "movzwl %r10w, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FB771F9",
+ "movzwl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FB758F9",
+ "movzwl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FB74AF9",
+ "movzwl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FB753F9",
+ "movzwl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi),
+ "480FB7F1",
+ "movzwq %cx, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(r11), w_rsi),
+ "490FB7F3",
+ "movzwq %r11w, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FB771F9",
+ "movzwq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FB758F9",
+ "movzwq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FB74AF9",
+ "movzwq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FB753F9",
+ "movzwq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi),
+ "8BF1",
+ "movl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "8B71F9",
+ "movl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "418B58F9",
+ "movl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "458B4AF9",
+ "movl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "418B53F9",
+ "movl -7(%r11), %edx",
+ ));
+
+ // ========================================================
+ // Mov64_M_R
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_rcx),
+ "488B8C18B3000000",
+ "movq 179(%rax,%rbx,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_r8),
+ "4C8B8418B3000000",
+ "movq 179(%rax,%rbx,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_rcx),
+ "4A8B8C08B3000000",
+ "movq 179(%rax,%r9,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_r8),
+ "4E8B8408B3000000",
+ "movq 179(%rax,%r9,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_rcx),
+ "498B8C1AB3000000",
+ "movq 179(%r10,%rbx,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_r8),
+ "4D8B841AB3000000",
+ "movq 179(%r10,%rbx,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_rcx),
+ "4B8B8C0AB3000000",
+ "movq 179(%r10,%r9,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8),
+ "4F8B840AB3000000",
+ "movq 179(%r10,%r9,1), %r8",
+ ));
+
+ // ========================================================
+ // LoadEffectiveAddress
+ insns.push((
+ Inst::lea(Amode::imm_reg(42, r10), w_r8),
+ "4D8D422A",
+ "lea 42(%r10), %r8",
+ ));
+ insns.push((
+ Inst::lea(Amode::imm_reg(42, r10), w_r15),
+ "4D8D7A2A",
+ "lea 42(%r10), %r15",
+ ));
+ insns.push((
+ Inst::lea(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8),
+ "4F8D840AB3000000",
+ "lea 179(%r10,%r9,1), %r8",
+ ));
+ insns.push((
+ Inst::lea(Amode::rip_relative(MachLabel::from_block(0)), w_rdi),
+ "488D3D00000000",
+ "lea label0(%rip), %rdi",
+ ));
+
+ // ========================================================
+ // MovSX_RM_R
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi),
+ "400FBEFF",
+ "movsbl %dil, %edi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rcx), w_rsi),
+ "0FBEF1",
+ "movsbl %cl, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(r14), w_rsi),
+ "410FBEF6",
+ "movsbl %r14b, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FBE71F9",
+ "movsbl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FBE58F9",
+ "movsbl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FBE4AF9",
+ "movsbl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FBE53F9",
+ "movsbl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(rcx), w_rsi),
+ "480FBEF1",
+ "movsbq %cl, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(r15), w_rsi),
+ "490FBEF7",
+ "movsbq %r15b, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FBE71F9",
+ "movsbq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FBE58F9",
+ "movsbq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FBE4AF9",
+ "movsbq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FBE53F9",
+ "movsbq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi),
+ "0FBFF1",
+ "movswl %cx, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(r14), w_rsi),
+ "410FBFF6",
+ "movswl %r14w, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FBF71F9",
+ "movswl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FBF58F9",
+ "movswl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FBF4AF9",
+ "movswl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FBF53F9",
+ "movswl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi),
+ "480FBFF1",
+ "movswq %cx, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(r13), w_rsi),
+ "490FBFF5",
+ "movswq %r13w, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FBF71F9",
+ "movswq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FBF58F9",
+ "movswq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FBF4AF9",
+ "movswq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FBF53F9",
+ "movswq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi),
+ "4863F1",
+ "movslq %ecx, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(r15), w_rsi),
+ "4963F7",
+ "movslq %r15d, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "486371F9",
+ "movslq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "496358F9",
+ "movslq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D634AF9",
+ "movslq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "496353F9",
+ "movslq -7(%r11), %rdx",
+ ));
+
+ // ========================================================
+ // Mov_R_M. Byte stores are tricky. Check everything carefully.
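+ // (What makes them tricky: without a REX prefix, the byte-register encodings 4..7 name
+ // %ah/%ch/%dh/%bh, so stores from %spl/%bpl/%sil/%dil must keep an apparently-redundant 0x40
+ // REX prefix, while %r8b..%r15b need their usual REX extension bit.)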
+ insns.push((
+ Inst::mov_r_m(8, rax, Amode::imm_reg(99, rdi)),
+ "48894763",
+ "movq %rax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rbx, Amode::imm_reg(99, r8)),
+ "49895863",
+ "movq %rbx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rcx, Amode::imm_reg(99, rsi)),
+ "48894E63",
+ "movq %rcx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rdx, Amode::imm_reg(99, r9)),
+ "49895163",
+ "movq %rdx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rsi, Amode::imm_reg(99, rax)),
+ "48897063",
+ "movq %rsi, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rdi, Amode::imm_reg(99, r15)),
+ "49897F63",
+ "movq %rdi, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rsp, Amode::imm_reg(99, rcx)),
+ "48896163",
+ "movq %rsp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rbp, Amode::imm_reg(99, r14)),
+ "49896E63",
+ "movq %rbp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r8, Amode::imm_reg(99, rdi)),
+ "4C894763",
+ "movq %r8, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r9, Amode::imm_reg(99, r8)),
+ "4D894863",
+ "movq %r9, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r10, Amode::imm_reg(99, rsi)),
+ "4C895663",
+ "movq %r10, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r11, Amode::imm_reg(99, r9)),
+ "4D895963",
+ "movq %r11, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r12, Amode::imm_reg(99, rax)),
+ "4C896063",
+ "movq %r12, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r13, Amode::imm_reg(99, r15)),
+ "4D896F63",
+ "movq %r13, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r14, Amode::imm_reg(99, rcx)),
+ "4C897163",
+ "movq %r14, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r15, Amode::imm_reg(99, r14)),
+ "4D897E63",
+ "movq %r15, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(4, rax, Amode::imm_reg(99, rdi)),
+ "894763",
+ "movl %eax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rbx, Amode::imm_reg(99, r8)),
+ "41895863",
+ "movl %ebx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rcx, Amode::imm_reg(99, rsi)),
+ "894E63",
+ "movl %ecx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rdx, Amode::imm_reg(99, r9)),
+ "41895163",
+ "movl %edx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rsi, Amode::imm_reg(99, rax)),
+ "897063",
+ "movl %esi, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rdi, Amode::imm_reg(99, r15)),
+ "41897F63",
+ "movl %edi, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rsp, Amode::imm_reg(99, rcx)),
+ "896163",
+ "movl %esp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rbp, Amode::imm_reg(99, r14)),
+ "41896E63",
+ "movl %ebp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r8, Amode::imm_reg(99, rdi)),
+ "44894763",
+ "movl %r8d, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r9, Amode::imm_reg(99, r8)),
+ "45894863",
+ "movl %r9d, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r10, Amode::imm_reg(99, rsi)),
+ "44895663",
+ "movl %r10d, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r11, Amode::imm_reg(99, r9)),
+ "45895963",
+ "movl %r11d, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r12, Amode::imm_reg(99, rax)),
+ "44896063",
+ "movl %r12d, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r13, Amode::imm_reg(99, r15)),
+ "45896F63",
+ "movl %r13d, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r14, Amode::imm_reg(99, rcx)),
+ "44897163",
+ "movl %r14d, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r15, Amode::imm_reg(99, r14)),
+ "45897E63",
+ "movl %r15d, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(2, rax, Amode::imm_reg(99, rdi)),
+ "66894763",
+ "movw %ax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rbx, Amode::imm_reg(99, r8)),
+ "6641895863",
+ "movw %bx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rcx, Amode::imm_reg(99, rsi)),
+ "66894E63",
+ "movw %cx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rdx, Amode::imm_reg(99, r9)),
+ "6641895163",
+ "movw %dx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rsi, Amode::imm_reg(99, rax)),
+ "66897063",
+ "movw %si, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rdi, Amode::imm_reg(99, r15)),
+ "6641897F63",
+ "movw %di, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rsp, Amode::imm_reg(99, rcx)),
+ "66896163",
+ "movw %sp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rbp, Amode::imm_reg(99, r14)),
+ "6641896E63",
+ "movw %bp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r8, Amode::imm_reg(99, rdi)),
+ "6644894763",
+ "movw %r8w, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r9, Amode::imm_reg(99, r8)),
+ "6645894863",
+ "movw %r9w, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r10, Amode::imm_reg(99, rsi)),
+ "6644895663",
+ "movw %r10w, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r11, Amode::imm_reg(99, r9)),
+ "6645895963",
+ "movw %r11w, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r12, Amode::imm_reg(99, rax)),
+ "6644896063",
+ "movw %r12w, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r13, Amode::imm_reg(99, r15)),
+ "6645896F63",
+ "movw %r13w, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r14, Amode::imm_reg(99, rcx)),
+ "6644897163",
+ "movw %r14w, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r15, Amode::imm_reg(99, r14)),
+ "6645897E63",
+ "movw %r15w, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(1, rax, Amode::imm_reg(99, rdi)),
+ "884763",
+ "movb %al, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rbx, Amode::imm_reg(99, r8)),
+ "41885863",
+ "movb %bl, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rcx, Amode::imm_reg(99, rsi)),
+ "884E63",
+ "movb %cl, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rdx, Amode::imm_reg(99, r9)),
+ "41885163",
+ "movb %dl, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rsi, Amode::imm_reg(99, rax)),
+ "40887063",
+ "movb %sil, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rdi, Amode::imm_reg(99, r15)),
+ "41887F63",
+ "movb %dil, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rsp, Amode::imm_reg(99, rcx)),
+ "40886163",
+ "movb %spl, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rbp, Amode::imm_reg(99, r14)),
+ "41886E63",
+ "movb %bpl, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r8, Amode::imm_reg(99, rdi)),
+ "44884763",
+ "movb %r8b, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r9, Amode::imm_reg(99, r8)),
+ "45884863",
+ "movb %r9b, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r10, Amode::imm_reg(99, rsi)),
+ "44885663",
+ "movb %r10b, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r11, Amode::imm_reg(99, r9)),
+ "45885963",
+ "movb %r11b, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r12, Amode::imm_reg(99, rax)),
+ "44886063",
+ "movb %r12b, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r13, Amode::imm_reg(99, r15)),
+ "45886F63",
+ "movb %r13b, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r14, Amode::imm_reg(99, rcx)),
+ "44887163",
+ "movb %r14b, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r15, Amode::imm_reg(99, r14)),
+ "45887E63",
+ "movb %r15b, 99(%r14)",
+ ));
+
+ // ========================================================
+ // Shift_R
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_rdi),
+ "D3E7",
+ "shll %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_r12),
+ "41D3E4",
+ "shll %cl, %r12d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, Some(2), w_r8),
+ "41C1E002",
+ "shll $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, Some(31), w_r13),
+ "41C1E51F",
+ "shll $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_r13),
+ "49D3E5",
+ "shlq %cl, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_rdi),
+ "48D3E7",
+ "shlq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), w_r8),
+ "49C1E002",
+ "shlq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(3), w_rbx),
+ "48C1E303",
+ "shlq $3, %rbx",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(63), w_r13),
+ "49C1E53F",
+ "shlq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, None, w_rdi),
+ "D3EF",
+ "shrl %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(2), w_r8),
+ "41C1E802",
+ "shrl $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(31), w_r13),
+ "41C1ED1F",
+ "shrl $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, w_rdi),
+ "48D3EF",
+ "shrq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(2), w_r8),
+ "49C1E802",
+ "shrq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(63), w_r13),
+ "49C1ED3F",
+ "shrq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, None, w_rdi),
+ "D3FF",
+ "sarl %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(2), w_r8),
+ "41C1F802",
+ "sarl $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(31), w_r13),
+ "41C1FD1F",
+ "sarl $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, None, w_rdi),
+ "48D3FF",
+ "sarq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(2), w_r8),
+ "49C1F802",
+ "sarq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(63), w_r13),
+ "49C1FD3F",
+ "sarq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::RotateLeft, None, w_r8),
+ "49D3C0",
+ "rolq %cl, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::RotateLeft, Some(3), w_r9),
+ "41C1C103",
+ "roll $3, %r9d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::RotateRight, None, w_rsi),
+ "D3CE",
+ "rorl %cl, %esi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::RotateRight, Some(5), w_r15),
+ "49C1CF05",
+ "rorq $5, %r15",
+ ));
+ insns.push((
+ Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi),
+ "D2CE",
+ "rorb %cl, %sil",
+ ));
+ insns.push((
+ Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15),
+ "41C0CF05",
+ "rorb $5, %r15b",
+ ));
+ insns.push((
+ Inst::shift_r(2, ShiftKind::RotateRight, None, w_rsi),
+ "66D3CE",
+ "rorw %cl, %si",
+ ));
+ insns.push((
+ Inst::shift_r(2, ShiftKind::RotateRight, Some(5), w_r15),
+ "6641C1CF05",
+ "rorw $5, %r15w",
+ ));
+
+ // ========================================================
+ // CmpRMIR
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(r15), rdx),
+ "4C39FA",
+ "cmpq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), r8),
+ "4939C8",
+ "cmpq %rcx, %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), rsi),
+ "4839CE",
+ "cmpq %rcx, %rsi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "483B5763",
+ "cmpq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "4C3B4763",
+ "cmpq 99(%rdi), %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "483B7763",
+ "cmpq 99(%rdi), %rsi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rdx),
+ "4881FAEAF48F04",
+ "cmpq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(-76543210i32 as u32), r8),
+ "4981F8160B70FB",
+ "cmpq $-76543210, %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rsi),
+ "4881FEEAF48F04",
+ "cmpq $76543210, %rsi",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(r15), rdx),
+ "4439FA",
+ "cmpl %r15d, %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), r8),
+ "4139C8",
+ "cmpl %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), rsi),
+ "39CE",
+ "cmpl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "3B5763",
+ "cmpl 99(%rdi), %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "443B4763",
+ "cmpl 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "3B7763",
+ "cmpl 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rdx),
+ "81FAEAF48F04",
+ "cmpl $76543210, %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(-76543210i32 as u32), r8),
+ "4181F8160B70FB",
+ "cmpl $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rsi),
+ "81FEEAF48F04",
+ "cmpl $76543210, %esi",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(r15), rdx),
+ "664439FA",
+ "cmpw %r15w, %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), r8),
+ "664139C8",
+ "cmpw %cx, %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), rsi),
+ "6639CE",
+ "cmpw %cx, %si",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "663B5763",
+ "cmpw 99(%rdi), %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "66443B4763",
+ "cmpw 99(%rdi), %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "663B7763",
+ "cmpw 99(%rdi), %si",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(23210), rdx),
+ "6681FAAA5A",
+ "cmpw $23210, %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(-7654i32 as u32), r8),
+ "664181F81AE2",
+ "cmpw $-7654, %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(7654), rsi),
+ "6681FEE61D",
+ "cmpw $7654, %si",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r15), rdx),
+ "4438FA",
+ "cmpb %r15b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r8),
+ "4138C8",
+ "cmpb %cl, %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi),
+ "4038CE",
+ "cmpb %cl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "3A5763",
+ "cmpb 99(%rdi), %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "443A4763",
+ "cmpb 99(%rdi), %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "403A7763",
+ "cmpb 99(%rdi), %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(70), rdx),
+ "80FA46",
+ "cmpb $70, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(-76i32 as u32), r8),
+ "4180F8B4",
+ "cmpb $-76, %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(76), rsi),
+ "4080FE4C",
+ "cmpb $76, %sil",
+ ));
+ // Extra byte cases (paranoia!) for cmp_rmi_r where the first operand is a register.
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rax), rbx),
+ "38C3",
+ "cmpb %al, %bl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbx), rax),
+ "38D8",
+ "cmpb %bl, %al",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rdx),
+ "38CA",
+ "cmpb %cl, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi),
+ "4038CE",
+ "cmpb %cl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r10),
+ "4138CA",
+ "cmpb %cl, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r14),
+ "4138CE",
+ "cmpb %cl, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rdx),
+ "4038EA",
+ "cmpb %bpl, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rsi),
+ "4038EE",
+ "cmpb %bpl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r10),
+ "4138EA",
+ "cmpb %bpl, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r14),
+ "4138EE",
+ "cmpb %bpl, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rdx),
+ "4438CA",
+ "cmpb %r9b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rsi),
+ "4438CE",
+ "cmpb %r9b, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r10),
+ "4538CA",
+ "cmpb %r9b, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r14),
+ "4538CE",
+ "cmpb %r9b, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rdx),
+ "4438EA",
+ "cmpb %r13b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rsi),
+ "4438EE",
+ "cmpb %r13b, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r10),
+ "4538EA",
+ "cmpb %r13b, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r14),
+ "4538EE",
+ "cmpb %r13b, %r14b",
+ ));
+
+ // ========================================================
+ // SetCC
+ insns.push((Inst::setcc(CC::O, w_rsi), "400F90C6", "seto %sil"));
+ insns.push((Inst::setcc(CC::NLE, w_rsi), "400F9FC6", "setnle %sil"));
+ insns.push((Inst::setcc(CC::Z, w_r14), "410F94C6", "setz %r14b"));
+ insns.push((Inst::setcc(CC::LE, w_r14), "410F9EC6", "setle %r14b"));
+ insns.push((Inst::setcc(CC::P, w_r9), "410F9AC1", "setp %r9b"));
+ insns.push((Inst::setcc(CC::NP, w_r8), "410F9BC0", "setnp %r8b"));
+ // ========================================================
+ // Cmove
+ insns.push((
+ Inst::cmove(2, CC::O, RegMem::reg(rdi), w_rsi),
+ "660F40F7",
+ "cmovow %di, %si",
+ ));
+ insns.push((
+ Inst::cmove(
+ 2,
+ CC::NO,
+ RegMem::mem(Amode::imm_reg_reg_shift(37, rdi, rsi, 2)),
+ w_r15,
+ ),
+ "66440F417CB725",
+ "cmovnow 37(%rdi,%rsi,4), %r15w",
+ ));
+ insns.push((
+ Inst::cmove(4, CC::LE, RegMem::reg(rdi), w_rsi),
+ "0F4EF7",
+ "cmovlel %edi, %esi",
+ ));
+ insns.push((
+ Inst::cmove(4, CC::NLE, RegMem::mem(Amode::imm_reg(0, r15)), w_rsi),
+ "410F4F37",
+ "cmovnlel 0(%r15), %esi",
+ ));
+ insns.push((
+ Inst::cmove(8, CC::Z, RegMem::reg(rdi), w_r14),
+ "4C0F44F7",
+ "cmovzq %rdi, %r14",
+ ));
+ insns.push((
+ Inst::cmove(8, CC::NZ, RegMem::mem(Amode::imm_reg(13, rdi)), w_r14),
+ "4C0F45770D",
+ "cmovnzq 13(%rdi), %r14",
+ ));
+
+ // ========================================================
+ // Push64
+ insns.push((Inst::push64(RegMemImm::reg(rdi)), "57", "pushq %rdi"));
+ insns.push((Inst::push64(RegMemImm::reg(r8)), "4150", "pushq %r8"));
+ insns.push((
+ Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FFB4CE41010000",
+ "pushq 321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, r9, rbx, 2))),
+ "41FFB49941010000",
+ "pushq 321(%r9,%rbx,4)",
+ ));
+ insns.push((Inst::push64(RegMemImm::imm(0)), "6A00", "pushq $0"));
+ insns.push((Inst::push64(RegMemImm::imm(127)), "6A7F", "pushq $127"));
+ insns.push((
+ Inst::push64(RegMemImm::imm(128)),
+ "6880000000",
+ "pushq $128",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(0x31415927)),
+ "6827594131",
+ "pushq $826366247",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-128i32 as u32)),
+ "6A80",
+ "pushq $-128",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-129i32 as u32)),
+ "687FFFFFFF",
+ "pushq $-129",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-0x75c4e8a1i32 as u32)),
+ "685F173B8A",
+ "pushq $-1975838881",
+ ));
+
+ // ========================================================
+ // Pop64
+ insns.push((Inst::pop64(w_rax), "58", "popq %rax"));
+ insns.push((Inst::pop64(w_rdi), "5F", "popq %rdi"));
+ insns.push((Inst::pop64(w_r8), "4158", "popq %r8"));
+ insns.push((Inst::pop64(w_r15), "415F", "popq %r15"));
+
+ // ========================================================
+ // CallKnown
+ insns.push((
+ Inst::call_known(
+ ExternalName::User {
+ namespace: 0,
+ index: 0,
+ },
+ Vec::new(),
+ Vec::new(),
+ Opcode::Call,
+ ),
+ "E800000000",
+ "call User { namespace: 0, index: 0 }",
+ ));
+
+ // ========================================================
+ // CallUnknown
+ fn call_unknown(rm: RegMem) -> Inst {
+ Inst::call_unknown(rm, Vec::new(), Vec::new(), Opcode::CallIndirect)
+ }
+
+ insns.push((call_unknown(RegMem::reg(rbp)), "FFD5", "call *%rbp"));
+ insns.push((call_unknown(RegMem::reg(r11)), "41FFD3", "call *%r11"));
+ insns.push((
+ call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FF94CE41010000",
+ "call *321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))),
+ "41FF949241010000",
+ "call *321(%r10,%rdx,4)",
+ ));
+
+ // ========================================================
+ // Ret
+ insns.push((Inst::ret(), "C3", "ret"));
+
+ // ========================================================
+ // JmpKnown skipped for now
+
+ // ========================================================
+ // JmpCondSymm isn't a real instruction
+
+ // ========================================================
+ // JmpCond skipped for now
+
+ // ========================================================
+ // JmpCondCompound isn't a real instruction
+
+ // ========================================================
+ // JmpUnknown
+ insns.push((Inst::jmp_unknown(RegMem::reg(rbp)), "FFE5", "jmp *%rbp"));
+ insns.push((
+ Inst::jmp_unknown(RegMem::reg(r11)),
+ "41FFE3",
+ "jmp *%r11",
+ ));
+ insns.push((
+ Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FFA4CE41010000",
+ "jmp *321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))),
+ "41FFA49241010000",
+ "jmp *321(%r10,%rdx,4)",
+ ));
+
+ // ========================================================
+ // XMM_CMP_RM_R
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm1), xmm2),
+ "0F2ED1",
+ "ucomiss %xmm1, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm0), xmm9),
+ "440F2EC8",
+ "ucomiss %xmm0, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm13), xmm4),
+ "66410F2EE5",
+ "ucomisd %xmm13, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm11), xmm12),
+ "66450F2EE3",
+ "ucomisd %xmm11, %xmm12",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: float binary ops
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0),
+ "F30F58C1",
+ "addss %xmm1, %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13),
+ "F3450F58EB",
+ "addss %xmm11, %xmm13",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(
+ SseOpcode::Addss,
+ RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
+ w_xmm0,
+ ),
+ "F3410F5844927B",
+ "addss 123(%r10,%rdx,4), %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4),
+ "F2410F58E7",
+ "addsd %xmm15, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1),
+ "F30F5CC8",
+ "subss %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1),
+ "F3410F5CCC",
+ "subss %xmm12, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(
+ SseOpcode::Subss,
+ RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
+ w_xmm10,
+ ),
+ "F3450F5C94C241010000",
+ "subss 321(%r10,%rax,8), %xmm10",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14),
+ "F2440F5CF5",
+ "subsd %xmm5, %xmm14",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4),
+ "F30F59E5",
+ "mulss %xmm5, %xmm4",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4),
+ "F20F59E5",
+ "mulsd %xmm5, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7),
+ "F3410F5EF8",
+ "divss %xmm8, %xmm7",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4),
+ "F20F5EE5",
+ "divsd %xmm5, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12),
+ "440F54E3",
+ "andps %xmm3, %xmm12",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11),
+ "440F55DC",
+ "andnps %xmm4, %xmm11",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15),
+ "440F56F9",
+ "orps %xmm1, %xmm15",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4),
+ "0F56E5",
+ "orps %xmm5, %xmm4",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: Integer Packed
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5),
+ "66410FFCE9",
+ "paddb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6),
+ "660FFDF7",
+ "paddw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13),
+ "66450FFEEC",
+ "paddd %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8),
+ "66440FD4C1",
+ "paddq %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5),
+ "66410FECE9",
+ "paddsb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6),
+ "660FEDF7",
+ "paddsw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13),
+ "66450FDCEC",
+ "paddusb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8),
+ "66440FDDC1",
+ "paddusw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5),
+ "66410FE8E9",
+ "psubsb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6),
+ "660FE9F7",
+ "psubsw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13),
+ "66450FD8EC",
+ "psubusb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8),
+ "66440FD9C1",
+ "psubusw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13),
+ "66450FE0EC",
+ "pavgb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8),
+ "66440FE3C1",
+ "pavgw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9),
+ "66440FF8CD",
+ "psubb %xmm5, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7),
+ "660FF9FE",
+ "psubw %xmm6, %xmm7",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12),
+ "66450FFAE5",
+ "psubd %xmm13, %xmm12",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1),
+ "66410FFBC8",
+ "psubq %xmm8, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6),
+ "66410F3840F7",
+ "pmulld %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1),
+ "66410FD5CE",
+ "pmullw %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
+ "66450FF4C8",
+ "pmuludq %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6),
+ "66410F383CF7",
+ "pmaxsb %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6),
+ "66410FEEF7",
+ "pmaxsw %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6),
+ "66410F383DF7",
+ "pmaxsd %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1),
+ "66410FDECE",
+ "pmaxub %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1),
+ "66410F383ECE",
+ "pmaxuw %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1),
+ "66410F383FCE",
+ "pmaxud %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9),
+ "66450F3838C8",
+ "pminsb %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9),
+ "66450FEAC8",
+ "pminsw %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9),
+ "66450F3839C8",
+ "pminsd %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2),
+ "660FDAD3",
+ "pminub %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2),
+ "660F383AD3",
+ "pminuw %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2),
+ "660F383BD3",
+ "pminud %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2),
+ "66410FEFD3",
+ "pxor %xmm11, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
+ "66410F3800D3",
+ "pshufb %xmm11, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2),
+ "66410F63D3",
+ "packsswb %xmm11, %xmm2",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: Integer Conversion
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::reg(xmm1), w_xmm8),
+ "440F5BC1",
+ "cvtdq2ps %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::reg(xmm9), w_xmm8),
+ "F3450F5BC1",
+ "cvttps2dq %xmm9, %xmm8",
+ ));
+
+ // XMM_Mov_R_M: float stores
+ insns.push((
+ Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)),
+ "F3450F11BC2480000000",
+ "movss %xmm15, 128(%r12)",
+ ));
+ insns.push((
+ Inst::xmm_mov_r_m(SseOpcode::Movsd, xmm1, Amode::imm_reg(0, rsi)),
+ "F20F110E",
+ "movsd %xmm1, 0(%rsi)",
+ ));
+
+ // XmmUnary: moves and unary float ops
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movss, RegMem::reg(xmm13), w_xmm2),
+ "F3410F10D5",
+ "movss %xmm13, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm0), w_xmm1),
+ "F20F10C8",
+ "movsd %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(
+ SseOpcode::Movsd,
+ RegMem::mem(Amode::imm_reg(0, rsi)),
+ w_xmm2,
+ ),
+ "F20F1016",
+ "movsd 0(%rsi), %xmm2",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm14), w_xmm3),
+ "F2410F10DE",
+ "movsd %xmm14, %xmm3",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movaps, RegMem::reg(xmm5), w_xmm14),
+ "440F28F5",
+ "movaps %xmm5, %xmm14",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Sqrtss, RegMem::reg(xmm7), w_xmm8),
+ "F3440F51C7",
+ "sqrtss %xmm7, %xmm8",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Sqrtsd, RegMem::reg(xmm1), w_xmm2),
+ "F20F51D1",
+ "sqrtsd %xmm1, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, RegMem::reg(xmm0), w_xmm1),
+ "F30F5AC8",
+ "cvtss2sd %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, RegMem::reg(xmm1), w_xmm0),
+ "F20F5AC1",
+ "cvtsd2ss %xmm1, %xmm0",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsb, RegMem::reg(xmm2), w_xmm1),
+ "660F381CCA",
+ "pabsb %xmm2, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsw, RegMem::reg(xmm0), w_xmm0),
+ "660F381DC0",
+ "pabsw %xmm0, %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsd, RegMem::reg(xmm10), w_xmm11),
+ "66450F381EDA",
+ "pabsd %xmm10, %xmm11",
+ ));
+
+ // Xmm to int conversions, and conversely.
+
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movd, xmm0, w_rsi, OperandSize::Size32),
+ "660F7EC6",
+ "movd %xmm0, %esi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movq, xmm2, w_rdi, OperandSize::Size64),
+ "66480F7ED7",
+ "movq %xmm2, %rdi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rsi, OperandSize::Size32),
+ "F30F2CF0",
+ "cvttss2si %xmm0, %esi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rdi, OperandSize::Size64),
+ "F3480F2CF8",
+ "cvttss2si %xmm0, %rdi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_rax, OperandSize::Size32),
+ "F20F2CC0",
+ "cvttsd2si %xmm0, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_r15, OperandSize::Size64),
+ "F24C0F2CF8",
+ "cvttsd2si %xmm0, %r15",
+ ));
+
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
+ "66410FD7C2",
+ "pmovmskb %xmm10, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
+ "0F50C2",
+ "movmskps %xmm2, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
+ "660F50C8",
+ "movmskpd %xmm0, %ecx",
+ ));
+
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(rax),
+ OperandSize::Size32,
+ w_xmm15,
+ ),
+ "66440F6EF8",
+ "movd %eax, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::mem(Amode::imm_reg(2, r10)),
+ OperandSize::Size32,
+ w_xmm9,
+ ),
+ "66450F6E4A02",
+ "movd 2(%r10), %xmm9",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(rsi),
+ OperandSize::Size32,
+ w_xmm1,
+ ),
+ "660F6ECE",
+ "movd %esi, %xmm1",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ RegMem::reg(rdi),
+ OperandSize::Size64,
+ w_xmm15,
+ ),
+ "664C0F6EFF",
+ "movq %rdi, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Cvtsi2ss,
+ RegMem::reg(rdi),
+ OperandSize::Size32,
+ w_xmm15,
+ ),
+ "F3440F2AFF",
+ "cvtsi2ss %edi, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Cvtsi2sd,
+ RegMem::reg(rsi),
+ OperandSize::Size64,
+ w_xmm1,
+ ),
+ "F2480F2ACE",
+ "cvtsi2sd %rsi, %xmm1",
+ ));
+
+ // ========================================================
+ // XmmRmi
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Psraw, RegMemImm::reg(xmm10), w_xmm1),
+ "66410FE1CA",
+ "psraw %xmm10, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(31), w_xmm1),
+ "660F72F11F",
+ "pslld $31, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Psrlq, RegMemImm::imm(1), w_xmm3),
+ "660F73D301",
+ "psrlq $1, %xmm3",
+ ));
+
+ // ========================================================
+ // XmmRmRImm
+ insns.push((
+ Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false),
+ "660FC2CD02",
+ "cmppd $2, %xmm5, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false),
+ "410FC2FF00",
+ "cmpps $0, %xmm15, %xmm7",
+ ));
+
+ // ========================================================
+ // Pertaining to atomics.
+ let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into();
+ // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing
+ // for retention of the apparently-redundant rex prefix in the 8-bit case.
+ let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into();
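+ // (For reference: REX is 0100WRXB, and with %rcx as base and %rsi as index neither X nor B is
+ // set, so any REX prefix seen in the 8-bit cases below comes solely from the byte register
+ // being stored.)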
+
+ // A general 8-bit case.
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rbx,
+ dst: am1,
+ },
+ "F0410FB09C9241010000",
+ "lock cmpxchgb %bl, 321(%r10,%rdx,4)",
+ ));
+ // Check redundant rex retention in 8-bit cases.
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rdx,
+ dst: am2.clone(),
+ },
+ "F00FB094F1C7CFFFFF",
+ "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F0400FB0B4F1C7CFFFFF",
+ "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F0440FB094F1C7CFFFFF",
+ "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: r15,
+ dst: am2.clone(),
+ },
+ "F0440FB0BCF1C7CFFFFF",
+ "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)",
+ ));
+ // 16 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I16,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "66F00FB1B4F1C7CFFFFF",
+ "lock cmpxchgw %si, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I16,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "66F0440FB194F1C7CFFFFF",
+ "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)",
+ ));
+ // 32 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I32,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F00FB1B4F1C7CFFFFF",
+ "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I32,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F0440FB194F1C7CFFFFF",
+ "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)",
+ ));
+ // 64 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I64,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F0480FB1B4F1C7CFFFFF",
+ "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I64,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F04C0FB194F1C7CFFFFF",
+ "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)",
+ ));
+
+ // AtomicRmwSeq
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I8, op: inst_common::AtomicRmwOp::Or, },
+ "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF",
+ "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, },
+ "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF",
+ "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, },
+ "418B014989C34D89D3F0450FB1190F85EFFFFFFF",
+ "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, },
+ "498B014989C34D01D3F04D0FB1190F85EFFFFFFF",
+ "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+
+ // Fence
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::MFence,
+ },
+ "0FAEF0",
+ "mfence",
+ ));
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::LFence,
+ },
+ "0FAEE8",
+ "lfence",
+ ));
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::SFence,
+ },
+ "0FAEF8",
+ "sfence",
+ ));
+
+ // ========================================================
+ // Misc instructions.
+
+ insns.push((Inst::Hlt, "CC", "hlt"));
+
+ let trap_code = TrapCode::UnreachableCodeReached;
+ insns.push((Inst::Ud2 { trap_code }, "0F0B", "ud2 unreachable"));
+
+ // ========================================================
+ // Actually run the tests!
+ let flags = settings::Flags::new(settings::builder());
+
+ use crate::settings::Configurable;
+ let mut isa_flag_builder = x64::settings::builder();
+ isa_flag_builder.enable("has_ssse3").unwrap();
+ isa_flag_builder.enable("has_sse41").unwrap();
+ let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
+
+ let rru = regs::create_reg_universe_systemv(&flags);
+ let emit_info = EmitInfo::new(flags, isa_flags);
+ for (insn, expected_encoding, expected_printing) in insns {
+ // Check the printed text is as expected.
+ let actual_printing = insn.show_rru(Some(&rru));
+ assert_eq!(expected_printing, actual_printing);
+ let mut sink = test_utils::TestCodeSink::new();
+ let mut buffer = MachBuffer::new();
+
+ insn.emit(&mut buffer, &emit_info, &mut Default::default());
+
+ // Allow one label just after the instruction (so the offset is 0).
+ let label = buffer.get_label();
+ buffer.bind_label(label);
+
+ let buffer = buffer.finish();
+ buffer.emit(&mut sink);
+ let actual_encoding = &sink.stringify();
+ assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs
new file mode 100644
index 0000000000..1172b22eff
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs
@@ -0,0 +1,2733 @@
+//! This module defines x86_64-specific machine instruction types.
+
+use crate::binemit::{CodeOffset, StackMap};
+use crate::ir::{types, ExternalName, Opcode, SourceLoc, TrapCode, Type};
+use crate::isa::x64::settings as x64_settings;
+use crate::machinst::*;
+use crate::{settings, settings::Flags, CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use regalloc::{
+ PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
+ RegUsageMapper, SpillSlot, VirtualReg, Writable,
+};
+use smallvec::SmallVec;
+use std::fmt;
+use std::string::{String, ToString};
+
+pub mod args;
+mod emit;
+#[cfg(test)]
+mod emit_tests;
+pub mod regs;
+pub mod unwind;
+
+use args::*;
+use regs::{create_reg_universe_systemv, show_ireg_sized};
+
+//=============================================================================
+// Instructions (top level): definition
+
+// Don't build these directly. Instead use the Inst:: functions to create them.
+
+/// Instructions. Destinations are on the RIGHT (a la AT&T syntax).
+#[derive(Clone)]
+pub enum Inst {
+ /// Nops of various sizes, including zero.
+ Nop { len: u8 },
+
+ // =====================================
+ // Integer instructions.
+ /// Integer arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
+ AluRmiR {
+ is_64: bool,
+ op: AluRmiROpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ },
+
+ /// Instructions on GPRs that only read src and define dst (dst is not modified): bsr, etc.
+ UnaryRmR {
+ size: u8, // 2, 4 or 8
+ op: UnaryRmROpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Bitwise not
+ Not {
+ size: u8, // 1, 2, 4 or 8
+ src: Writable<Reg>,
+ },
+
+ /// Integer negation
+ Neg {
+ size: u8, // 1, 2, 4 or 8
+ src: Writable<Reg>,
+ },
+
+ /// Integer quotient and remainder: (div idiv) $rax $rdx (reg addr)
+ Div {
+ size: u8, // 1, 2, 4 or 8
+ signed: bool,
+ divisor: RegMem,
+ },
+
+ /// The high bits (RDX) of an (un)signed multiply: RDX:RAX := RAX * rhs.
+ MulHi { size: u8, signed: bool, rhs: RegMem },
+
+ /// A synthetic sequence to implement the right inline checks for remainder and division,
+ /// assuming the dividend is in %rax.
+ /// Puts the result back into %rax for a division, or into %rdx for a remainder, mimicking
+ /// what the div instruction does.
+ /// The generated code sequence is described in the emit's function match arm for this
+ /// instruction.
+ ///
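+ /// As a rough sketch only (the emit code is authoritative): trap if the divisor is zero; in
+ /// the signed case, special-case a divisor of -1 (the INT_MIN / -1 quotient traps, and the
+ /// corresponding remainder is 0); otherwise sign- or zero-extend the dividend into %rdx and
+ /// issue the native idiv/div.
+ ///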
+ /// Note: %rdx is marked as modified by this instruction, to avoid an early clobber problem
+ /// with the temporary and divisor registers. Make sure to zero %rdx right before this
+ /// instruction, or you might run into regalloc failures where %rdx is live before its first
+ /// def!
+ CheckedDivOrRemSeq {
+ kind: DivOrRemKind,
+ size: u8,
+ /// The divisor operand. Note it's marked as modified so that it gets assigned a register
+ /// different from the temporary.
+ divisor: Writable<Reg>,
+ tmp: Option<Writable<Reg>>,
+ },
+
+ /// Sign-extends the value in rax into rdx (cwd cdq cqo), or the value in al into ah (cbw),
+ /// based on the sign of the source.
+ SignExtendData {
+ size: u8, // 1, 2, 4 or 8
+ },
+
+ /// Constant materialization: (imm32 imm64) reg.
+ /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg64.
+ Imm {
+ dst_is_64: bool,
+ simm64: u64,
+ dst: Writable<Reg>,
+ },
+
+ /// GPR to GPR move: mov (64 32) reg reg.
+ MovRR {
+ is_64: bool,
+ src: Reg,
+ dst: Writable<Reg>,
+ },
+
+ /// Zero-extended loads, except for 64 bits: movz (bl bq wl wq lq) addr reg.
+ /// Note that the lq variant doesn't really exist since the default zero-extend rule makes it
+ /// unnecessary. For that case we emit the equivalent "movl AM, reg32".
+ MovzxRmR {
+ ext_mode: ExtMode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// A plain 64-bit integer load, since MovzxRmR can't represent that.
+ Mov64MR {
+ src: SyntheticAmode,
+ dst: Writable<Reg>,
+ },
+
+ /// Loads the memory address of addr into dst.
+ LoadEffectiveAddress {
+ addr: SyntheticAmode,
+ dst: Writable<Reg>,
+ },
+
+ /// Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
+ MovsxRmR {
+ ext_mode: ExtMode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Integer stores: mov (b w l q) reg addr.
+ MovRM {
+ size: u8, // 1, 2, 4 or 8.
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// Shifts and rotates: (shl shr sar rol ror) (b w l q) imm reg.
+ ShiftR {
+ size: u8, // 1, 2, 4 or 8
+ kind: ShiftKind,
+ /// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl".
+ num_bits: Option<u8>,
+ dst: Writable<Reg>,
+ },
+
+ /// XMM (vector) shifts by an immediate or an XMM register: psll*, psrl*, psra*.
+ XmmRmiReg {
+ opcode: SseOpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ },
+
+ /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg.
+ CmpRmiR {
+ size: u8, // 1, 2, 4 or 8
+ src: RegMemImm,
+ dst: Reg,
+ },
+
+ /// Materializes the requested condition code in the destination reg.
+ Setcc { cc: CC, dst: Writable<Reg> },
+
+ /// Integer conditional move.
+ /// Overwrites the destination register.
+ Cmove {
+ /// Possible values are 2, 4 or 8. Checked in the related factory.
+ size: u8,
+ cc: CC,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ // =====================================
+ // Stack manipulation.
+ /// pushq (reg addr imm)
+ Push64 { src: RegMemImm },
+
+ /// popq reg
+ Pop64 { dst: Writable<Reg> },
+
+ // =====================================
+ // Floating-point operations.
+ /// XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg
+ XmmRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
+ /// etc.
+ ///
+ /// This differs from XmmRmR in that the dst register of XmmUnaryRmR is not used in the
+ /// computation of the instruction's result and so does not have to hold a previously valid
+ /// value. This is characteristic of mov instructions.
+ XmmUnaryRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
+ XmmMovRM {
+ op: SseOpcode,
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// XMM (vector) unary op (to move a constant value into an xmm register): movups
+ XmmLoadConst {
+ src: VCodeConstant,
+ dst: Writable<Reg>,
+ ty: Type,
+ },
+
+ /// XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si
+ XmmToGpr {
+ op: SseOpcode,
+ src: Reg,
+ dst: Writable<Reg>,
+ dst_size: OperandSize,
+ },
+
+ /// XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d}
+ GprToXmm {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ src_size: OperandSize,
+ },
+
+ /// Converts an unsigned int64 to a float32/float64.
+ CvtUint64ToFloatSeq {
+ /// Is the target a 64-bit or a 32-bit register?
+ to_f64: bool,
+ /// A copy of the source register, fed by lowering. It is marked as modified during
+ /// register allocation to make sure that the temporary registers differ from the src
+ /// register, since both registers are live at the same time in the generated code
+ /// sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr1: Writable<Reg>,
+ tmp_gpr2: Writable<Reg>,
+ },
+
+ /// Converts a scalar xmm to a signed int32/int64.
+ CvtFloatToSintSeq {
+ dst_size: OperandSize,
+ src_size: OperandSize,
+ is_saturating: bool,
+ /// A copy of the source register, fed by lowering. It is marked as modified during
+ /// register allocation to make sure that the temporary xmm register differs from the src
+ /// register, since both registers are live at the same time in the generated code
+ /// sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ },
+
+ /// Converts a scalar xmm to an unsigned int32/int64.
+ CvtFloatToUintSeq {
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ /// A copy of the source register, fed by lowering, reused as a temporary. It is marked as
+ /// modified during register allocation to make sure that the temporary xmm register
+ /// differs from the src register, since both registers are live at the same time in the
+ /// generated code sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ },
+
+ /// A sequence to compute min/max with the proper NaN semantics for xmm registers.
+ XmmMinMaxSeq {
+ size: OperandSize,
+ is_min: bool,
+ lhs: Reg,
+ rhs_dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar) conditional move.
+ /// Overwrites the destination register if cc is set.
+ XmmCmove {
+ /// Whether the cmove moves 32 or 64 bits of data.
+ is_64: bool,
+ cc: CC,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Float comparisons/tests: ucomiss/ucomisd (reg addr) reg.
+ XmmCmpRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Reg,
+ },
+
+ /// A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg
+ XmmRmRImm {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ imm: u8,
+ is64: bool,
+ },
+
+ // =====================================
+ // Control flow instructions.
+ /// Direct call: call simm32.
+ CallKnown {
+ dest: ExternalName,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ },
+
+ /// Indirect call: callq (reg mem).
+ CallUnknown {
+ dest: RegMem,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ },
+
+ /// Return.
+ Ret,
+
+ /// A placeholder instruction, generating no code, meaning that a function epilogue must be
+ /// inserted there.
+ EpiloguePlaceholder,
+
+ /// Jump to a known target: jmp simm32.
+ JmpKnown { dst: MachLabel },
+
+ /// One-way conditional branch: jcond cond target.
+ ///
+ /// This instruction is useful when we have conditional jumps depending on more than two
+ /// conditions; see, for instance, the lowering of Brz/brnz with Fcmp inputs.
+ ///
+ /// A note of caution: in contexts where the branch target is another block, this has to be the
+ /// same successor as the one specified in the terminator branch of the current block.
+ /// Otherwise, this might confuse register allocation by creating new invisible edges.
+ JmpIf { cc: CC, taken: MachLabel },
+
+ /// Two-way conditional branch: jcond cond target target.
+ /// Emitted as a compound sequence; the MachBuffer will shrink it as appropriate.
+ JmpCond {
+ cc: CC,
+ taken: MachLabel,
+ not_taken: MachLabel,
+ },
+
+ /// Jump-table sequence, as one compound instruction (see note in lower.rs for rationale).
+ /// The generated code sequence is described in the emit's function match arm for this
+ /// instruction.
+ /// See the comment in lowering about the temporaries' signedness.
+ JmpTableSeq {
+ idx: Reg,
+ tmp1: Writable<Reg>,
+ tmp2: Writable<Reg>,
+ default_target: MachLabel,
+ targets: Vec<MachLabel>,
+ targets_for_term: Vec<MachLabel>,
+ },
+
+ /// Indirect jump: jmpq (reg mem).
+ JmpUnknown { target: RegMem },
+
+ /// Traps if the condition code is set.
+ TrapIf { cc: CC, trap_code: TrapCode },
+
+ /// A debug trap.
+ Hlt,
+
+ /// An instruction that will always trigger the illegal instruction exception.
+ Ud2 { trap_code: TrapCode },
+
+ /// Loads an external symbol in a register, with a relocation: movabsq $name, dst
+ LoadExtName {
+ dst: Writable<Reg>,
+ name: Box<ExternalName>,
+ offset: i64,
+ },
+
+ // =====================================
+ // Instructions pertaining to atomic memory accesses.
+ /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions:
+ ///
+ /// `dst` (read) address
+ /// `src` (read) replacement value
+ /// %rax (modified) in: expected value, out: value that was actually at `dst`
+ /// %rflags is written. Do not assume anything about it after the instruction.
+ ///
+ /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as
+ /// they were before.
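+ ///
+ /// For instance (an illustrative sketch only; `expected`, `%rbx` and `(%rdi)` stand in for the
+ /// caller's operands), a 64-bit compare-and-swap looks like:
+ ///
+ /// ```text
+ ///     movq $expected, %rax
+ ///     lock cmpxchgq %rbx, (%rdi)   # ZF=1 and memory updated iff %rax matched;
+ ///                                  # otherwise %rax now holds the value that was observed
+ /// ```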
+ LockCmpxchg {
+ ty: Type, // I8, I16, I32 or I64
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction.
+ /// This atomically modifies a value in memory and returns the old value. The sequence
+ /// consists of an initial "normal" load from `dst`, followed by a loop which computes the
+ /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native
+    /// instruction `lock cmpxchg{b,w,l,q}`. The loop iterates until the CAS is successful.
+ /// If there is no contention, there will be only one pass through the loop body. The
+ /// sequence does *not* perform any explicit memory fence instructions
+ /// (mfence/sfence/lfence).
+ ///
+ /// Note that the transaction is atomic in the sense that, as observed by some other thread,
+ /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense
+ /// of guaranteeing that no other thread writes to `dst` in between the initial load and the
+ /// CAS -- but that would cause the CAS to fail unless the other thread's last write before
+ /// the CAS wrote the same value that was already there. In other words, this
+ /// implementation suffers (unavoidably) from the A-B-A problem.
+ ///
+ /// This instruction sequence has fixed register uses as follows:
+ ///
+ /// %r9 (read) address
+ /// %r10 (read) second operand for `op`
+ /// %r11 (written) scratch reg; value afterwards has no meaning
+ /// %rax (written) the old value at %r9
+ /// %rflags is written. Do not assume anything about it after the instruction.
+ AtomicRmwSeq {
+ ty: Type, // I8, I16, I32 or I64
+ op: inst_common::AtomicRmwOp,
+ },
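+    // An illustrative sketch of the generated sequence (the authoritative version lives in the
+    // emit function's match arm for this instruction):
+    //
+    //      mov{zbq,zwq,zlq,q}    (%r9), %rax     ; load the current value
+    //    again:
+    //      mov                   %rax, %r11
+    //      <op>                  %r10, %r11      ; compute the candidate new value
+    //      lock cmpxchg{b,w,l,q} %r11, (%r9)     ; try to install it; %rax = expected value
+    //      jnz again                             ; another thread got there first; retry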
+
+ /// A memory fence (mfence, lfence or sfence).
+ Fence { kind: FenceKind },
+
+ // =====================================
+ // Meta-instructions generating no code.
+    /// Marker, no-op in generated code: the SP "virtual offset" is adjusted. This
+    /// controls how SyntheticAmode::NominalSPOffset args are lowered.
+ VirtualSPOffsetAdj { offset: i64 },
+
+ /// Provides a way to tell the register allocator that the upcoming sequence of instructions
+ /// will overwrite `dst` so it should be considered as a `def`; use this with care.
+ ///
+ /// This is useful when we have a sequence of instructions whose register usages are nominally
+ /// `mod`s, but such that the combination of operations creates a result that is independent of
+ /// the initial register value. It's thus semantically a `def`, not a `mod`, when all the
+ /// instructions are taken together, so we want to ensure the register is defined (its
+ /// live-range starts) prior to the sequence to keep analyses happy.
+ ///
+ /// One alternative would be a compound instruction that somehow encapsulates the others and
+ /// reports its own `def`s/`use`s/`mod`s; this adds complexity (the instruction list is no
+ /// longer flat) and requires knowledge about semantics and initial-value independence anyway.
+ XmmUninitializedValue { dst: Writable<Reg> },
+}
+
+pub(crate) fn low32_will_sign_extend_to_64(x: u64) -> bool {
+ let xs = x as i64;
+ xs == ((xs << 32) >> 32)
+}
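+
+// A small illustrative check of the helper above (the values are chosen purely as examples): a
+// u64 round-trips through a 32-bit immediate exactly when sign-extending its low 32 bits
+// reproduces it, i.e. when bits 63..31 are all zeroes or all ones.
+#[cfg(test)]
+#[test]
+fn low32_sign_extension_examples() {
+    assert!(low32_will_sign_extend_to_64(0x7fff_ffff));
+    assert!(!low32_will_sign_extend_to_64(0x8000_0000));
+    assert!(low32_will_sign_extend_to_64(0xffff_ffff_8000_0000));
+    assert!(!low32_will_sign_extend_to_64(0x1_0000_0000));
+}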
+
+impl Inst {
+ fn isa_requirement(&self) -> Option<InstructionSet> {
+ match self {
+            // These instructions are part of the baseline x86-64 / SSE2 instruction set, which
+            // Cranelift requires as a minimum, so they don't have to be checked.
+ Inst::AluRmiR { .. }
+ | Inst::AtomicRmwSeq { .. }
+ | Inst::CallKnown { .. }
+ | Inst::CallUnknown { .. }
+ | Inst::CheckedDivOrRemSeq { .. }
+ | Inst::Cmove { .. }
+ | Inst::CmpRmiR { .. }
+ | Inst::CvtFloatToSintSeq { .. }
+ | Inst::CvtFloatToUintSeq { .. }
+ | Inst::CvtUint64ToFloatSeq { .. }
+ | Inst::Div { .. }
+ | Inst::EpiloguePlaceholder
+ | Inst::Fence { .. }
+ | Inst::Hlt
+ | Inst::Imm { .. }
+ | Inst::JmpCond { .. }
+ | Inst::JmpIf { .. }
+ | Inst::JmpKnown { .. }
+ | Inst::JmpTableSeq { .. }
+ | Inst::JmpUnknown { .. }
+ | Inst::LoadEffectiveAddress { .. }
+ | Inst::LoadExtName { .. }
+ | Inst::LockCmpxchg { .. }
+ | Inst::Mov64MR { .. }
+ | Inst::MovRM { .. }
+ | Inst::MovRR { .. }
+ | Inst::MovsxRmR { .. }
+ | Inst::MovzxRmR { .. }
+ | Inst::MulHi { .. }
+ | Inst::Neg { .. }
+ | Inst::Not { .. }
+ | Inst::Nop { .. }
+ | Inst::Pop64 { .. }
+ | Inst::Push64 { .. }
+ | Inst::Ret
+ | Inst::Setcc { .. }
+ | Inst::ShiftR { .. }
+ | Inst::SignExtendData { .. }
+ | Inst::TrapIf { .. }
+ | Inst::Ud2 { .. }
+ | Inst::UnaryRmR { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::XmmCmove { .. }
+ | Inst::XmmCmpRmR { .. }
+ | Inst::XmmLoadConst { .. }
+ | Inst::XmmMinMaxSeq { .. }
+ | Inst::XmmUninitializedValue { .. } => None,
+
+ // These use dynamic SSE opcodes.
+ Inst::GprToXmm { op, .. }
+ | Inst::XmmMovRM { op, .. }
+ | Inst::XmmRmiReg { opcode: op, .. }
+ | Inst::XmmRmR { op, .. }
+ | Inst::XmmRmRImm { op, .. }
+ | Inst::XmmToGpr { op, .. }
+ | Inst::XmmUnaryRmR { op, .. } => Some(op.available_from()),
+ }
+ }
+}
+
+// Handy constructors for Insts.
+
+impl Inst {
+ pub(crate) fn nop(len: u8) -> Self {
+ debug_assert!(len <= 16);
+ Self::Nop { len }
+ }
+
+ pub(crate) fn alu_rmi_r(
+ is_64: bool,
+ op: AluRmiROpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ ) -> Self {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Self::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst,
+ }
+ }
+
+ pub(crate) fn unary_rm_r(
+ size: u8,
+ op: UnaryRmROpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ ) -> Self {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2);
+ Self::UnaryRmR { size, op, src, dst }
+ }
+
+ pub(crate) fn not(size: u8, src: Writable<Reg>) -> Inst {
+ debug_assert_eq!(src.to_reg().get_class(), RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Not { size, src }
+ }
+
+ pub(crate) fn neg(size: u8, src: Writable<Reg>) -> Inst {
+ debug_assert_eq!(src.to_reg().get_class(), RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Neg { size, src }
+ }
+
+ pub(crate) fn div(size: u8, signed: bool, divisor: RegMem) -> Inst {
+ divisor.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ }
+ }
+
+ pub(crate) fn mul_hi(size: u8, signed: bool, rhs: RegMem) -> Inst {
+ rhs.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::MulHi { size, signed, rhs }
+ }
+
+ pub(crate) fn checked_div_or_rem_seq(
+ kind: DivOrRemKind,
+ size: u8,
+ divisor: Writable<Reg>,
+ tmp: Option<Writable<Reg>>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(divisor.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp
+ .map(|tmp| tmp.to_reg().get_class() == RegClass::I64)
+ .unwrap_or(true));
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ tmp,
+ }
+ }
+
+ pub(crate) fn sign_extend_data(size: u8) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::SignExtendData { size }
+ }
+
+ pub(crate) fn imm(size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+        // Try to generate a 32-bit immediate when the upper 32 bits are zero: a 32-bit `movl`
+        // implicitly zero-extends into the upper half, so it produces the same 64-bit value.
+ let dst_is_64 = size == OperandSize::Size64 && simm64 > u32::max_value() as u64;
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ }
+ }
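+
+    // For instance (illustrative values): `Inst::imm(OperandSize::Size64, 0xffff_ffff, dst)` can
+    // use the 32-bit `movl` form, since zero-extension already yields the full 64-bit value,
+    // whereas `Inst::imm(OperandSize::Size64, 0x1_0000_0000, dst)` requires `movabsq`.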
+
+ pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst {
+ debug_assert!(src.get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovRR { is_64, src, dst }
+ }
+
+ // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
+ pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUnaryRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ debug_assert!(ty.is_vector() && ty.bits() == 128);
+ Inst::XmmLoadConst { src, dst, ty }
+ }
+
+    /// Convenience helper for unary float operations.
+ pub(crate) fn xmm_unary_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUnaryRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUninitializedValue { dst }
+ }
+
+ pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
+ debug_assert!(src.get_class() == RegClass::V128);
+ Inst::XmmMovRM {
+ op,
+ src,
+ dst: dst.into(),
+ }
+ }
+
+ pub(crate) fn xmm_to_gpr(
+ op: SseOpcode,
+ src: Reg,
+ dst: Writable<Reg>,
+ dst_size: OperandSize,
+ ) -> Inst {
+ debug_assert!(src.get_class() == RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ }
+ }
+
+ pub(crate) fn gpr_to_xmm(
+ op: SseOpcode,
+ src: RegMem,
+ src_size: OperandSize,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::GprToXmm {
+ op,
+ src,
+ dst,
+ src_size,
+ }
+ }
+
+ pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.get_class() == RegClass::V128);
+ Inst::XmmCmpRmR { op, src, dst }
+ }
+
+ pub(crate) fn cvt_u64_to_float_seq(
+ to_f64: bool,
+ src: Writable<Reg>,
+ tmp_gpr1: Writable<Reg>,
+ tmp_gpr2: Writable<Reg>,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp_gpr1.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp_gpr2.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::CvtUint64ToFloatSeq {
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ to_f64,
+ }
+ }
+
+ pub(crate) fn cvt_float_to_sint_seq(
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::CvtFloatToSintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ }
+ }
+
+ pub(crate) fn cvt_float_to_uint_seq(
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::CvtFloatToUintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ }
+ }
+
+ pub(crate) fn xmm_min_max_seq(
+ size: OperandSize,
+ is_min: bool,
+ lhs: Reg,
+ rhs_dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert_eq!(lhs.get_class(), RegClass::V128);
+ debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128);
+ Inst::XmmMinMaxSeq {
+ size,
+ is_min,
+ lhs,
+ rhs_dst,
+ }
+ }
+
+ pub(crate) fn xmm_rm_r_imm(
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ imm: u8,
+ is64: bool,
+ ) -> Inst {
+ Inst::XmmRmRImm {
+ op,
+ src,
+ dst,
+ imm,
+ is64,
+ }
+ }
+
+ pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovzxRmR { ext_mode, src, dst }
+ }
+
+ pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmRmiReg { opcode, src, dst }
+ }
+
+ pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovsxRmR { ext_mode, src, dst }
+ }
+
+ pub(crate) fn mov64_m_r(src: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Mov64MR {
+ src: src.into(),
+ dst,
+ }
+ }
+
+    /// A convenience function for using a `RegMem` as the source of a move.
+ pub(crate) fn mov64_rm_r(src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ match src {
+ RegMem::Reg { reg } => Self::mov_r_r(true, reg, dst),
+ RegMem::Mem { addr } => Self::mov64_m_r(addr, dst),
+ }
+ }
+
+ pub(crate) fn mov_r_m(
+ size: u8, // 1, 2, 4 or 8
+ src: Reg,
+ dst: impl Into<SyntheticAmode>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(src.get_class() == RegClass::I64);
+ Inst::MovRM {
+ size,
+ src,
+ dst: dst.into(),
+ }
+ }
+
+ pub(crate) fn lea(addr: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::LoadEffectiveAddress {
+ addr: addr.into(),
+ dst,
+ }
+ }
+
+ pub(crate) fn shift_r(
+ size: u8,
+ kind: ShiftKind,
+ num_bits: Option<u8>,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(if let Some(num_bits) = num_bits {
+ num_bits < size * 8
+ } else {
+ true
+ });
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ }
+ }
+
+ /// Does a comparison of dst - src for operands of size `size`, as stated by the machine
+ /// instruction semantics. Be careful with the order of parameters!
+ pub(crate) fn cmp_rmi_r(
+ size: u8, // 1, 2, 4 or 8
+ src: RegMemImm,
+ dst: Reg,
+ ) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(dst.get_class() == RegClass::I64);
+ Inst::CmpRmiR { size, src, dst }
+ }
+
+ pub(crate) fn trap(trap_code: TrapCode) -> Inst {
+        Inst::Ud2 { trap_code }
+ }
+
+ pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Setcc { cc, dst }
+ }
+
+ pub(crate) fn cmove(size: u8, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Cmove { size, cc, src, dst }
+ }
+
+ pub(crate) fn xmm_cmove(is_64: bool, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ }
+ }
+
+ pub(crate) fn push64(src: RegMemImm) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ Inst::Push64 { src }
+ }
+
+ pub(crate) fn pop64(dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Pop64 { dst }
+ }
+
+ pub(crate) fn call_known(
+ dest: ExternalName,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ ) -> Inst {
+ Inst::CallKnown {
+ dest,
+ uses,
+ defs,
+ opcode,
+ }
+ }
+
+ pub(crate) fn call_unknown(
+ dest: RegMem,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ ) -> Inst {
+ dest.assert_regclass_is(RegClass::I64);
+ Inst::CallUnknown {
+ dest,
+ uses,
+ defs,
+ opcode,
+ }
+ }
+
+ pub(crate) fn ret() -> Inst {
+ Inst::Ret
+ }
+
+ pub(crate) fn epilogue_placeholder() -> Inst {
+ Inst::EpiloguePlaceholder
+ }
+
+ pub(crate) fn jmp_known(dst: MachLabel) -> Inst {
+ Inst::JmpKnown { dst }
+ }
+
+ pub(crate) fn jmp_if(cc: CC, taken: MachLabel) -> Inst {
+ Inst::JmpIf { cc, taken }
+ }
+
+ pub(crate) fn jmp_cond(cc: CC, taken: MachLabel, not_taken: MachLabel) -> Inst {
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ }
+ }
+
+ pub(crate) fn jmp_unknown(target: RegMem) -> Inst {
+ target.assert_regclass_is(RegClass::I64);
+ Inst::JmpUnknown { target }
+ }
+
+ pub(crate) fn trap_if(cc: CC, trap_code: TrapCode) -> Inst {
+ Inst::TrapIf { cc, trap_code }
+ }
+
+ /// Choose which instruction to use for loading a register value from memory. For loads smaller
+ /// than 64 bits, this method expects a way to extend the value (i.e. [ExtKind::SignExtend],
+ /// [ExtKind::ZeroExtend]); loads with no extension necessary will ignore this.
+ pub(crate) fn load(
+ ty: Type,
+ from_addr: impl Into<SyntheticAmode>,
+ to_reg: Writable<Reg>,
+ ext_kind: ExtKind,
+ ) -> Inst {
+ let rc = to_reg.to_reg().get_class();
+ match rc {
+ RegClass::I64 => {
+ let ext_mode = match ty.bytes() {
+ 1 => Some(ExtMode::BQ),
+ 2 => Some(ExtMode::WQ),
+ 4 => Some(ExtMode::LQ),
+ 8 => None,
+ _ => unreachable!("the type should never use a scalar load: {}", ty),
+ };
+ if let Some(ext_mode) = ext_mode {
+ // Values smaller than 64 bits must be extended in some way.
+ match ext_kind {
+ ExtKind::SignExtend => {
+ Inst::movsx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg)
+ }
+ ExtKind::ZeroExtend => {
+ Inst::movzx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg)
+ }
+ ExtKind::None => panic!(
+ "expected an extension kind for extension mode: {:?}",
+ ext_mode
+ ),
+ }
+ } else {
+ // 64-bit values can be moved directly.
+ Inst::mov64_m_r(from_addr, to_reg)
+ }
+ }
+ RegClass::V128 => {
+ let opcode = match ty {
+ types::F32 => SseOpcode::Movss,
+ types::F64 => SseOpcode::Movsd,
+ types::F32X4 => SseOpcode::Movups,
+ types::F64X2 => SseOpcode::Movupd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu,
+ _ => unimplemented!("unable to load type: {}", ty),
+ };
+ Inst::xmm_unary_rm_r(opcode, RegMem::mem(from_addr), to_reg)
+ }
+ _ => panic!("unable to generate load for register class: {:?}", rc),
+ }
+ }
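+
+    // As an example of the selection above (illustrative only): an I32 load with
+    // `ExtKind::ZeroExtend` becomes a `movzx`-style load with `ExtMode::LQ`, an I64 load becomes
+    // a plain 64-bit `movq`, and a 128-bit vector load such as F32X4 becomes `movups`.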
+
+ /// Choose which instruction to use for storing a register value to memory.
+ pub(crate) fn store(ty: Type, from_reg: Reg, to_addr: impl Into<SyntheticAmode>) -> Inst {
+ let rc = from_reg.get_class();
+ match rc {
+ RegClass::I64 => {
+ // Always store the full register, to ensure that the high bits are properly set
+ // when doing a full reload.
+ Inst::mov_r_m(8 /* bytes */, from_reg, to_addr)
+ }
+ RegClass::V128 => {
+ let opcode = match ty {
+ types::F32 => SseOpcode::Movss,
+ types::F64 => SseOpcode::Movsd,
+ types::F32X4 => SseOpcode::Movups,
+ types::F64X2 => SseOpcode::Movupd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu,
+ _ => unimplemented!("unable to store type: {}", ty),
+ };
+ Inst::xmm_mov_r_m(opcode, from_reg, to_addr)
+ }
+ _ => panic!("unable to generate store for register class: {:?}", rc),
+ }
+ }
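+
+    // For example (illustrative only): an I8 or I32 store still emits a full 8-byte integer
+    // store, an F64 store becomes `movsd`, and a 128-bit vector store such as I32X4 becomes
+    // `movdqu`.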
+}
+
+// Inst helpers.
+
+impl Inst {
+ /// In certain cases, instructions of this format can act as a definition of an XMM register,
+ /// producing a value that is independent of its initial value.
+ ///
+ /// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to
+ /// itself will generate all ones as a result, regardless of its value. From the register
+ /// allocator's point of view, we should (i) record the first register, which is normally a
+ /// mod, as a def instead; and (ii) not record the second register as a use, because it is the
+ /// same as the first register (already handled).
+ fn produces_const(&self) -> bool {
+ match self {
+ Self::AluRmiR { op, src, dst, .. } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub)
+ }
+
+ Self::XmmRmR { op, src, dst, .. } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == SseOpcode::Xorps
+ || *op == SseOpcode::Xorpd
+ || *op == SseOpcode::Pxor
+ || *op == SseOpcode::Pcmpeqb
+ || *op == SseOpcode::Pcmpeqw
+ || *op == SseOpcode::Pcmpeqd
+ || *op == SseOpcode::Pcmpeqq)
+ }
+
+ Self::XmmRmRImm {
+ op, src, dst, imm, ..
+ } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps)
+ && *imm == FcmpImm::Equal.encode()
+ }
+
+ _ => false,
+ }
+ }
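+
+    // Concretely (an illustrative reading of the cases above): `xorq %rax, %rax` always produces
+    // zero, and `pcmpeqd %xmm0, %xmm0` (or `cmpps`/`cmppd` with the `Equal` immediate against the
+    // same register) always produces all-ones, whatever the register held beforehand; in both
+    // cases the destination can be treated as a pure def.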
+
+ /// Choose which instruction to use for comparing two values for equality.
+ pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to),
+ types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to),
+ types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to),
+ types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to),
+ types::F32X4 => {
+ Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false)
+ }
+ types::F64X2 => {
+ Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false)
+ }
+ _ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise AND on two values.
+ pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
+ _ => unimplemented!("unimplemented type for Inst::and: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise AND NOT on two values.
+ pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
+ _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise OR on two values.
+ pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
+ _ => unimplemented!("unimplemented type for Inst::or: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise XOR on two values.
+ pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
+ _ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: printing
+
+impl PrettyPrint for Inst {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ fn ljustify(s: String) -> String {
+ let w = 7;
+ if s.len() >= w {
+ s
+ } else {
+ let need = usize::min(w, w - s.len());
+ s + &format!("{nil: <width$}", nil = "", width = need)
+ }
+ }
+
+ fn ljustify2(s1: String, s2: String) -> String {
+ ljustify(s1 + &s2)
+ }
+
+ fn suffix_lq(is_64: bool) -> String {
+ (if is_64 { "q" } else { "l" }).to_string()
+ }
+
+ fn size_lq(is_64: bool) -> u8 {
+ if is_64 {
+ 8
+ } else {
+ 4
+ }
+ }
+
+ fn suffix_bwlq(size: u8) -> String {
+ match size {
+ 1 => "b".to_string(),
+ 2 => "w".to_string(),
+ 4 => "l".to_string(),
+ 8 => "q".to_string(),
+ _ => panic!("Inst(x64).show.suffixBWLQ: size={}", size),
+ }
+ }
+
+ match self {
+ Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),
+
+ Inst::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst,
+ } => format!(
+ "{} {}, {}",
+ ljustify2(op.to_string(), suffix_lq(*is_64)),
+ src.show_rru_sized(mb_rru, size_lq(*is_64)),
+ show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)),
+ ),
+
+ Inst::UnaryRmR { src, dst, op, size } => format!(
+ "{} {}, {}",
+ ljustify2(op.to_string(), suffix_bwlq(*size)),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size),
+ ),
+
+ Inst::Not { size, src } => format!(
+ "{} {}",
+ ljustify2("not".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(src.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::Neg { size, src } => format!(
+ "{} {}",
+ ljustify2("neg".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(src.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ ..
+ } => format!(
+ "{} {}",
+ ljustify(if *signed {
+ "idiv".to_string()
+ } else {
+ "div".into()
+ }),
+ divisor.show_rru_sized(mb_rru, *size)
+ ),
+
+ Inst::MulHi {
+ size, signed, rhs, ..
+ } => format!(
+ "{} {}",
+ ljustify(if *signed {
+ "imul".to_string()
+ } else {
+ "mul".to_string()
+ }),
+ rhs.show_rru_sized(mb_rru, *size)
+ ),
+
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ ..
+ } => format!(
+ "{} $rax:$rdx, {}",
+ match kind {
+ DivOrRemKind::SignedDiv => "sdiv",
+ DivOrRemKind::UnsignedDiv => "udiv",
+ DivOrRemKind::SignedRem => "srem",
+ DivOrRemKind::UnsignedRem => "urem",
+ },
+ show_ireg_sized(divisor.to_reg(), mb_rru, *size),
+ ),
+
+ Inst::SignExtendData { size } => match size {
+ 1 => "cbw",
+ 2 => "cwd",
+ 4 => "cdq",
+ 8 => "cqo",
+ _ => unreachable!(),
+ }
+ .into(),
+
+ Inst::XmmUnaryRmR { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, op.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmMovRM { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ show_ireg_sized(*src, mb_rru, 8),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmRmR { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmMinMaxSeq {
+ lhs,
+ rhs_dst,
+ is_min,
+ size,
+ } => format!(
+ "{} {}, {}",
+ ljustify2(
+ if *is_min {
+ "xmm min seq ".to_string()
+ } else {
+ "xmm max seq ".to_string()
+ },
+ match size {
+ OperandSize::Size32 => "f32",
+ OperandSize::Size64 => "f64",
+ }
+ .into()
+ ),
+ show_ireg_sized(*lhs, mb_rru, 8),
+ show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!(
+ "{} ${}, {}, {}",
+ ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
+ imm,
+ src.show_rru(mb_rru),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmUninitializedValue { dst } => format!(
+ "{} {}",
+ ljustify("uninit".into()),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmLoadConst { src, dst, .. } => {
+ format!("load_const {:?}, {}", src, dst.show_rru(mb_rru),)
+ }
+
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ } => {
+ let dst_size = match dst_size {
+ OperandSize::Size32 => 4,
+ OperandSize::Size64 => 8,
+ };
+ format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru(mb_rru),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size),
+ )
+ }
+
+ Inst::GprToXmm {
+ op,
+ src,
+ src_size,
+ dst,
+ } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, src_size.to_bytes()),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::XmmCmpRmR { op, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, 8),
+ show_ireg_sized(*dst, mb_rru, 8),
+ ),
+
+ Inst::CvtUint64ToFloatSeq {
+ src, dst, to_f64, ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "u64_to_{}_seq",
+ if *to_f64 { "f64" } else { "f32" }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::CvtFloatToSintSeq {
+ src,
+ dst,
+ src_size,
+ dst_size,
+ ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "cvt_float{}_to_sint{}_seq",
+ if *src_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ },
+ if *dst_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()),
+ ),
+
+ Inst::CvtFloatToUintSeq {
+ src,
+ dst,
+ src_size,
+ dst_size,
+ ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "cvt_float{}_to_uint{}_seq",
+ if *src_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ },
+ if *dst_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()),
+ ),
+
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ } => {
+ if *dst_is_64 {
+ format!(
+ "{} ${}, {}",
+ ljustify("movabsq".to_string()),
+ *simm64 as i64,
+ show_ireg_sized(dst.to_reg(), mb_rru, 8)
+ )
+ } else {
+ format!(
+ "{} ${}, {}",
+ ljustify("movl".to_string()),
+ (*simm64 as u32) as i32,
+ show_ireg_sized(dst.to_reg(), mb_rru, 4)
+ )
+ }
+ }
+
+ Inst::MovRR { is_64, src, dst } => format!(
+ "{} {}, {}",
+ ljustify2("mov".to_string(), suffix_lq(*is_64)),
+ show_ireg_sized(*src, mb_rru, size_lq(*is_64)),
+ show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64))
+ ),
+
+ Inst::MovzxRmR {
+ ext_mode, src, dst, ..
+ } => {
+ if *ext_mode == ExtMode::LQ {
+ format!(
+ "{} {}, {}",
+ ljustify("movl".to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 4)
+ )
+ } else {
+ format!(
+ "{} {}, {}",
+ ljustify2("movz".to_string(), ext_mode.to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
+ )
+ }
+ }
+
+ Inst::Mov64MR { src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify("movq".to_string()),
+ src.show_rru(mb_rru),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::LoadEffectiveAddress { addr, dst } => format!(
+ "{} {}, {}",
+ ljustify("lea".to_string()),
+ addr.show_rru(mb_rru),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::MovsxRmR {
+ ext_mode, src, dst, ..
+ } => format!(
+ "{} {}, {}",
+ ljustify2("movs".to_string(), ext_mode.to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
+ ),
+
+ Inst::MovRM { size, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify2("mov".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(*src, mb_rru, *size),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ } => match num_bits {
+ None => format!(
+ "{} %cl, {}",
+ ljustify2(kind.to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+
+ Some(num_bits) => format!(
+ "{} ${}, {}",
+ ljustify2(kind.to_string(), suffix_bwlq(*size)),
+ num_bits,
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+ },
+
+ Inst::XmmRmiReg { opcode, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(opcode.to_string()),
+ src.show_rru(mb_rru),
+ dst.to_reg().show_rru(mb_rru)
+ ),
+
+ Inst::CmpRmiR { size, src, dst } => format!(
+ "{} {}, {}",
+ ljustify2("cmp".to_string(), suffix_bwlq(*size)),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(*dst, mb_rru, *size)
+ ),
+
+ Inst::Setcc { cc, dst } => format!(
+ "{} {}",
+ ljustify2("set".to_string(), cc.to_string()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 1)
+ ),
+
+ Inst::Cmove { size, cc, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(format!("cmov{}{}", cc.to_string(), suffix_bwlq(*size))),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ } => {
+ let size = if *is_64 { 8 } else { 4 };
+ format!(
+ "j{} $next; mov{} {}, {}; $next: ",
+ cc.invert().to_string(),
+ if *is_64 { "sd" } else { "ss" },
+ src.show_rru_sized(mb_rru, size),
+ show_ireg_sized(dst.to_reg(), mb_rru, size)
+ )
+ }
+
+ Inst::Push64 { src } => {
+ format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
+ }
+
+ Inst::Pop64 { dst } => {
+ format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru))
+ }
+
+ Inst::CallKnown { dest, .. } => format!("{} {:?}", ljustify("call".to_string()), dest),
+
+ Inst::CallUnknown { dest, .. } => format!(
+ "{} *{}",
+ ljustify("call".to_string()),
+ dest.show_rru(mb_rru)
+ ),
+
+ Inst::Ret => "ret".to_string(),
+
+ Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
+
+ Inst::JmpKnown { dst } => {
+ format!("{} {}", ljustify("jmp".to_string()), dst.to_string())
+ }
+
+ Inst::JmpIf { cc, taken } => format!(
+ "{} {}",
+ ljustify2("j".to_string(), cc.to_string()),
+ taken.to_string(),
+ ),
+
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ } => format!(
+ "{} {}; j {}",
+ ljustify2("j".to_string(), cc.to_string()),
+ taken.to_string(),
+ not_taken.to_string()
+ ),
+
+ Inst::JmpTableSeq { idx, .. } => {
+ format!("{} {}", ljustify("br_table".into()), idx.show_rru(mb_rru))
+ }
+
+ Inst::JmpUnknown { target } => format!(
+ "{} *{}",
+ ljustify("jmp".to_string()),
+ target.show_rru(mb_rru)
+ ),
+
+ Inst::TrapIf { cc, trap_code, .. } => {
+ format!("j{} ; ud2 {} ;", cc.invert().to_string(), trap_code)
+ }
+
+ Inst::LoadExtName {
+ dst, name, offset, ..
+ } => format!(
+ "{} {}+{}, {}",
+ ljustify("movaps".into()),
+ name,
+ offset,
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::LockCmpxchg { ty, src, dst, .. } => {
+ let size = ty.bytes() as u8;
+ format!("lock cmpxchg{} {}, {}",
+ suffix_bwlq(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru))
+ }
+
+ Inst::AtomicRmwSeq { ty, op, .. } => {
+ format!(
+ "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}",
+ ty.bits(), op)
+ },
+
+ Inst::Fence { kind } => {
+ match kind {
+ FenceKind::MFence => "mfence".to_string(),
+ FenceKind::LFence => "lfence".to_string(),
+ FenceKind::SFence => "sfence".to_string(),
+ }
+ }
+
+ Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
+
+ Inst::Hlt => "hlt".into(),
+
+ Inst::Ud2 { trap_code } => format!("ud2 {}", trap_code),
+ }
+ }
+}
+
+// Temp hook for legacy printing machinery
+impl fmt::Debug for Inst {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ // Print the insn without a Universe :-(
+ write!(fmt, "{}", self.show_rru(None))
+ }
+}
+
+fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
+ // This is a bit subtle. If some register is in the modified set, then it may not be in either
+ // the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
+    // regalloc.rs will "fix" this for us by removing the modified set from the use and def
+ // sets.
+ match inst {
+ Inst::AluRmiR { src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::Not { src, .. } => {
+ collector.add_mod(*src);
+ }
+ Inst::Neg { src, .. } => {
+ collector.add_mod(*src);
+ }
+ Inst::Div { size, divisor, .. } => {
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ if *size == 1 {
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ } else {
+ collector.add_mod(Writable::from_reg(regs::rdx()));
+ }
+ divisor.get_regs_as_uses(collector);
+ }
+ Inst::MulHi { rhs, .. } => {
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ rhs.get_regs_as_uses(collector);
+ }
+ Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => {
+            // Mark both fixed registers as mods, to avoid an early-clobber problem in codegen
+            // (i.e. the temporary being allocated one of the fixed registers). This requires writing
+ // the rdx register *before* the instruction, which is not too bad.
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ collector.add_mod(Writable::from_reg(regs::rdx()));
+ collector.add_mod(*divisor);
+ if let Some(tmp) = tmp {
+ collector.add_def(*tmp);
+ }
+ }
+ Inst::SignExtendData { size } => match size {
+ 1 => collector.add_mod(Writable::from_reg(regs::rax())),
+ 2 | 4 | 8 => {
+ collector.add_use(regs::rax());
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ }
+ _ => unreachable!(),
+ },
+ Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::XmmRmR { src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::XmmRmRImm { op, src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else if *op == SseOpcode::Pextrb
+ || *op == SseOpcode::Pextrw
+ || *op == SseOpcode::Pextrd
+ || *op == SseOpcode::Pshufd
+ {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::XmmUninitializedValue { dst } => collector.add_def(*dst),
+ Inst::XmmLoadConst { dst, .. } => collector.add_def(*dst),
+ Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => {
+ collector.add_use(*lhs);
+ collector.add_mod(*rhs_dst);
+ }
+ Inst::XmmRmiReg { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ Inst::XmmMovRM { src, dst, .. } => {
+ collector.add_use(*src);
+ dst.get_regs_as_uses(collector);
+ }
+ Inst::XmmCmpRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_use(*dst);
+ }
+ Inst::Imm { dst, .. } => {
+ collector.add_def(*dst);
+ }
+ Inst::MovRR { src, dst, .. } | Inst::XmmToGpr { src, dst, .. } => {
+ collector.add_use(*src);
+ collector.add_def(*dst);
+ }
+ Inst::GprToXmm { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::CvtUint64ToFloatSeq {
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ ..
+ } => {
+ collector.add_mod(*src);
+ collector.add_def(*dst);
+ collector.add_def(*tmp_gpr1);
+ collector.add_def(*tmp_gpr2);
+ }
+ Inst::CvtFloatToSintSeq {
+ src,
+ dst,
+ tmp_xmm,
+ tmp_gpr,
+ ..
+ }
+ | Inst::CvtFloatToUintSeq {
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ ..
+ } => {
+ collector.add_mod(*src);
+ collector.add_def(*dst);
+ collector.add_def(*tmp_gpr);
+ collector.add_def(*tmp_xmm);
+ }
+ Inst::MovzxRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst)
+ }
+ Inst::MovsxRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::MovRM { src, dst, .. } => {
+ collector.add_use(*src);
+ dst.get_regs_as_uses(collector);
+ }
+ Inst::ShiftR { num_bits, dst, .. } => {
+ if num_bits.is_none() {
+ collector.add_use(regs::rcx());
+ }
+ collector.add_mod(*dst);
+ }
+ Inst::CmpRmiR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_use(*dst); // yes, really `add_use`
+ }
+ Inst::Setcc { dst, .. } => {
+ collector.add_def(*dst);
+ }
+ Inst::Cmove { src, dst, .. } | Inst::XmmCmove { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ Inst::Push64 { src } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(Writable::from_reg(regs::rsp()));
+ }
+ Inst::Pop64 { dst } => {
+ collector.add_def(*dst);
+ }
+
+ Inst::CallKnown {
+ ref uses, ref defs, ..
+ } => {
+ collector.add_uses(uses);
+ collector.add_defs(defs);
+ }
+
+ Inst::CallUnknown {
+ ref uses,
+ ref defs,
+ dest,
+ ..
+ } => {
+ collector.add_uses(uses);
+ collector.add_defs(defs);
+ dest.get_regs_as_uses(collector);
+ }
+
+ Inst::JmpTableSeq {
+ ref idx,
+ ref tmp1,
+ ref tmp2,
+ ..
+ } => {
+ collector.add_use(*idx);
+ collector.add_def(*tmp1);
+ collector.add_def(*tmp2);
+ }
+
+ Inst::JmpUnknown { target } => {
+ target.get_regs_as_uses(collector);
+ }
+
+ Inst::LoadExtName { dst, .. } => {
+ collector.add_def(*dst);
+ }
+
+ Inst::LockCmpxchg { src, dst, .. } => {
+ dst.get_regs_as_uses(collector);
+ collector.add_use(*src);
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ }
+
+ Inst::AtomicRmwSeq { .. } => {
+ collector.add_use(regs::r9());
+ collector.add_use(regs::r10());
+ collector.add_def(Writable::from_reg(regs::r11()));
+ collector.add_def(Writable::from_reg(regs::rax()));
+ }
+
+ Inst::Ret
+ | Inst::EpiloguePlaceholder
+ | Inst::JmpKnown { .. }
+ | Inst::JmpIf { .. }
+ | Inst::JmpCond { .. }
+ | Inst::Nop { .. }
+ | Inst::TrapIf { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::Hlt
+ | Inst::Ud2 { .. }
+ | Inst::Fence { .. } => {
+ // No registers are used.
+ }
+ }
+}
+
+//=============================================================================
+// Instructions and subcomponents: map_regs
+
+fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
+ if let Some(reg) = r.as_virtual_reg() {
+ let new = m.get_use(reg).unwrap().to_reg();
+ *r = new;
+ }
+}
+
+fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if let Some(reg) = r.to_reg().as_virtual_reg() {
+ let new = m.get_def(reg).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+}
+
+fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if let Some(reg) = r.to_reg().as_virtual_reg() {
+ let new = m.get_mod(reg).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+}
+
+impl Amode {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ Amode::ImmReg { ref mut base, .. } => map_use(map, base),
+ Amode::ImmRegRegShift {
+ ref mut base,
+ ref mut index,
+ ..
+ } => {
+ map_use(map, base);
+ map_use(map, index);
+ }
+ Amode::RipRelative { .. } => {
+ // RIP isn't involved in regalloc.
+ }
+ }
+ }
+}
+
+impl RegMemImm {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ RegMemImm::Reg { ref mut reg } => map_use(map, reg),
+ RegMemImm::Mem { ref mut addr } => addr.map_uses(map),
+ RegMemImm::Imm { .. } => {}
+ }
+ }
+
+ fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ match self {
+ Self::Reg { reg } => {
+ let mut writable_src = Writable::from_reg(*reg);
+ map_def(mapper, &mut writable_src);
+ *self = Self::reg(writable_src.to_reg());
+ }
+ _ => panic!("unexpected RegMemImm kind in map_src_reg_as_def"),
+ }
+ }
+}
+
+impl RegMem {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ RegMem::Reg { ref mut reg } => map_use(map, reg),
+ RegMem::Mem { ref mut addr, .. } => addr.map_uses(map),
+ }
+ }
+
+ fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ match self {
+ Self::Reg { reg } => {
+ let mut writable_src = Writable::from_reg(*reg);
+ map_def(mapper, &mut writable_src);
+ *self = Self::reg(writable_src.to_reg());
+ }
+ _ => panic!("unexpected RegMem kind in map_src_reg_as_def"),
+ }
+ }
+}
+
+fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
+ // Note this must be carefully synchronized with x64_get_regs.
+ let produces_const = inst.produces_const();
+
+ match inst {
+ // ** Nop
+ Inst::AluRmiR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::Not { src, .. } | Inst::Neg { src, .. } => map_mod(mapper, src),
+ Inst::Div { divisor, .. } => divisor.map_uses(mapper),
+ Inst::MulHi { rhs, .. } => rhs.map_uses(mapper),
+ Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => {
+ map_mod(mapper, divisor);
+ if let Some(tmp) = tmp {
+ map_def(mapper, tmp)
+ }
+ }
+ Inst::SignExtendData { .. } => {}
+ Inst::XmmUnaryRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::UnaryRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::XmmRmRImm {
+ ref op,
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else if *op == SseOpcode::Pextrb
+ || *op == SseOpcode::Pextrw
+ || *op == SseOpcode::Pextrd
+ || *op == SseOpcode::Pshufd
+ {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::XmmRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::XmmRmiReg {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ Inst::XmmUninitializedValue { ref mut dst, .. } => {
+ map_def(mapper, dst);
+ }
+ Inst::XmmLoadConst { ref mut dst, .. } => {
+ map_def(mapper, dst);
+ }
+ Inst::XmmMinMaxSeq {
+ ref mut lhs,
+ ref mut rhs_dst,
+ ..
+ } => {
+ map_use(mapper, lhs);
+ map_mod(mapper, rhs_dst);
+ }
+ Inst::XmmMovRM {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+ Inst::XmmCmpRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_use(mapper, dst);
+ }
+ Inst::Imm { ref mut dst, .. } => map_def(mapper, dst),
+ Inst::MovRR {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::XmmToGpr {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ map_def(mapper, dst);
+ }
+ Inst::GprToXmm {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::CvtUint64ToFloatSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_gpr1,
+ ref mut tmp_gpr2,
+ ..
+ } => {
+ map_mod(mapper, src);
+ map_def(mapper, dst);
+ map_def(mapper, tmp_gpr1);
+ map_def(mapper, tmp_gpr2);
+ }
+ Inst::CvtFloatToSintSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_xmm,
+ ref mut tmp_gpr,
+ ..
+ }
+ | Inst::CvtFloatToUintSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_gpr,
+ ref mut tmp_xmm,
+ ..
+ } => {
+ map_mod(mapper, src);
+ map_def(mapper, dst);
+ map_def(mapper, tmp_gpr);
+ map_def(mapper, tmp_xmm);
+ }
+ Inst::MovzxRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::MovsxRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::MovRM {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+ Inst::ShiftR { ref mut dst, .. } => {
+ map_mod(mapper, dst);
+ }
+ Inst::CmpRmiR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_use(mapper, dst);
+ }
+ Inst::Setcc { ref mut dst, .. } => map_def(mapper, dst),
+ Inst::Cmove {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::XmmCmove {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_mod(mapper, dst)
+ }
+ Inst::Push64 { ref mut src } => src.map_uses(mapper),
+ Inst::Pop64 { ref mut dst } => {
+ map_def(mapper, dst);
+ }
+
+ Inst::CallKnown {
+ ref mut uses,
+ ref mut defs,
+ ..
+ } => {
+ for r in uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ }
+
+ Inst::CallUnknown {
+ ref mut uses,
+ ref mut defs,
+ ref mut dest,
+ ..
+ } => {
+ for r in uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ dest.map_uses(mapper);
+ }
+
+ Inst::JmpTableSeq {
+ ref mut idx,
+ ref mut tmp1,
+ ref mut tmp2,
+ ..
+ } => {
+ map_use(mapper, idx);
+ map_def(mapper, tmp1);
+ map_def(mapper, tmp2);
+ }
+
+ Inst::JmpUnknown { ref mut target } => target.map_uses(mapper),
+
+ Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst),
+
+ Inst::LockCmpxchg {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+
+ Inst::Ret
+ | Inst::EpiloguePlaceholder
+ | Inst::JmpKnown { .. }
+ | Inst::JmpCond { .. }
+ | Inst::JmpIf { .. }
+ | Inst::Nop { .. }
+ | Inst::TrapIf { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::Ud2 { .. }
+ | Inst::Hlt
+ | Inst::AtomicRmwSeq { .. }
+ | Inst::Fence { .. } => {
+ // Instruction doesn't explicitly mention any regs, so it can't have any virtual
+ // regs that we'd need to remap. Hence no action required.
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: misc functions and external interface
+
+impl MachInst for Inst {
+ fn get_regs(&self, collector: &mut RegUsageCollector) {
+ x64_get_regs(&self, collector)
+ }
+
+ fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ x64_map_regs(self, mapper);
+ }
+
+ fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
+ match self {
+ // Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes
+ // out the upper 32 bits of the destination. For example, we could
+ // conceivably use `movl %reg, %reg` to zero out the top 32 bits of
+ // %reg.
+ Self::MovRR {
+ is_64, src, dst, ..
+ } if *is_64 => Some((*dst, *src)),
+ // Note as well that MOVS[S|D] when used in the `XmmUnaryRmR` context are pure moves of
+ // scalar floating-point values (and annotate `dst` as `def`s to the register allocator)
+            // whereas the same operation in a packed context, e.g. `XmmRmR`, is used to merge a
+ // value into the lowest lane of a vector (not a move).
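+            //
+            // For example (illustrative): `movl %eax, %eax` zeroes bits 63:32 of %rax, so only
+            // the 64-bit form of `MovRR` is reported as a move here, and `movss %xmm1, %xmm0`
+            // counts as a move only in the `XmmUnaryRmR` (scalar copy) context.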
+ Self::XmmUnaryRmR { op, src, dst, .. }
+ if *op == SseOpcode::Movss
+ || *op == SseOpcode::Movsd
+ || *op == SseOpcode::Movaps
+ || *op == SseOpcode::Movapd
+ || *op == SseOpcode::Movups
+ || *op == SseOpcode::Movupd
+ || *op == SseOpcode::Movdqa
+ || *op == SseOpcode::Movdqu =>
+ {
+ if let RegMem::Reg { reg } = src {
+ Some((*dst, *reg))
+ } else {
+ None
+ }
+ }
+ _ => None,
+ }
+ }
+
+ fn is_epilogue_placeholder(&self) -> bool {
+ if let Self::EpiloguePlaceholder = self {
+ true
+ } else {
+ false
+ }
+ }
+
+ fn is_term<'a>(&'a self) -> MachTerminator<'a> {
+ match self {
+ // Interesting cases.
+ &Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
+ &Self::JmpKnown { dst } => MachTerminator::Uncond(dst),
+ &Self::JmpCond {
+ taken, not_taken, ..
+ } => MachTerminator::Cond(taken, not_taken),
+ &Self::JmpTableSeq {
+ ref targets_for_term,
+ ..
+ } => MachTerminator::Indirect(&targets_for_term[..]),
+ // All other cases are boring.
+ _ => MachTerminator::None,
+ }
+ }
+
+ fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
+ let rc_dst = dst_reg.to_reg().get_class();
+ let rc_src = src_reg.get_class();
+ // If this isn't true, we have gone way off the rails.
+ debug_assert!(rc_dst == rc_src);
+ match rc_dst {
+ RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg),
+ RegClass::V128 => {
+ // The Intel optimization manual, in "3.5.1.13 Zero-Latency MOV Instructions",
+ // doesn't include MOVSS/MOVSD as instructions with zero-latency. Use movaps for
+                // those, which may write more lanes than we need, but are specified to have
+ // zero-latency.
+ let opcode = match ty {
+ types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps,
+ types::F64X2 => SseOpcode::Movapd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa,
+ _ => unimplemented!("unable to move type: {}", ty),
+ };
+ Inst::xmm_unary_rm_r(opcode, RegMem::reg(src_reg), dst_reg)
+ }
+ _ => panic!("gen_move(x64): unhandled regclass {:?}", rc_dst),
+ }
+ }
+
+ fn gen_zero_len_nop() -> Inst {
+ Inst::Nop { len: 0 }
+ }
+
+ fn gen_nop(preferred_size: usize) -> Inst {
+ Inst::nop((preferred_size % 16) as u8)
+ }
+
+ fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
+ None
+ }
+
+ fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
+ match ty {
+ types::I8
+ | types::I16
+ | types::I32
+ | types::I64
+ | types::B1
+ | types::B8
+ | types::B16
+ | types::B32
+ | types::B64
+ | types::R32
+ | types::R64 => Ok(RegClass::I64),
+ types::F32 | types::F64 => Ok(RegClass::V128),
+ _ if ty.bits() == 128 => Ok(RegClass::V128),
+ types::IFLAGS | types::FFLAGS => Ok(RegClass::I64),
+ _ => Err(CodegenError::Unsupported(format!(
+ "Unexpected SSA-value type: {}",
+ ty
+ ))),
+ }
+ }
+
+ fn gen_jump(label: MachLabel) -> Inst {
+ Inst::jmp_known(label)
+ }
+
+ fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ to_reg: Writable<Reg>,
+ value: u64,
+ ty: Type,
+ mut alloc_tmp: F,
+ ) -> SmallVec<[Self; 4]> {
+ let mut ret = SmallVec::new();
+ if ty == types::F32 {
+ if value == 0 {
+ ret.push(Inst::xmm_rm_r(
+ SseOpcode::Xorps,
+ RegMem::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ let tmp = alloc_tmp(RegClass::I64, types::I32);
+ ret.push(Inst::imm(OperandSize::Size32, value, tmp));
+
+ ret.push(Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(tmp.to_reg()),
+ OperandSize::Size32,
+ to_reg,
+ ));
+ }
+ } else if ty == types::F64 {
+ if value == 0 {
+ ret.push(Inst::xmm_rm_r(
+ SseOpcode::Xorpd,
+ RegMem::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ let tmp = alloc_tmp(RegClass::I64, types::I64);
+ ret.push(Inst::imm(OperandSize::Size64, value, tmp));
+
+ ret.push(Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ RegMem::reg(tmp.to_reg()),
+ OperandSize::Size64,
+ to_reg,
+ ));
+ }
+ } else {
+ // Must be an integer type.
+ debug_assert!(
+ ty == types::B1
+ || ty == types::I8
+ || ty == types::B8
+ || ty == types::I16
+ || ty == types::B16
+ || ty == types::I32
+ || ty == types::B32
+ || ty == types::I64
+ || ty == types::B64
+ || ty == types::R32
+ || ty == types::R64
+ );
+ if value == 0 {
+ ret.push(Inst::alu_rmi_r(
+ ty == types::I64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ ret.push(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ value.into(),
+ to_reg,
+ ));
+ }
+ }
+ ret
+ }
+
+ fn reg_universe(flags: &Flags) -> RealRegUniverse {
+ create_reg_universe_systemv(flags)
+ }
+
+ fn worst_case_size() -> CodeOffset {
+ 15
+ }
+
+ fn ref_type_regclass(_: &settings::Flags) -> RegClass {
+ RegClass::I64
+ }
+
+ type LabelUse = LabelUse;
+}
+
+/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)]
+pub struct EmitState {
+ /// Addend to convert nominal-SP offsets to real-SP offsets at the current
+ /// program point.
+ pub(crate) virtual_sp_offset: i64,
+ /// Offset of FP from nominal-SP.
+ pub(crate) nominal_sp_to_fp: i64,
+ /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
+ stack_map: Option<StackMap>,
+ /// Current source location.
+ cur_srcloc: SourceLoc,
+}
+
+/// Constant state used during emissions of a sequence of instructions.
+pub struct EmitInfo {
+ flags: settings::Flags,
+ isa_flags: x64_settings::Flags,
+}
+
+impl EmitInfo {
+ pub(crate) fn new(flags: settings::Flags, isa_flags: x64_settings::Flags) -> Self {
+ Self { flags, isa_flags }
+ }
+}
+
+impl MachInstEmitInfo for EmitInfo {
+ fn flags(&self) -> &Flags {
+ &self.flags
+ }
+}
+
+impl MachInstEmit for Inst {
+ type State = EmitState;
+ type Info = EmitInfo;
+ type UnwindInfo = unwind::X64UnwindInfo;
+
+ fn emit(&self, sink: &mut MachBuffer<Inst>, info: &Self::Info, state: &mut Self::State) {
+ emit::emit(self, sink, info, state);
+ }
+
+ fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, _: &mut Self::State) -> String {
+ self.show_rru(mb_rru)
+ }
+}
+
+impl MachInstEmitState<Inst> for EmitState {
+ fn new(abi: &dyn ABICallee<I = Inst>) -> Self {
+ EmitState {
+ virtual_sp_offset: 0,
+ nominal_sp_to_fp: abi.frame_size() as i64,
+ stack_map: None,
+ cur_srcloc: SourceLoc::default(),
+ }
+ }
+
+ fn pre_safepoint(&mut self, stack_map: StackMap) {
+ self.stack_map = Some(stack_map);
+ }
+
+ fn pre_sourceloc(&mut self, srcloc: SourceLoc) {
+ self.cur_srcloc = srcloc;
+ }
+}
+
+impl EmitState {
+ fn take_stack_map(&mut self) -> Option<StackMap> {
+ self.stack_map.take()
+ }
+
+ fn clear_post_insn(&mut self) {
+ self.stack_map = None;
+ }
+
+ fn cur_srcloc(&self) -> SourceLoc {
+ self.cur_srcloc
+ }
+}
+
+/// A label-use (internal relocation) in generated code.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LabelUse {
+ /// A 32-bit offset from location of relocation itself, added to the existing value at that
+ /// location. Used for control flow instructions which consider an offset from the start of the
+ /// next instruction (so the size of the payload -- 4 bytes -- is subtracted from the payload).
+ JmpRel32,
+
+ /// A 32-bit offset from location of relocation itself, added to the existing value at that
+ /// location.
+ PCRel32,
+}
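+
+// A worked example (offsets chosen purely for illustration; the existing addend is assumed to be
+// zero): if the rel32 field of a `jmp` sits at code offset 0x10 and its label is bound at offset
+// 0x40, then `pc_rel` = 0x30. For `JmpRel32` the patched field is 0x30 - 4 = 0x2c, because the
+// CPU applies the displacement relative to the end of the 4-byte field, while `PCRel32` stores
+// 0x30 unchanged.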
+
+impl MachInstLabelUse for LabelUse {
+ const ALIGN: CodeOffset = 1;
+
+ fn max_pos_range(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x7fff_ffff,
+ }
+ }
+
+ fn max_neg_range(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x8000_0000,
+ }
+ }
+
+ fn patch_size(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 4,
+ }
+ }
+
+ fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
+ let pc_rel = (label_offset as i64) - (use_offset as i64);
+ debug_assert!(pc_rel <= self.max_pos_range() as i64);
+ debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
+ let pc_rel = pc_rel as u32;
+ match self {
+ LabelUse::JmpRel32 => {
+ let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+ let value = pc_rel.wrapping_add(addend).wrapping_sub(4);
+ buffer.copy_from_slice(&value.to_le_bytes()[..]);
+ }
+ LabelUse::PCRel32 => {
+ let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+ let value = pc_rel.wrapping_add(addend);
+ buffer.copy_from_slice(&value.to_le_bytes()[..]);
+ }
+ }
+ }
+
+ fn supports_veneer(self) -> bool {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => false,
+ }
+ }
+
+ fn veneer_size(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0,
+ }
+ }
+
+ fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => {
+ panic!("Veneer not supported for JumpRel32 label-use.");
+ }
+ }
+ }
+}
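+
+// A minimal sketch (not part of the vendored file) of the arithmetic that `LabelUse::patch`
+// performs for `JmpRel32`, restated on plain integers: the stored displacement is the
+// PC-relative distance plus any pre-existing addend, minus the 4-byte payload size, because
+// the CPU treats a rel32 displacement as relative to the end of the instruction.
+#[allow(dead_code)]
+fn jmp_rel32_patched_value(use_offset: u32, label_offset: u32, addend: u32) -> u32 {
+ let pc_rel = (label_offset as i64 - use_offset as i64) as u32;
+ pc_rel.wrapping_add(addend).wrapping_sub(4)
+}
+// For example, a branch whose displacement field sits at offset 0x10 and whose target label is
+// at offset 0x40 (addend 0) stores 0x40 - 0x10 - 4 = 0x2c.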
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs
new file mode 100644
index 0000000000..04bc1f09bf
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs
@@ -0,0 +1,289 @@
+//! Registers, the Universe thereof, and printing.
+//!
+//! These are ordered by sequence number, as required in the Universe. The unusual ordering is
+//! intended to make callee-saved registers available before caller-saved ones. This is a net win
+//! provided that each function makes at least one onward call. It is a net loss for leaf
+//! functions, and in that case we should change the ordering so that caller-saved regs are
+//! available first.
+//!
+//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions?
+//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe
+//! for each function we compile.
+
+use crate::settings;
+use alloc::vec::Vec;
+use regalloc::{
+ PrettyPrint, RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES,
+};
+use std::string::String;
+
+// Hardware encodings for a few registers.
+
+pub const ENC_RBX: u8 = 3;
+pub const ENC_RSP: u8 = 4;
+pub const ENC_RBP: u8 = 5;
+pub const ENC_R12: u8 = 12;
+pub const ENC_R13: u8 = 13;
+pub const ENC_R14: u8 = 14;
+pub const ENC_R15: u8 = 15;
+
+fn gpr(enc: u8, index: u8) -> Reg {
+ Reg::new_real(RegClass::I64, enc, index)
+}
+
+pub(crate) fn r12() -> Reg {
+ gpr(ENC_R12, 16)
+}
+pub(crate) fn r13() -> Reg {
+ gpr(ENC_R13, 17)
+}
+pub(crate) fn r14() -> Reg {
+ gpr(ENC_R14, 18)
+}
+pub(crate) fn rbx() -> Reg {
+ gpr(ENC_RBX, 19)
+}
+pub(crate) fn rsi() -> Reg {
+ gpr(6, 20)
+}
+pub(crate) fn rdi() -> Reg {
+ gpr(7, 21)
+}
+pub(crate) fn rax() -> Reg {
+ gpr(0, 22)
+}
+pub(crate) fn rcx() -> Reg {
+ gpr(1, 23)
+}
+pub(crate) fn rdx() -> Reg {
+ gpr(2, 24)
+}
+pub(crate) fn r8() -> Reg {
+ gpr(8, 25)
+}
+pub(crate) fn r9() -> Reg {
+ gpr(9, 26)
+}
+pub(crate) fn r10() -> Reg {
+ gpr(10, 27)
+}
+pub(crate) fn r11() -> Reg {
+ gpr(11, 28)
+}
+
+pub(crate) fn r15() -> Reg {
+ // r15 is put aside since this is the pinned register.
+ gpr(ENC_R15, 29)
+}
+
+/// The pinned register on this architecture.
+/// It must be the same as Spidermonkey's HeapReg, as found in this file.
+/// https://searchfox.org/mozilla-central/source/js/src/jit/x64/Assembler-x64.h#99
+pub(crate) fn pinned_reg() -> Reg {
+ r15()
+}
+
+fn fpr(enc: u8, index: u8) -> Reg {
+ Reg::new_real(RegClass::V128, enc, index)
+}
+
+pub(crate) fn xmm0() -> Reg {
+ fpr(0, 0)
+}
+pub(crate) fn xmm1() -> Reg {
+ fpr(1, 1)
+}
+pub(crate) fn xmm2() -> Reg {
+ fpr(2, 2)
+}
+pub(crate) fn xmm3() -> Reg {
+ fpr(3, 3)
+}
+pub(crate) fn xmm4() -> Reg {
+ fpr(4, 4)
+}
+pub(crate) fn xmm5() -> Reg {
+ fpr(5, 5)
+}
+pub(crate) fn xmm6() -> Reg {
+ fpr(6, 6)
+}
+pub(crate) fn xmm7() -> Reg {
+ fpr(7, 7)
+}
+pub(crate) fn xmm8() -> Reg {
+ fpr(8, 8)
+}
+pub(crate) fn xmm9() -> Reg {
+ fpr(9, 9)
+}
+pub(crate) fn xmm10() -> Reg {
+ fpr(10, 10)
+}
+pub(crate) fn xmm11() -> Reg {
+ fpr(11, 11)
+}
+pub(crate) fn xmm12() -> Reg {
+ fpr(12, 12)
+}
+pub(crate) fn xmm13() -> Reg {
+ fpr(13, 13)
+}
+pub(crate) fn xmm14() -> Reg {
+ fpr(14, 14)
+}
+pub(crate) fn xmm15() -> Reg {
+ fpr(15, 15)
+}
+
+pub(crate) fn rsp() -> Reg {
+ gpr(ENC_RSP, 30)
+}
+pub(crate) fn rbp() -> Reg {
+ gpr(ENC_RBP, 31)
+}
+
+/// Create the register universe for X64.
+///
+/// The ordering of registers matters, as described in the file doc comment. At the moment this
+/// assumes the SystemV calling convention.
+pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse {
+ let mut regs = Vec::<(RealReg, String)>::new();
+ let mut allocable_by_class = [None; NUM_REG_CLASSES];
+
+ let use_pinned_reg = flags.enable_pinned_reg();
+
+ // XMM registers
+ let first_fpr = regs.len();
+ regs.push((xmm0().to_real_reg(), "%xmm0".into()));
+ regs.push((xmm1().to_real_reg(), "%xmm1".into()));
+ regs.push((xmm2().to_real_reg(), "%xmm2".into()));
+ regs.push((xmm3().to_real_reg(), "%xmm3".into()));
+ regs.push((xmm4().to_real_reg(), "%xmm4".into()));
+ regs.push((xmm5().to_real_reg(), "%xmm5".into()));
+ regs.push((xmm6().to_real_reg(), "%xmm6".into()));
+ regs.push((xmm7().to_real_reg(), "%xmm7".into()));
+ regs.push((xmm8().to_real_reg(), "%xmm8".into()));
+ regs.push((xmm9().to_real_reg(), "%xmm9".into()));
+ regs.push((xmm10().to_real_reg(), "%xmm10".into()));
+ regs.push((xmm11().to_real_reg(), "%xmm11".into()));
+ regs.push((xmm12().to_real_reg(), "%xmm12".into()));
+ regs.push((xmm13().to_real_reg(), "%xmm13".into()));
+ regs.push((xmm14().to_real_reg(), "%xmm14".into()));
+ regs.push((xmm15().to_real_reg(), "%xmm15".into()));
+ let last_fpr = regs.len() - 1;
+
+ // Integer regs.
+ let first_gpr = regs.len();
+
+ // Callee-saved, in the SystemV x86_64 ABI.
+ regs.push((r12().to_real_reg(), "%r12".into()));
+ regs.push((r13().to_real_reg(), "%r13".into()));
+ regs.push((r14().to_real_reg(), "%r14".into()));
+
+ regs.push((rbx().to_real_reg(), "%rbx".into()));
+
+ // Caller-saved, in the SystemV x86_64 ABI.
+ regs.push((rsi().to_real_reg(), "%rsi".into()));
+ regs.push((rdi().to_real_reg(), "%rdi".into()));
+ regs.push((rax().to_real_reg(), "%rax".into()));
+ regs.push((rcx().to_real_reg(), "%rcx".into()));
+ regs.push((rdx().to_real_reg(), "%rdx".into()));
+ regs.push((r8().to_real_reg(), "%r8".into()));
+ regs.push((r9().to_real_reg(), "%r9".into()));
+ regs.push((r10().to_real_reg(), "%r10".into()));
+ regs.push((r11().to_real_reg(), "%r11".into()));
+
+ // Other regs, not available to the allocator.
+ debug_assert_eq!(r15(), pinned_reg());
+ let allocable = if use_pinned_reg {
+ // The pinned register is not allocatable in this case, so record the length before adding
+ // it.
+ let len = regs.len();
+ regs.push((r15().to_real_reg(), "%r15/pinned".into()));
+ len
+ } else {
+ regs.push((r15().to_real_reg(), "%r15".into()));
+ regs.len()
+ };
+ let last_gpr = allocable - 1;
+
+ regs.push((rsp().to_real_reg(), "%rsp".into()));
+ regs.push((rbp().to_real_reg(), "%rbp".into()));
+
+ allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
+ first: first_gpr,
+ last: last_gpr,
+ suggested_scratch: Some(r12().get_index()),
+ });
+ allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
+ first: first_fpr,
+ last: last_fpr,
+ suggested_scratch: Some(xmm15().get_index()),
+ });
+
+ // Sanity-check: the index passed to the Reg ctor must match the order in the register list.
+ for (i, reg) in regs.iter().enumerate() {
+ assert_eq!(i, reg.0.get_index());
+ }
+
+ RealRegUniverse {
+ regs,
+ allocable,
+ allocable_by_class,
+ }
+}
+
+/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show its name at some
+/// smaller size (4, 2 or 1 bytes).
+pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ let mut s = reg.show_rru(mb_rru);
+
+ if reg.get_class() != RegClass::I64 || size == 8 {
+ // We can't do any better.
+ return s;
+ }
+
+ if reg.is_real() {
+ // Change (e.g.) "rax" into "eax", "ax" or "al" as appropriate. This is something one could
+ // describe diplomatically as "a kludge", but it's only debug code.
+ let remapper = match s.as_str() {
+ "%rax" => Some(["%eax", "%ax", "%al"]),
+ "%rbx" => Some(["%ebx", "%bx", "%bl"]),
+ "%rcx" => Some(["%ecx", "%cx", "%cl"]),
+ "%rdx" => Some(["%edx", "%dx", "%dl"]),
+ "%rsi" => Some(["%esi", "%si", "%sil"]),
+ "%rdi" => Some(["%edi", "%di", "%dil"]),
+ "%rbp" => Some(["%ebp", "%bp", "%bpl"]),
+ "%rsp" => Some(["%esp", "%sp", "%spl"]),
+ "%r8" => Some(["%r8d", "%r8w", "%r8b"]),
+ "%r9" => Some(["%r9d", "%r9w", "%r9b"]),
+ "%r10" => Some(["%r10d", "%r10w", "%r10b"]),
+ "%r11" => Some(["%r11d", "%r11w", "%r11b"]),
+ "%r12" => Some(["%r12d", "%r12w", "%r12b"]),
+ "%r13" => Some(["%r13d", "%r13w", "%r13b"]),
+ "%r14" => Some(["%r14d", "%r14w", "%r14b"]),
+ "%r15" => Some(["%r15d", "%r15w", "%r15b"]),
+ _ => None,
+ };
+ if let Some(smaller_names) = remapper {
+ match size {
+ 4 => s = smaller_names[0].into(),
+ 2 => s = smaller_names[1].into(),
+ 1 => s = smaller_names[2].into(),
+ _ => panic!("show_ireg_sized: real"),
+ }
+ }
+ } else {
+ // Add a "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths.
+ let suffix = match size {
+ 4 => "l",
+ 2 => "w",
+ 1 => "b",
+ _ => panic!("show_ireg_sized: virtual"),
+ };
+ s = s + suffix;
+ }
+
+ s
+}
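+
+// For example (illustrative): a real "%rax" printed at size 4 becomes "%eax", at size 2 "%ax",
+// and at size 1 "%al"; a virtual I64-class register instead keeps its name and gains a width
+// suffix, e.g. "l" at size 4, "w" at size 2, or "b" at size 1.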
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs
new file mode 100644
index 0000000000..ffe43930f0
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs
@@ -0,0 +1,125 @@
+use crate::isa::unwind::input::UnwindInfo;
+use crate::isa::x64::inst::{
+ args::{AluRmiROpcode, Amode, RegMemImm, SyntheticAmode},
+ regs, Inst,
+};
+use crate::machinst::{UnwindInfoContext, UnwindInfoGenerator};
+use crate::result::CodegenResult;
+use alloc::vec::Vec;
+use regalloc::Reg;
+
+#[cfg(feature = "unwind")]
+pub(crate) mod systemv;
+
+pub struct X64UnwindInfo;
+
+impl UnwindInfoGenerator<Inst> for X64UnwindInfo {
+ fn create_unwind_info(
+ context: UnwindInfoContext<Inst>,
+ ) -> CodegenResult<Option<UnwindInfo<Reg>>> {
+ use crate::isa::unwind::input::{self, UnwindCode};
+ let mut codes = Vec::new();
+ const WORD_SIZE: u8 = 8;
+
+ for i in context.prologue.clone() {
+ let i = i as usize;
+ let inst = &context.insts[i];
+ let offset = context.insts_layout[i];
+
+ match inst {
+ Inst::Push64 {
+ src: RegMemImm::Reg { reg },
+ } => {
+ codes.push((
+ offset,
+ UnwindCode::StackAlloc {
+ size: WORD_SIZE.into(),
+ },
+ ));
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *reg,
+ stack_offset: 0,
+ },
+ ));
+ }
+ Inst::MovRR { src, dst, .. } => {
+ if *src == regs::rsp() {
+ codes.push((offset, UnwindCode::SetFramePointer { reg: dst.to_reg() }));
+ }
+ }
+ Inst::AluRmiR {
+ is_64: true,
+ op: AluRmiROpcode::Sub,
+ src: RegMemImm::Imm { simm32 },
+ dst,
+ ..
+ } if dst.to_reg() == regs::rsp() => {
+ let imm = *simm32;
+ codes.push((offset, UnwindCode::StackAlloc { size: imm }));
+ }
+ Inst::MovRM {
+ src,
+ dst: SyntheticAmode::Real(Amode::ImmReg { simm32, base, .. }),
+ ..
+ } if *base == regs::rsp() => {
+ // `mov reg, imm(rsp)`
+ let imm = *simm32;
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *src,
+ stack_offset: imm,
+ },
+ ));
+ }
+ Inst::AluRmiR {
+ is_64: true,
+ op: AluRmiROpcode::Add,
+ src: RegMemImm::Imm { simm32 },
+ dst,
+ ..
+ } if dst.to_reg() == regs::rsp() => {
+ let imm = *simm32;
+ codes.push((offset, UnwindCode::StackDealloc { size: imm }));
+ }
+ _ => {}
+ }
+ }
+
+ let last_epilogue_end = context.len;
+ let epilogues_unwind_codes = context
+ .epilogues
+ .iter()
+ .map(|epilogue| {
+ // TODO: add logic to process epilogue instructions instead of
+ // returning an empty array.
+ let end = epilogue.end as usize - 1;
+ let end_offset = context.insts_layout[end];
+ if end_offset == last_epilogue_end {
+ // Do not remember/restore for very last epilogue.
+ return vec![];
+ }
+
+ let start = epilogue.start as usize;
+ let offset = context.insts_layout[start];
+ vec![
+ (offset, UnwindCode::RememberState),
+ // TODO epilogue instructions
+ (end_offset, UnwindCode::RestoreState),
+ ]
+ })
+ .collect();
+
+ let prologue_size = context.insts_layout[context.prologue.end as usize];
+ Ok(Some(input::UnwindInfo {
+ prologue_size,
+ prologue_unwind_codes: codes,
+ epilogues_unwind_codes,
+ function_size: context.len,
+ word_size: WORD_SIZE,
+ initial_sp_offset: WORD_SIZE,
+ }))
+ }
+}
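+
+// Illustrative walkthrough (not part of the vendored file): for a conventional prologue
+//
+//     push %rbp
+//     mov  %rsp, %rbp
+//     sub  $32, %rsp
+//
+// the matcher above emits, in order, StackAlloc { size: 8 } and
+// SaveRegister { reg: rbp, stack_offset: 0 } for the push, SetFramePointer { reg: rbp } for
+// the register-to-register move, and StackAlloc { size: 32 } for the stack adjustment.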
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs
new file mode 100644
index 0000000000..68473a8afb
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs
@@ -0,0 +1,204 @@
+//! Unwind information for System V ABI (x86-64).
+
+use crate::isa::unwind::input;
+use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo};
+use crate::result::CodegenResult;
+use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64};
+use regalloc::{Reg, RegClass};
+
+/// Creates a new x86-64 common information entry (CIE).
+pub fn create_cie() -> CommonInformationEntry {
+ use gimli::write::CallFrameInstruction;
+
+ let mut entry = CommonInformationEntry::new(
+ Encoding {
+ address_size: 8,
+ format: Format::Dwarf32,
+ version: 1,
+ },
+ 1, // Code alignment factor
+ -8, // Data alignment factor
+ X86_64::RA,
+ );
+
+ // Every frame will start with the call frame address (CFA) at RSP+8
+ // It is +8 to account for the push of the return address by the call instruction
+ entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8));
+
+ // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP)
+ entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8));
+
+ entry
+}
+
+/// Map Cranelift registers to their corresponding Gimli registers.
+pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> {
+ // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow
+ const X86_GP_REG_MAP: [gimli::Register; 16] = [
+ X86_64::RAX,
+ X86_64::RCX,
+ X86_64::RDX,
+ X86_64::RBX,
+ X86_64::RSP,
+ X86_64::RBP,
+ X86_64::RSI,
+ X86_64::RDI,
+ X86_64::R8,
+ X86_64::R9,
+ X86_64::R10,
+ X86_64::R11,
+ X86_64::R12,
+ X86_64::R13,
+ X86_64::R14,
+ X86_64::R15,
+ ];
+ const X86_XMM_REG_MAP: [gimli::Register; 16] = [
+ X86_64::XMM0,
+ X86_64::XMM1,
+ X86_64::XMM2,
+ X86_64::XMM3,
+ X86_64::XMM4,
+ X86_64::XMM5,
+ X86_64::XMM6,
+ X86_64::XMM7,
+ X86_64::XMM8,
+ X86_64::XMM9,
+ X86_64::XMM10,
+ X86_64::XMM11,
+ X86_64::XMM12,
+ X86_64::XMM13,
+ X86_64::XMM14,
+ X86_64::XMM15,
+ ];
+
+ match reg.get_class() {
+ RegClass::I64 => {
+ // x86 GP registers have a weird mapping to DWARF registers, so we use a
+ // lookup table.
+ Ok(X86_GP_REG_MAP[reg.get_hw_encoding() as usize])
+ }
+ RegClass::V128 => Ok(X86_XMM_REG_MAP[reg.get_hw_encoding() as usize]),
+ _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")),
+ }
+}
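+
+// For example, hardware encoding 1 (%rcx) maps to DWARF register 2 and hardware encoding 2
+// (%rdx) maps to DWARF register 1, so an identity mapping from hardware encodings would be
+// wrong for the GP class.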
+
+pub(crate) fn create_unwind_info(
+ unwind: input::UnwindInfo<Reg>,
+) -> CodegenResult<Option<UnwindInfo>> {
+ struct RegisterMapper;
+ impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
+ fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
+ Ok(map_reg(reg)?.0)
+ }
+ fn sp(&self) -> u16 {
+ X86_64::RSP.0
+ }
+ }
+ let map = RegisterMapper;
+
+ Ok(Some(UnwindInfo::build(unwind, &map)?))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{
+ types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData,
+ StackSlotKind,
+ };
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use gimli::write::Address;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ fn test_simple_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::SystemV,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(1234))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 13, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6)))] }");
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+
+ #[test]
+ fn test_multi_return_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(4321))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 23, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (16, RememberState), (18, RestoreState)] }");
+ }
+
+ fn create_multi_return_function(call_conv: CallConv) -> Function {
+ let mut sig = Signature::new(call_conv);
+ sig.params.push(AbiParam::new(types::I32));
+ let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+ let block0 = func.dfg.make_block();
+ let v0 = func.dfg.append_block_param(block0, types::I32);
+ let block1 = func.dfg.make_block();
+ let block2 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().brnz(v0, block2, &[]);
+ pos.ins().jump(block1, &[]);
+
+ pos.insert_block(block1);
+ pos.ins().return_(&[]);
+
+ pos.insert_block(block2);
+ pos.ins().return_(&[]);
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs
new file mode 100644
index 0000000000..0862154360
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs
@@ -0,0 +1,3771 @@
+//! Lowering rules for X64.
+
+use crate::data_value::DataValue;
+use crate::ir::{
+ condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
+ Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
+};
+use crate::isa::x64::abi::*;
+use crate::isa::x64::inst::args::*;
+use crate::isa::x64::inst::*;
+use crate::isa::{x64::X64Backend, CallConv};
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::result::CodegenResult;
+use crate::settings::Flags;
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use cranelift_codegen_shared::condcodes::CondCode;
+use log::trace;
+use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
+use std::convert::TryFrom;
+use target_lexicon::Triple;
+
+/// Context passed to all lowering functions.
+type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
+
+//=============================================================================
+// Helpers for instruction lowering.
+
+fn is_int_or_ref_ty(ty: Type) -> bool {
+ match ty {
+ types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
+ types::R32 => panic!("shouldn't have 32-bits refs on x64"),
+ _ => false,
+ }
+}
+
+fn is_bool_ty(ty: Type) -> bool {
+ match ty {
+ types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
+ types::R32 => panic!("shouldn't have 32-bits refs on x64"),
+ _ => false,
+ }
+}
+
+/// This is target-word-size dependent, and it excludes booleans and reftypes.
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
+ match ty {
+ types::I8 | types::I16 | types::I32 | types::I64 => true,
+ _ => false,
+ }
+}
+
+/// Returns the producing instruction if the specified `input` is a result produced by an
+/// instruction with opcode `op`, and `None` otherwise.
+// TODO investigate failures with checking against the result index.
+fn matches_input<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ op: Opcode,
+) -> Option<IRInst> {
+ let inputs = ctx.get_input(input.insn, input.input);
+ inputs.inst.and_then(|(src_inst, _)| {
+ let data = ctx.data(src_inst);
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ None
+ })
+}
+
+/// Returns the producing instruction if the specified `input` is a result produced by an
+/// instruction with any of the opcodes in `ops`, and `None` otherwise.
+fn matches_input_any<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ ops: &[Opcode],
+) -> Option<IRInst> {
+ let inputs = ctx.get_input(input.insn, input.input);
+ inputs.inst.and_then(|(src_inst, _)| {
+ let data = ctx.data(src_inst);
+ for &op in ops {
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ }
+ None
+ })
+}
+
+fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
+ ctx.use_input_reg(input);
+ input.reg
+}
+
+/// Put the given input into a register, and mark it as used (side-effect).
+fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+ let input = ctx.get_input(spec.insn, spec.input);
+
+ if let Some(c) = input.constant {
+ // Generate constants fresh at each use to minimize long-range register pressure.
+ let ty = ctx.input_ty(spec.insn, spec.input);
+ let from_bits = ty_bits(ty);
+ let masked = if from_bits < 64 {
+ c & ((1u64 << from_bits) - 1)
+ } else {
+ c
+ };
+
+ let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+ for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ })
+ .into_iter()
+ {
+ ctx.emit(inst);
+ }
+ cst_copy.to_reg()
+ } else {
+ lowerinput_to_reg(ctx, input)
+ }
+}
+
+/// An extension specification for `extend_input_to_reg`.
+#[derive(Clone, Copy)]
+enum ExtSpec {
+ ZeroExtendTo32,
+ ZeroExtendTo64,
+ SignExtendTo32,
+ SignExtendTo64,
+}
+
+/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
+/// required. (This obviously causes side-effects.)
+fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+ let requested_size = match ext_spec {
+ ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
+ ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
+ };
+ let input_size = ctx.input_ty(spec.insn, spec.input).bits();
+
+ let requested_ty = if requested_size == 32 {
+ types::I32
+ } else {
+ types::I64
+ };
+
+ let ext_mode = match (input_size, requested_size) {
+ (a, b) if a == b => return put_input_in_reg(ctx, spec),
+ (1, 8) => return put_input_in_reg(ctx, spec),
+ (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)),
+ };
+
+ let src = input_to_reg_mem(ctx, spec);
+ let dst = ctx.alloc_tmp(RegClass::I64, requested_ty);
+ match ext_spec {
+ ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
+ ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
+ }
+ ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
+ ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
+ }
+ }
+ dst.to_reg()
+}
+
+fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
+ // TODO handle memory.
+ RegMem::reg(lowerinput_to_reg(ctx, input))
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
+ let input = ctx.get_input(spec.insn, spec.input);
+ lowerinput_to_reg_mem(ctx, input)
+}
+
+/// Returns the given input as an immediate if it can be properly sign-extended, without any
+/// possible side-effect; returns `None` otherwise.
+fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
+ input.constant.and_then(|x| {
+ // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
+ // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
+ // constant.
+ if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
+ Some(x as u32)
+ } else {
+ None
+ }
+ })
+}
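+// A minimal sketch (not part of the vendored file, and not the actual helper) of the check that
+// `low32_will_sign_extend_to_64` stands for: a constant fits as a 32-bit immediate of a 64-bit
+// (REX.W) instruction only if sign-extending its low 32 bits reproduces the full 64-bit value.
+#[allow(dead_code)]
+fn low32_sign_extends_to(x: u64) -> bool {
+ ((x as u32) as i32 as i64 as u64) == x
+}
+// For example, 0xffff_ffff_ffff_fff6 (-10) passes, while 0x0000_0000_8000_0000 does not,
+// because its low 32 bits sign-extend to 0xffff_ffff_8000_0000.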
+
+fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
+ let input = ctx.get_input(spec.insn, spec.input);
+ let input_ty = ctx.input_ty(spec.insn, spec.input);
+ lowerinput_to_sext_imm(input, input_ty)
+}
+
+fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+ ctx.get_input(spec.insn, spec.input).constant
+}
+
+/// Put the given input into an immediate, a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+ let input = ctx.get_input(spec.insn, spec.input);
+ let input_ty = ctx.input_ty(spec.insn, spec.input);
+ match lowerinput_to_sext_imm(input, input_ty) {
+ Some(x) => RegMemImm::imm(x),
+ None => match lowerinput_to_reg_mem(ctx, input) {
+ RegMem::Reg { reg } => RegMemImm::reg(reg),
+ RegMem::Mem { addr } => RegMemImm::mem(addr),
+ },
+ }
+}
+
+/// Emit an instruction to insert a value `src` into a lane of `dst`.
+fn emit_insert_lane<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ src: RegMem,
+ dst: Writable<Reg>,
+ lane: u8,
+ ty: Type,
+) {
+ if !ty.is_float() {
+ let (sse_op, is64) = match ty.lane_bits() {
+ 8 => (SseOpcode::Pinsrb, false),
+ 16 => (SseOpcode::Pinsrw, false),
+ 32 => (SseOpcode::Pinsrd, false),
+ 64 => (SseOpcode::Pinsrd, true),
+ _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
+ };
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+ } else if ty == types::F32 {
+ let sse_op = SseOpcode::Insertps;
+ // Insert 32 bits from the replacement (source index 00, immediate bits 7:6) into the
+ // vector lane selected by immediate bits 5:4, hence `lane << 4`.
+ let lane = 0b00_00_00_00 | lane << 4;
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
+ } else if ty == types::F64 {
+ let sse_op = match lane {
+ // Move the lowest quadword in replacement to vector without changing
+ // the upper bits.
+ 0 => SseOpcode::Movsd,
+ // Move the low 64 bits of replacement vector to the high 64 bits of the
+ // vector.
+ 1 => SseOpcode::Movlhps,
+ _ => unreachable!(),
+ };
+ // Here we use the `xmm_rm_r` encoding because it correctly tells the register
+ // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
+ // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
+ ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
+ } else {
+ panic!("unable to emit insertlane for type: {}", ty)
+ }
+}
+
+/// Emits an int comparison instruction.
+///
+/// Note: make sure that there are no instructions modifying the flags between a call to this
+/// function and the use of the flags!
+fn emit_cmp(ctx: Ctx, insn: IRInst) {
+ let ty = ctx.input_ty(insn, 0);
+
+ let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+
+ // TODO Try to commute the operands (and invert the condition) if one is an immediate.
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+
+ // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
+ // us dst - src at the machine instruction level, so invert operands.
+ ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
+}
+
+/// A specification for a fcmp emission.
+enum FcmpSpec {
+ /// Normal flow.
+ Normal,
+
+ /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
+ /// happens with `InvertedEqualOrConditions`.
+ ///
+ /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
+ /// sequence of instructions) that check for an "AND" combination of condition codes; see for
+ /// instance lowering of Select.
+ InvertEqual,
+}
+
+/// This explains how to interpret the results of an fcmp instruction.
+enum FcmpCondResult {
+ /// The given condition code must be set.
+ Condition(CC),
+
+ /// Both condition codes must be set.
+ AndConditions(CC, CC),
+
+ /// Either of the condition codes must be set.
+ OrConditions(CC, CC),
+
+ /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
+ /// of the condition codes must be set, and the user must invert the meaning of the condition
+ /// code results. When the spec is set to `FcmpSpec::Normal`, this case can't be reached.
+ InvertedEqualOrConditions(CC, CC),
+}
+
+/// Emits a float comparison instruction.
+///
+/// Note: make sure that there are no instructions modifying the flags between a call to this
+/// function and the use of the flags!
+fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+ let (flip_operands, inverted_equal) = match cond_code {
+ FloatCC::LessThan
+ | FloatCC::LessThanOrEqual
+ | FloatCC::UnorderedOrGreaterThan
+ | FloatCC::UnorderedOrGreaterThanOrEqual => {
+ cond_code = cond_code.reverse();
+ (true, false)
+ }
+ FloatCC::Equal => {
+ let inverted_equal = match spec {
+ FcmpSpec::Normal => false,
+ FcmpSpec::InvertEqual => {
+ cond_code = FloatCC::NotEqual; // same as .inverse()
+ true
+ }
+ };
+ (false, inverted_equal)
+ }
+ _ => (false, false),
+ };
+
+ // The only valid CC constructed with `from_floatcc` can be put in the flag
+ // register with a direct float comparison; do this here.
+ let op = match ctx.input_ty(insn, 0) {
+ types::F32 => SseOpcode::Ucomiss,
+ types::F64 => SseOpcode::Ucomisd,
+ _ => panic!("Bad input type to Fcmp"),
+ };
+
+ let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+ let (lhs_input, rhs_input) = if flip_operands {
+ (inputs[1], inputs[0])
+ } else {
+ (inputs[0], inputs[1])
+ };
+ let lhs = put_input_in_reg(ctx, lhs_input);
+ let rhs = input_to_reg_mem(ctx, rhs_input);
+ ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+
+ let cond_result = match cond_code {
+ FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
+ FloatCC::NotEqual if inverted_equal => {
+ FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
+ }
+ FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
+ _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
+ };
+
+ cond_result
+}
+
+fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+ let mut sig = Signature::new(call_conv);
+ for i in 0..ctx.num_inputs(insn) {
+ sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
+ }
+ for i in 0..ctx.num_outputs(insn) {
+ sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
+ }
+ if call_conv.extends_baldrdash() {
+ // Adds the special VMContext parameter to the signature.
+ sig.params
+ .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
+ }
+ sig
+}
+
+fn emit_vm_call<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ flags: &Flags,
+ triple: &Triple,
+ libcall: LibCall,
+ insn: IRInst,
+ inputs: SmallVec<[InsnInput; 4]>,
+ outputs: SmallVec<[InsnOutput; 2]>,
+) -> CodegenResult<()> {
+ let extname = ExternalName::LibCall(libcall);
+
+ let dist = if flags.use_colocated_libcalls() {
+ RelocDistance::Near
+ } else {
+ RelocDistance::Far
+ };
+
+ // TODO avoid recreating signatures for every single Libcall function.
+ let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
+ let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);
+ let caller_conv = ctx.abi().call_conv();
+
+ let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv)?;
+
+ abi.emit_stack_pre_adjust(ctx);
+
+ let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
+ assert_eq!(inputs.len() + vm_context, abi.num_args());
+
+ for (i, input) in inputs.iter().enumerate() {
+ let arg_reg = put_input_in_reg(ctx, *input);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ if call_conv.extends_baldrdash() {
+ let vm_context_vreg = ctx
+ .get_vm_context()
+ .expect("should have a VMContext to pass to libcall funcs");
+ abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg);
+ }
+
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = get_output_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ abi.emit_stack_post_adjust(ctx);
+
+ Ok(())
+}
+
+/// If the given input is a shift by a constant amount less than or equal to 3, returns the
+/// shifted input and the shift amount; the goal is to embed the shift within an address mode.
+fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ spec: InsnInput,
+) -> Option<(InsnInput, u8)> {
+ matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
+ match input_to_imm(
+ ctx,
+ InsnInput {
+ insn: shift,
+ input: 1,
+ },
+ ) {
+ Some(shift_amt) if shift_amt <= 3 => Some((
+ InsnInput {
+ insn: shift,
+ input: 0,
+ },
+ shift_amt as u8,
+ )),
+ _ => None,
+ }
+ })
+}
+
+/// Lowers an instruction to one of the x86 addressing modes.
+///
+/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
+fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode {
+ let flags = ctx
+ .memflags(spec.insn)
+ .expect("Instruction with amode should have memflags");
+
+ // At this point we either have an add that we must materialize, or some other input, plus
+ // the final offset.
+ if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
+ debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
+ let add_inputs = &[
+ InsnInput {
+ insn: add,
+ input: 0,
+ },
+ InsnInput {
+ insn: add,
+ input: 1,
+ },
+ ];
+
+ // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
+ // aren't happening in the wasm case. We could do better, given some range analysis.
+ let (base, index, shift) = if let Some((shift_input, shift_amt)) =
+ matches_small_constant_shift(ctx, add_inputs[0])
+ {
+ (
+ put_input_in_reg(ctx, add_inputs[1]),
+ put_input_in_reg(ctx, shift_input),
+ shift_amt,
+ )
+ } else if let Some((shift_input, shift_amt)) =
+ matches_small_constant_shift(ctx, add_inputs[1])
+ {
+ (
+ put_input_in_reg(ctx, add_inputs[0]),
+ put_input_in_reg(ctx, shift_input),
+ shift_amt,
+ )
+ } else {
+ for i in 0..=1 {
+ let input = ctx.get_input(add, i);
+
+ // Try to pierce through uextend.
+ if let Some(uextend) = matches_input(
+ ctx,
+ InsnInput {
+ insn: add,
+ input: i,
+ },
+ Opcode::Uextend,
+ ) {
+ if let Some(cst) = ctx.get_input(uextend, 0).constant {
+ // Zero the upper bits.
+ let input_size = ctx.input_ty(uextend, 0).bits() as u64;
+ let shift: u64 = 64 - input_size;
+ let uext_cst: u64 = (cst << shift) >> shift;
+
+ let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
+ if low32_will_sign_extend_to_64(final_offset as u64) {
+ let base = put_input_in_reg(ctx, add_inputs[1 - i]);
+ return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
+ }
+ }
+ }
+
+ // If it's a constant, add it directly!
+ if let Some(cst) = input.constant {
+ let final_offset = (offset as i64).wrapping_add(cst as i64);
+ if low32_will_sign_extend_to_64(final_offset as u64) {
+ let base = put_input_in_reg(ctx, add_inputs[1 - i]);
+ return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
+ }
+ }
+ }
+
+ (
+ put_input_in_reg(ctx, add_inputs[0]),
+ put_input_in_reg(ctx, add_inputs[1]),
+ 0,
+ )
+ };
+
+ return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags);
+ }
+
+ let input = put_input_in_reg(ctx, spec);
+ Amode::imm_reg(offset as u32, input).with_flags(flags)
+}
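+// Illustrative example (not part of the vendored file): for a load whose address is
+// `iadd x, (ishl y, 2)` plus a static offset of 16, the code above produces the single amode
+// `Amode::imm_reg_reg_shift(16, x, y, 2)`, i.e. x + (y << 2) + 16, instead of materializing
+// the add and the shift as separate instructions.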
+
+//=============================================================================
+// Top-level instruction lowering entry point, for one instruction.
+
+/// Actually codegen an instruction's results into registers.
+fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+ flags: &Flags,
+ triple: &Triple,
+) -> CodegenResult<()> {
+ let op = ctx.data(insn).opcode();
+
+ let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
+ .map(|i| InsnInput { insn, input: i })
+ .collect();
+ let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
+ .map(|i| InsnOutput { insn, output: i })
+ .collect();
+
+ let ty = if outputs.len() > 0 {
+ Some(ctx.output_ty(insn, 0))
+ } else {
+ None
+ };
+
+ match op {
+ Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
+ let value = ctx
+ .get_constant(insn)
+ .expect("constant value for iconst et al");
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::Iadd
+ | Opcode::IaddIfcout
+ | Opcode::SaddSat
+ | Opcode::UaddSat
+ | Opcode::Isub
+ | Opcode::SsubSat
+ | Opcode::UsubSat
+ | Opcode::Imul
+ | Opcode::AvgRound
+ | Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor => {
+ let ty = ty.unwrap();
+ if ty.lane_count() > 1 {
+ let sse_op = match op {
+ Opcode::Iadd => match ty {
+ types::I8X16 => SseOpcode::Paddb,
+ types::I16X8 => SseOpcode::Paddw,
+ types::I32X4 => SseOpcode::Paddd,
+ types::I64X2 => SseOpcode::Paddq,
+ _ => panic!("Unsupported type for packed iadd instruction: {}", ty),
+ },
+ Opcode::SaddSat => match ty {
+ types::I8X16 => SseOpcode::Paddsb,
+ types::I16X8 => SseOpcode::Paddsw,
+ _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty),
+ },
+ Opcode::UaddSat => match ty {
+ types::I8X16 => SseOpcode::Paddusb,
+ types::I16X8 => SseOpcode::Paddusw,
+ _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty),
+ },
+ Opcode::Isub => match ty {
+ types::I8X16 => SseOpcode::Psubb,
+ types::I16X8 => SseOpcode::Psubw,
+ types::I32X4 => SseOpcode::Psubd,
+ types::I64X2 => SseOpcode::Psubq,
+ _ => panic!("Unsupported type for packed isub instruction: {}", ty),
+ },
+ Opcode::SsubSat => match ty {
+ types::I8X16 => SseOpcode::Psubsb,
+ types::I16X8 => SseOpcode::Psubsw,
+ _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty),
+ },
+ Opcode::UsubSat => match ty {
+ types::I8X16 => SseOpcode::Psubusb,
+ types::I16X8 => SseOpcode::Psubusw,
+ _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
+ },
+ Opcode::Imul => match ty {
+ types::I16X8 => SseOpcode::Pmullw,
+ types::I32X4 => SseOpcode::Pmulld,
+ types::I64X2 => {
+ // Note for I64X2 we describe a lane A as being composed of a
+ // 32-bit upper half "Ah" and a 32-bit lower half "Al".
+ // The long-hand multiplication in 32-bit halves can then be written as:
+ // Ah Al
+ // * Bh Bl
+ // -----
+ // Al * Bl
+ // + (Ah * Bl) << 32
+ // + (Al * Bh) << 32
+ //
+ // So for each lane we will compute:
+ // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
+ //
+ // Note that the algorithm uses pmuludq, which operates directly on
+ // the lower 32 bits (Al or Bl) of a lane and writes the result
+ // to the full 64-bits of the lane of the destination. For this
+ // reason we don't need shifts to isolate the lower 32-bits, however
+ // we will need to use shifts to isolate the high 32-bits when doing
+ // calculations, i.e. Ah == A >> 32
+ //
+ // The full sequence then is as follows:
+ // A' = A
+ // A' = A' >> 32
+ // A' = Ah' * Bl
+ // B' = B
+ // B' = B' >> 32
+ // B' = Bh' * Al
+ // B' = B' + A'
+ // B' = B' << 32
+ // A' = A
+ // A' = Al' * Bl
+ // A' = A' + B'
+ // dst = A'
+
+ // Get inputs rhs=A and lhs=B and the dst register
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // A' = A
+ let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2);
+ ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
+
+ // A' = A' >> 32
+ // A' = Ah' * Bl
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psrlq,
+ RegMemImm::imm(32),
+ rhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pmuludq,
+ RegMem::reg(lhs.clone()),
+ rhs_1,
+ ));
+
+ // B' = B
+ let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2);
+ ctx.emit(Inst::gen_move(lhs_1, lhs, ty));
+
+ // B' = B' >> 32
+ // B' = Bh' * Al
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psrlq,
+ RegMemImm::imm(32),
+ lhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
+
+ // B' = B' + A'
+ // B' = B' << 32
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddq,
+ RegMem::reg(rhs_1.to_reg()),
+ lhs_1,
+ ));
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psllq,
+ RegMemImm::imm(32),
+ lhs_1,
+ ));
+
+ // A' = A
+ // A' = Al' * Bl
+ // A' = A' + B'
+ // dst = A'
+ ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pmuludq,
+ RegMem::reg(lhs.clone()),
+ rhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddq,
+ RegMem::reg(lhs_1.to_reg()),
+ rhs_1,
+ ));
+ ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
+ return Ok(());
+ }
+ _ => panic!("Unsupported type for packed imul instruction: {}", ty),
+ },
+ Opcode::AvgRound => match ty {
+ types::I8X16 => SseOpcode::Pavgb,
+ types::I16X8 => SseOpcode::Pavgw,
+ _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
+ },
+ Opcode::Band => match ty {
+ types::F32X4 => SseOpcode::Andps,
+ types::F64X2 => SseOpcode::Andpd,
+ _ => SseOpcode::Pand,
+ },
+ Opcode::Bor => match ty {
+ types::F32X4 => SseOpcode::Orps,
+ types::F64X2 => SseOpcode::Orpd,
+ _ => SseOpcode::Por,
+ },
+ Opcode::Bxor => match ty {
+ types::F32X4 => SseOpcode::Xorps,
+ types::F64X2 => SseOpcode::Xorpd,
+ _ => SseOpcode::Pxor,
+ },
+ _ => panic!("Unsupported packed instruction: {}", op),
+ };
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Move the `lhs` to the same register as `dst`.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ } else {
+ let is_64 = ty == types::I64;
+ let alu_op = match op {
+ Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
+ Opcode::Isub => AluRmiROpcode::Sub,
+ Opcode::Imul => AluRmiROpcode::Mul,
+ Opcode::Band => AluRmiROpcode::And,
+ Opcode::Bor => AluRmiROpcode::Or,
+ Opcode::Bxor => AluRmiROpcode::Xor,
+ _ => unreachable!(),
+ };
+
+ let (lhs, rhs) = match op {
+ Opcode::Iadd
+ | Opcode::IaddIfcout
+ | Opcode::Imul
+ | Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor => {
+ // For commutative operations, try to commute operands if one is an
+ // immediate.
+ if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
+ (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+ } else {
+ (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem_imm(ctx, inputs[1]),
+ )
+ }
+ }
+ Opcode::Isub => (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem_imm(ctx, inputs[1]),
+ ),
+ _ => unreachable!(),
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::mov_r_r(true, lhs, dst));
+ ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst));
+ }
+ }
+
+ Opcode::BandNot => {
+ let ty = ty.unwrap();
+ debug_assert!(ty.is_vector() && ty.bytes() == 16);
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let sse_op = match ty {
+ types::F32X4 => SseOpcode::Andnps,
+ types::F64X2 => SseOpcode::Andnpd,
+ _ => SseOpcode::Pandn,
+ };
+ // Note the flipping of operands: the `rhs` operand is used as the destination instead
+ // of the `lhs` as in the other bit operations above (e.g. `band`).
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
+ }
+
+ Opcode::Iabs => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if ty.is_vector() {
+ let opcode = match ty {
+ types::I8X16 => SseOpcode::Pabsb,
+ types::I16X8 => SseOpcode::Pabsw,
+ types::I32X4 => SseOpcode::Pabsd,
+ _ => panic!("Unsupported type for packed iabs instruction: {}", ty),
+ };
+ ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst));
+ } else {
+ unimplemented!("iabs is unimplemented for non-vector type: {}", ty);
+ }
+ }
+
+ Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if ty.is_vector() {
+ let sse_op = match op {
+ Opcode::Imax => match ty {
+ types::I8X16 => SseOpcode::Pmaxsb,
+ types::I16X8 => SseOpcode::Pmaxsw,
+ types::I32X4 => SseOpcode::Pmaxsd,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Umax => match ty {
+ types::I8X16 => SseOpcode::Pmaxub,
+ types::I16X8 => SseOpcode::Pmaxuw,
+ types::I32X4 => SseOpcode::Pmaxud,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Imin => match ty {
+ types::I8X16 => SseOpcode::Pminsb,
+ types::I16X8 => SseOpcode::Pminsw,
+ types::I32X4 => SseOpcode::Pminsd,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Umin => match ty {
+ types::I8X16 => SseOpcode::Pminub,
+ types::I16X8 => SseOpcode::Pminuw,
+ types::I32X4 => SseOpcode::Pminud,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ _ => unreachable!("This is a bug: the external and internal `match op` should be over the same opcodes."),
+ };
+
+ // Move the `lhs` to the same register as `dst`.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ } else {
+ panic!("Unsupported type for {} instruction: {}", op, ty);
+ }
+ }
+
+ Opcode::Bnot => {
+ let ty = ty.unwrap();
+ let size = ty.bytes() as u8;
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ if ty.is_vector() {
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp));
+ ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
+ } else if ty.is_bool() {
+ unimplemented!("bool bnot")
+ } else {
+ ctx.emit(Inst::not(size, dst));
+ }
+ }
+
+ Opcode::Bitselect => {
+ let ty = ty.unwrap();
+ let condition = put_input_in_reg(ctx, inputs[0]);
+ let if_true = put_input_in_reg(ctx, inputs[1]);
+ let if_false = input_to_reg_mem(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ if ty.is_vector() {
+ let tmp1 = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::gen_move(tmp1, if_true, ty));
+ ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));
+
+ let tmp2 = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::gen_move(tmp2, condition, ty));
+ ctx.emit(Inst::and_not(ty, if_false, tmp2));
+
+ ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
+ ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
+ } else {
+ unimplemented!("scalar bitselect")
+ }
+ }
+
+ Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
+ let dst_ty = ctx.output_ty(insn, 0);
+ debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
+
+ let (size, lhs) = match dst_ty {
+ types::I8 | types::I16 => match op {
+ Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])),
+ Opcode::Ushr => (
+ 4,
+ extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
+ ),
+ Opcode::Sshr => (
+ 4,
+ extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
+ ),
+ Opcode::Rotl | Opcode::Rotr => {
+ (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0]))
+ }
+ _ => unreachable!(),
+ },
+ types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])),
+ _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
+ };
+
+ let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant {
+ // Mask count, according to Cranelift's semantics.
+ let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
+ (Some(cst), None)
+ } else {
+ (None, Some(put_input_in_reg(ctx, inputs[1])))
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let shift_kind = match op {
+ Opcode::Ishl => ShiftKind::ShiftLeft,
+ Opcode::Ushr => ShiftKind::ShiftRightLogical,
+ Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
+ Opcode::Rotl => ShiftKind::RotateLeft,
+ Opcode::Rotr => ShiftKind::RotateRight,
+ _ => unreachable!(),
+ };
+
+ let w_rcx = Writable::from_reg(regs::rcx());
+ ctx.emit(Inst::mov_r_r(true, lhs, dst));
+ if count.is_none() {
+ ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
+ }
+ ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
+ }
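+
+ // For example (illustrative), a constant shift amount is masked to the type width per
+ // Cranelift's semantics: `ishl.i32 v0, 40` shifts by 40 & 31 = 8, and %rcx is only
+ // loaded with the amount when it is not a compile-time constant.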
+
+ Opcode::Ineg => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ if ty.is_vector() {
+ // Zeroes out a register and then does a packed subtraction
+ // of the input from the register.
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+
+ let subtract_opcode = match ty {
+ types::I8X16 => SseOpcode::Psubb,
+ types::I16X8 => SseOpcode::Psubw,
+ types::I32X4 => SseOpcode::Psubd,
+ types::I64X2 => SseOpcode::Psubq,
+ _ => panic!("Unsupported type for Ineg instruction, found {}", ty),
+ };
+
+ // Note we must zero out a tmp instead of using the destination register, since
+ // the destination could be an alias for the source input register.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ ));
+ ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ } else {
+ let size = ty.bytes() as u8;
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ ctx.emit(Inst::neg(size, dst));
+ }
+ }
+
+ Opcode::Clz => {
+ // TODO when the x86 flags have use_lzcnt, we can use LZCNT.
+
+ // General formula using bit-scan reverse (BSR):
+ // mov -1, %dst
+ // bsr %src, %tmp
+ // cmovz %dst, %tmp
+ // mov $(size_bits - 1), %dst
+ // sub %tmp, %dst
+
+ let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
+ a if a == types::I32 || a == types::I64 => (None, a),
+ _ => unreachable!(),
+ };
+
+ let src = if let Some(ext_spec) = ext_spec {
+ RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
+ } else {
+ input_to_reg_mem(ctx, inputs[0])
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let tmp = ctx.alloc_tmp(RegClass::I64, ty);
+ ctx.emit(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ u64::max_value(),
+ dst,
+ ));
+
+ ctx.emit(Inst::unary_rm_r(
+ ty.bytes() as u8,
+ UnaryRmROpcode::Bsr,
+ src,
+ tmp,
+ ));
+
+ ctx.emit(Inst::cmove(
+ ty.bytes() as u8,
+ CC::Z,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+
+ ctx.emit(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ ty.bits() as u64 - 1,
+ dst,
+ ));
+
+ ctx.emit(Inst::alu_rmi_r(
+ ty == types::I64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+
+ Opcode::Ctz => {
+ // TODO when the x86 flags have use_bmi1, we can use TZCNT.
+
+ // General formula using bit-scan forward (BSF):
+ // bsf %src, %dst
+ // mov $(size_bits), %tmp
+ // cmovz %tmp, %dst
+ let ty = ctx.input_ty(insn, 0);
+ let ty = if ty.bits() < 32 { types::I32 } else { ty };
+ debug_assert!(ty == types::I32 || ty == types::I64);
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let tmp = ctx.alloc_tmp(RegClass::I64, ty);
+ ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp));
+
+ ctx.emit(Inst::unary_rm_r(
+ ty.bytes() as u8,
+ UnaryRmROpcode::Bsf,
+ src,
+ dst,
+ ));
+
+ ctx.emit(Inst::cmove(
+ ty.bytes() as u8,
+ CC::Z,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+
+ Opcode::Popcnt => {
+ // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction.
+
+ let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
+ a if a == types::I32 || a == types::I64 => (None, a),
+ _ => unreachable!(),
+ };
+
+ let src = if let Some(ext_spec) = ext_spec {
+ RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
+ } else {
+ input_to_reg_mem(ctx, inputs[0])
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ if ty == types::I64 {
+ let is_64 = true;
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let cst = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ // mov src, tmp1
+ ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // mov 0x7777_7777_7777_7777, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
+
+ // andq cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // mov src, tmp2
+ ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // mov tmp2, dst
+ ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+ // shr $4, dst
+ ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst));
+
+ // add tmp2, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ dst,
+ ));
+
+ // mov $0x0F0F_0F0F_0F0F_0F0F, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
+
+ // and cst, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ dst,
+ ));
+
+ // mov $0x0101_0101_0101_0101, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
+
+ // mul cst, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Mul,
+ RegMemImm::reg(cst.to_reg()),
+ dst,
+ ));
+
+ // shr $56, dst
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(56),
+ dst,
+ ));
+ } else {
+ assert_eq!(ty, types::I32);
+ let is_64 = false;
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ // mov src, tmp1
+ ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // andq $0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // mov src, tmp2
+ ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and 0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and $0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // mov tmp2, dst
+ ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+ // shr $4, dst
+ ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst));
+
+ // add tmp2, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ dst,
+ ));
+
+ // and $0x0F0F_0F0F, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x0F0F0F0F),
+ dst,
+ ));
+
+ // mul $0x0101_0101, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(0x01010101),
+ dst,
+ ));
+
+ // shr $24, dst
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(24),
+ dst,
+ ));
+ }
+ }
+
+ Opcode::IsNull | Opcode::IsInvalid => {
+ // Null references are represented by the constant value 0; invalid references are
+ // represented by the constant value -1. See `define_reftypes()` in
+ // `meta/src/isa/x86/encodings.rs` to confirm.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ let imm = match op {
+ Opcode::IsNull => {
+                    // TODO could use test src, src for IsNull
+ 0
+ }
+ Opcode::IsInvalid => {
+                    // We can do a 32-bit comparison even in 64-bit mode, as the constant is then
+ // sign-extended.
+ 0xffffffff
+ }
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src));
+ ctx.emit(Inst::setcc(CC::Z, dst));
+ }
+
+ Opcode::Uextend
+ | Opcode::Sextend
+ | Opcode::Bint
+ | Opcode::Breduce
+ | Opcode::Bextend
+ | Opcode::Ireduce => {
+ let src_ty = ctx.input_ty(insn, 0);
+ let dst_ty = ctx.output_ty(insn, 0);
+
+ // Sextend requires a sign-extended move, but all the other opcodes are simply a move
+ // from a zero-extended source. Here is why this works, in each case:
+ //
+ // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to
+ // zero-extend here.
+ //
+ // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so
+ // again, this is a zero-extend / no-op.
+ //
+ // - Ireduce: changing width of an integer. Smaller ints are stored with undefined
+ // high-order bits, so we can simply do a copy.
+
+ if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend {
+                // As a particular x64 extra-pattern-matching opportunity, all the 32-bit ALU
+                // opcodes already zero-extend the upper 32 bits of the destination register, so
+                // we don't even need to generate a zero-extending move in this case.
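+                // (For example, a 32-bit `addl %ebx, %eax` already clears bits 63:32 of %rax,
+                // so the result can be used as an i64 without an extra move.)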
+ // TODO add loads and shifts here.
+ if let Some(_) = matches_input_any(
+ ctx,
+ inputs[0],
+ &[
+ Opcode::Iadd,
+ Opcode::IaddIfcout,
+ Opcode::Isub,
+ Opcode::Imul,
+ Opcode::Band,
+ Opcode::Bor,
+ Opcode::Bxor,
+ ],
+ ) {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, types::I64));
+ return Ok(());
+ }
+ }
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
+ assert_eq!(
+ src_ty.bits() < dst_ty.bits(),
+ ext_mode.is_some(),
+ "unexpected extension: {} -> {}",
+ src_ty,
+ dst_ty
+ );
+
+ if let Some(ext_mode) = ext_mode {
+ if op == Opcode::Sextend {
+ ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst));
+ } else {
+ ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst));
+ }
+ } else {
+ ctx.emit(Inst::mov64_rm_r(src, dst));
+ }
+ }
+
+ Opcode::Icmp => {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ if !ty.is_vector() {
+ emit_cmp(ctx, insn);
+ let cc = CC::from_intcc(condcode);
+ ctx.emit(Inst::setcc(cc, dst));
+ } else {
+ assert_eq!(ty.bits(), 128);
+ let eq = |ty| match ty {
+ types::I8X16 => SseOpcode::Pcmpeqb,
+ types::I16X8 => SseOpcode::Pcmpeqw,
+ types::I32X4 => SseOpcode::Pcmpeqd,
+ types::I64X2 => SseOpcode::Pcmpeqq,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let gt = |ty| match ty {
+ types::I8X16 => SseOpcode::Pcmpgtb,
+ types::I16X8 => SseOpcode::Pcmpgtw,
+ types::I32X4 => SseOpcode::Pcmpgtd,
+ types::I64X2 => SseOpcode::Pcmpgtq,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let maxu = |ty| match ty {
+ types::I8X16 => SseOpcode::Pmaxub,
+ types::I16X8 => SseOpcode::Pmaxuw,
+ types::I32X4 => SseOpcode::Pmaxud,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let mins = |ty| match ty {
+ types::I8X16 => SseOpcode::Pminsb,
+ types::I16X8 => SseOpcode::Pminsw,
+ types::I32X4 => SseOpcode::Pminsd,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let minu = |ty| match ty {
+ types::I8X16 => SseOpcode::Pminub,
+ types::I16X8 => SseOpcode::Pminuw,
+ types::I32X4 => SseOpcode::Pminud,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+
+ // Here we decide which operand to use as the read/write `dst` (ModRM reg field)
+ // and which to use as the read `input` (ModRM r/m field). In the normal case we
+ // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for
+ // the less-than cases so that we can reuse the greater-than implementation.
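+                // For reference, several of the sequences below rely on these identities
+                // (sketch):
+                //
+                //   a >=s b  <=>  min_s(a, b) == b
+                //   a >=u b  <=>  min_u(a, b) == b
+                //   a >u  b  <=>  !(max_u(a, b) == b)
+                //   a !=  b  <=>  !(a == b)
+                //
+                // with the "not" realized by XORing against an all-ones register, and the
+                // less-than forms obtained by the operand swap described above.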
+ let input = match condcode {
+ IntCC::SignedLessThan
+ | IntCC::SignedLessThanOrEqual
+ | IntCC::UnsignedLessThan
+ | IntCC::UnsignedLessThanOrEqual => {
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ lhs
+ }
+ _ => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ rhs
+ }
+ };
+
+ match condcode {
+ IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
+ IntCC::NotEqual => {
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+ // Emit all 1s into the `tmp` register.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+ // Invert the result of the `PCMPEQ*`.
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+ }
+ IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
+ ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
+ }
+ IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
+ ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+ }
+ IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
+ ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+ // Emit all 1s into the `tmp` register.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+ // Invert the result of the `PCMPEQ*`.
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+ }
+ IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
+ ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+ }
+ _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
+ }
+ }
+ }
+
+ Opcode::Fcmp => {
+ let cond_code = ctx.data(insn).fp_cond_code().unwrap();
+ let input_ty = ctx.input_ty(insn, 0);
+ if !input_ty.is_vector() {
+ // Unordered is returned by setting ZF, PF, CF <- 111
+ // Greater than by ZF, PF, CF <- 000
+ // Less than by ZF, PF, CF <- 001
+ // Equal by ZF, PF, CF <- 100
+ //
+ // Checking the result of comiss is somewhat annoying because you don't have setcc
+ // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
+ // gt, etc) *and* orderedness.
+ //
+            // So in some cases we need more than one setcc check and then a logical "and" or
+            // "or" to combine them. However, knowing that the parity bit is set only when the
+            // result is unordered, and that an unordered result also sets both ZF and CF, we can
+            // get away with a single setcc for most condition codes.
+
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit(Inst::setcc(cc, dst));
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, dst));
+ ctx.emit(Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::And,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, dst));
+ ctx.emit(Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Or,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ }
+ } else {
+ let op = match input_ty {
+ types::F32X4 => SseOpcode::Cmpps,
+ types::F64X2 => SseOpcode::Cmppd,
+ _ => panic!("Bad input type to fcmp: {}", input_ty),
+ };
+
+ // Since some packed comparisons are not available, some of the condition codes
+ // must be inverted, with a corresponding `flip` of the operands.
+ let (imm, flip) = match cond_code {
+ FloatCC::GreaterThan => (FcmpImm::LessThan, true),
+ FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
+ FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
+ FloatCC::UnorderedOrLessThanOrEqual => {
+ (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
+ }
+ FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
+ panic!("unsupported float condition code: {}", cond_code)
+ }
+ _ => (FcmpImm::from(cond_code), false),
+ };
+
+ // Determine the operands of the comparison, possibly by flipping them.
+ let (lhs, rhs) = if flip {
+ (
+ put_input_in_reg(ctx, inputs[1]),
+ input_to_reg_mem(ctx, inputs[0]),
+ )
+ } else {
+ (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem(ctx, inputs[1]),
+ )
+ };
+
+ // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, lhs, input_ty));
+
+ // Emit the comparison.
+ ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
+ }
+ }
+
+ Opcode::FallthroughReturn | Opcode::Return => {
+ for i in 0..ctx.num_inputs(insn) {
+ let src_reg = put_input_in_reg(ctx, inputs[i]);
+ let retval_reg = ctx.retval(i);
+ let ty = ctx.input_ty(insn, i);
+ ctx.emit(Inst::gen_move(retval_reg, src_reg, ty));
+ }
+ // N.B.: the Ret itself is generated by the ABI.
+ }
+
+ Opcode::Call | Opcode::CallIndirect => {
+ let caller_conv = ctx.abi().call_conv();
+ let (mut abi, inputs) = match op {
+ Opcode::Call => {
+ let (extname, dist) = ctx.call_target(insn).unwrap();
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len(), sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ X64ABICaller::from_func(sig, &extname, dist, caller_conv)?,
+ &inputs[..],
+ )
+ }
+
+ Opcode::CallIndirect => {
+ let ptr = put_input_in_reg(ctx, inputs[0]);
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len() - 1, sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ X64ABICaller::from_ptr(sig, ptr, op, caller_conv)?,
+ &inputs[1..],
+ )
+ }
+
+ _ => unreachable!(),
+ };
+
+ abi.emit_stack_pre_adjust(ctx);
+ assert_eq!(inputs.len(), abi.num_args());
+ for (i, input) in inputs.iter().enumerate() {
+ let arg_reg = put_input_in_reg(ctx, *input);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = get_output_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ abi.emit_stack_post_adjust(ctx);
+ }
+
+ Opcode::Debugtrap => {
+ ctx.emit(Inst::Hlt);
+ }
+
+ Opcode::Trap | Opcode::ResumableTrap => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+ ctx.emit_safepoint(Inst::Ud2 { trap_code });
+ }
+
+ Opcode::Trapif | Opcode::Trapff => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+
+ if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
+ let cond_code = ctx.data(insn).cond_code().unwrap();
+ // The flags must not have been clobbered by any other instruction between the
+ // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
+ // simply use the flags here.
+ let cc = CC::from_intcc(cond_code);
+
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
+ } else if op == Opcode::Trapif {
+ let cond_code = ctx.data(insn).cond_code().unwrap();
+ let cc = CC::from_intcc(cond_code);
+
+ // Verification ensures that the input is always a single-def ifcmp.
+ let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+ emit_cmp(ctx, ifcmp);
+
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
+ } else {
+ let cond_code = ctx.data(insn).fp_cond_code().unwrap();
+
+ // Verification ensures that the input is always a single-def ffcmp.
+ let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
+
+ match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc })
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ // A bit unfortunate, but materialize the flags in their own register, and
+ // check against this.
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, tmp2));
+ ctx.emit(Inst::alu_rmi_r(
+ false, /* is_64 */
+ AluRmiROpcode::And,
+ RegMemImm::reg(tmp.to_reg()),
+ tmp2,
+ ));
+ ctx.emit_safepoint(Inst::TrapIf {
+ trap_code,
+ cc: CC::NZ,
+ });
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 });
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 });
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ };
+ };
+ }
+
+ Opcode::F64const => {
+ // TODO use cmpeqpd for all 1s.
+ let value = ctx.get_constant(insn).unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::F32const => {
+ // TODO use cmpeqps for all 1s.
+ let value = ctx.get_constant(insn).unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+
+ // Note: min and max can't be handled here, because of the way Cranelift defines them:
+ // if any operand is a NaN, they must return the NaN operand, while the x86 machine
+ // instruction will return the second operand if either operand is a NaN.
+ let sse_op = match ty {
+ types::F32 => match op {
+ Opcode::Fadd => SseOpcode::Addss,
+ Opcode::Fsub => SseOpcode::Subss,
+ Opcode::Fmul => SseOpcode::Mulss,
+ Opcode::Fdiv => SseOpcode::Divss,
+ _ => unreachable!(),
+ },
+ types::F64 => match op {
+ Opcode::Fadd => SseOpcode::Addsd,
+ Opcode::Fsub => SseOpcode::Subsd,
+ Opcode::Fmul => SseOpcode::Mulsd,
+ Opcode::Fdiv => SseOpcode::Divsd,
+ _ => unreachable!(),
+ },
+ types::F32X4 => match op {
+ Opcode::Fadd => SseOpcode::Addps,
+ Opcode::Fsub => SseOpcode::Subps,
+ Opcode::Fmul => SseOpcode::Mulps,
+ Opcode::Fdiv => SseOpcode::Divps,
+ _ => unreachable!(),
+ },
+ types::F64X2 => match op {
+ Opcode::Fadd => SseOpcode::Addpd,
+ Opcode::Fsub => SseOpcode::Subpd,
+ Opcode::Fmul => SseOpcode::Mulpd,
+ Opcode::Fdiv => SseOpcode::Divpd,
+ _ => unreachable!(),
+ },
+ _ => panic!(
+ "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
+ ty
+ ),
+ };
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ }
+
+ Opcode::Fmin | Opcode::Fmax => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let is_min = op == Opcode::Fmin;
+ let output_ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, rhs, output_ty));
+ if !output_ty.is_vector() {
+ let op_size = match output_ty {
+ types::F32 => OperandSize::Size32,
+ types::F64 => OperandSize::Size64,
+ _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
+ };
+ ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
+ } else {
+                // X64's implementation of floating point min and floating point max does not
+                // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
+                // scalar approach we use jumps to handle the cases where NaN and +0 propagation
+                // is not consistent with what is needed. However, for packed floating point min
+                // and max we use a different approach to avoid the sequence of jumps that would
+                // be required on a per-lane basis. Because we do not need to lower labels and
+                // jumps, but do need ctx for creating temporaries, we implement the lowering
+                // here in lower.rs instead of emit.rs, as is done for the scalar case.
+                // The outline of the approach is as follows:
+                //
+                // First we perform the min/max in both directions. This is because when an
+                // operand's lane contains a NaN, or when the corresponding lanes of the two
+                // operands contain 0s with mismatched signs, x64 returns the second operand
+                // regardless of its contents. So, to make sure we capture NaNs and normalize
+                // NaNs and 0 values, we perform the operation in both directions and merge the
+                // results. Then we normalize the results: we create a mask for the lanes
+                // containing NaNs and use it to turn NaNs into quiet NaNs and to normalize
+                // the 0s.
+ //
+ // The following sequence is generated for min:
+ //
+ // movap{s,d} %lhs, %tmp
+ // minp{s,d} %dst, %tmp
+                // minp{s,d} %lhs, %dst
+ // orp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+                // orp{s,d} %dst, %tmp
+                // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+ //
+ // and for max the sequence is:
+ //
+ // movap{s,d} %lhs, %tmp
+                // maxp{s,d} %dst, %tmp
+                // maxp{s,d} %lhs, %dst
+ // xorp{s,d} %tmp, %dst
+ // orp{s,d} %dst, %tmp
+ // subp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+                // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+
+ if is_min {
+ let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
+ match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Minps,
+ SseOpcode::Orps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Minpd,
+ SseOpcode::Orpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform min in reverse direction
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
+
+ // Perform min in original direction
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
+
+                    // X64 handles propagation of -0's and NaNs differently between left and right
+                    // operands. After doing the min in both directions, this OR will
+                    // guarantee capture of -0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
+
+ // Compare unordered to create mask for lanes containing NaNs and then use
+ // that mask to saturate the NaN containing lanes in the tmp register with 1s.
+ // TODO: Would a check for NaN and then a jump be better here in the
+ // common case than continuing on to normalize NaNs that might not exist?
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+                    // The dst register holds a mask for lanes containing NaNs.
+                    // We take that mask and shift in preparation for creating a different mask
+                    // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+                    // number of least significant bits. We shift right each lane by 10 bits
+                    // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+                    // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do a nand with the tmp register to produce the final results
+ // in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ } else {
+ let (
+ mov_op,
+ max_op,
+ xor_op,
+ or_op,
+ sub_op,
+ cmp_op,
+ shift_op,
+ shift_by,
+ andn_op,
+ ) = match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Maxps,
+ SseOpcode::Xorps,
+ SseOpcode::Orps,
+ SseOpcode::Subps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Maxpd,
+ SseOpcode::Xorpd,
+ SseOpcode::Orpd,
+ SseOpcode::Subpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp.
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform max in reverse direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Perform max in original direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+
+                    // XOR the two results to get their difference and store it in dst.
+                    // Max uses a different approach than min to account for potential
+                    // discrepancies with plus/minus 0.
+ ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+
+                    // X64 handles propagation of -0's and NaNs differently between left and right
+                    // operands. After doing the max in both directions, this OR will
+                    // guarantee capture of 0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Capture NaNs and sign discrepancies.
+ ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Compare unordered to create mask for lanes containing NaNs and then use
+ // that mask to saturate the NaN containing lanes in the tmp register with 1s.
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+
+                    // The dst register holds a mask for lanes containing NaNs.
+                    // We take that mask and shift in preparation for creating a different mask
+                    // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+                    // number of least significant bits. We shift right each lane by 10 bits
+                    // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+                    // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do a nand with the tmp register to produce the final results
+ // in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ }
+ }
+ }
+
+ Opcode::FminPseudo | Opcode::FmaxPseudo => {
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ let sse_opcode = match (ty, op) {
+ (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
+ (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
+ (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
+ (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
+ _ => unimplemented!("unsupported type {} for {}", ty, op),
+ };
+ ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
+ }
+
+ Opcode::Sqrt => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ let sse_op = match ty {
+ types::F32 => SseOpcode::Sqrtss,
+ types::F64 => SseOpcode::Sqrtsd,
+ types::F32X4 => SseOpcode::Sqrtps,
+ types::F64X2 => SseOpcode::Sqrtpd,
+ _ => panic!(
+ "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
+ ty
+ ),
+ };
+
+ ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
+ }
+
+ Opcode::Fpromote => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
+ }
+
+ Opcode::Fdemote => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
+ }
+
+ Opcode::FcvtFromSint => {
+ let output_ty = ty.unwrap();
+ if !output_ty.is_vector() {
+ let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
+ types::I32 => (None, OperandSize::Size32),
+ types::I64 => (None, OperandSize::Size64),
+ _ => unreachable!(),
+ };
+
+ let src = match ext_spec {
+ Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
+ None => input_to_reg_mem(ctx, inputs[0]),
+ };
+
+ let opcode = if output_ty == types::F32 {
+ SseOpcode::Cvtsi2ss
+ } else {
+ assert_eq!(output_ty, types::F64);
+ SseOpcode::Cvtsi2sd
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
+ } else {
+ let ty = ty.unwrap();
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let opcode = match ctx.input_ty(insn, 0) {
+ types::I32X4 => SseOpcode::Cvtdq2ps,
+ _ => {
+ unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op)
+ }
+ };
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
+ }
+ }
+
+ Opcode::FcvtFromUint => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ let input_ty = ctx.input_ty(insn, 0);
+ if !ty.is_vector() {
+ match input_ty {
+ types::I8 | types::I16 | types::I32 => {
+                        // Conversion from an unsigned int smaller than 64 bits is easy: zero-extend +
+ // do a signed conversion (which won't overflow).
+ let opcode = if ty == types::F32 {
+ SseOpcode::Cvtsi2ss
+ } else {
+ assert_eq!(ty, types::F64);
+ SseOpcode::Cvtsi2sd
+ };
+
+ let src = RegMem::reg(extend_input_to_reg(
+ ctx,
+ inputs[0],
+ ExtSpec::ZeroExtendTo64,
+ ));
+ ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
+ }
+
+ types::I64 => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+
+ let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
+ ctx.emit(Inst::gen_move(src_copy, src, types::I64));
+
+ let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64);
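+                        // The pseudo-instruction below expands (in emit.rs) along the lines of
+                        // the usual u64-to-float trick (rough sketch, not the exact expansion):
+                        // if the value fits in an i64, convert it directly; otherwise convert
+                        // ((x >> 1) | (x & 1)) as a signed value and double the result, the
+                        // low "sticky" bit keeping the rounding correct.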
+ ctx.emit(Inst::cvt_u64_to_float_seq(
+ ty == types::F64,
+ src_copy,
+ tmp_gpr1,
+ tmp_gpr2,
+ dst,
+ ));
+ }
+ _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
+ };
+ } else {
+ // Converting packed unsigned integers to packed floats requires a few steps.
+                // There is no single-instruction lowering for converting packed unsigned
+                // integers, but there is one for converting packed signed integers to float
+                // (cvtdq2ps). In the steps below we isolate the upper half (16 bits) and lower
+                // half (16 bits) of each lane and convert each half separately using cvtdq2ps,
+                // which is meant for signed integers. For this to work on the upper half we must
+                // first shift those bits right by 1 (divide by 2) so that the most significant
+                // bit is 0 and the value is not treated as negative; after the conversion we
+                // double the value. Finally we add the two converted halves, and the addition
+                // rounds correctly.
+ //
+ // Sequence:
+ // -> A = 0xffffffff
+ // -> Ah = 0xffff0000
+ // -> Al = 0x0000ffff
+ // -> Convert(Al) // Convert int to float
+                // -> Ah = Ah >> 1 // Shift right by 1 to ensure Ah's conversion isn't treated as signed
+ // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
+ // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
+ // -> dst = Ah + Al // Add the two floats together
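+                //
+                // As a worked example (illustrative only), for a lane holding 0xFFFF_FFFF:
+                // Al = 65535.0; Ah >> 1 = 0x7FFF_8000 converts to 2147450880.0, doubled to
+                // 4294901760.0; the final add of 65535.0 rounds to 4294967296.0, the nearest
+                // f32 to 4294967295.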
+
+ assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Create a temporary register
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(src),
+ tmp,
+ ));
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ // Get the low 16 bits
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
+
+ // Get the high 16 bits
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
+
+ // Convert the low 16 bits
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
+
+ // Shift the high bits by 1, convert, and double to get the correct value.
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Addps,
+ RegMem::reg(dst.to_reg()),
+ dst,
+ ));
+
+ // Add together the two converted values.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Addps,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ }
+
+ Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let input_ty = ctx.input_ty(insn, 0);
+ if !input_ty.is_vector() {
+ let src_size = if input_ty == types::F32 {
+ OperandSize::Size32
+ } else {
+ assert_eq!(input_ty, types::F64);
+ OperandSize::Size64
+ };
+
+ let output_ty = ty.unwrap();
+ let dst_size = if output_ty == types::I32 {
+ OperandSize::Size32
+ } else {
+ assert_eq!(output_ty, types::I64);
+ OperandSize::Size64
+ };
+
+ let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
+ let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
+
+ let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty);
+ ctx.emit(Inst::gen_move(src_copy, src, input_ty));
+
+ let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty);
+ let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty);
+
+ if to_signed {
+ ctx.emit(Inst::cvt_float_to_sint_seq(
+ src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
+ ));
+ } else {
+ ctx.emit(Inst::cvt_float_to_uint_seq(
+ src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
+ ));
+ }
+ } else {
+ if op == Opcode::FcvtToSintSat {
+ // Sets destination to zero if float is NaN
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(src),
+ tmp,
+ ));
+ ctx.emit(Inst::gen_move(dst, src, input_ty));
+ let cond = FcmpImm::from(FloatCC::Equal);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Cmpps,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ cond.encode(),
+ false,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Andps,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+
+                    // Set the top bit of tmp for the (non-NaN) lanes whose float is positive;
+                    // this is used below to distinguish positive overflow from negative overflow.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+
+ // Convert the packed float to packed doubleword.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Cvttps2dq,
+ RegMem::reg(dst.to_reg()),
+ dst,
+ ));
+
+                    // AND the converted result into tmp: its top bit is then set only for lanes
+                    // that overflowed in the positive direction (cvttps2dq returned 0x80000000
+                    // for a positive input); the arithmetic shift below broadcasts that bit
+                    // across each lane.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pand,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
+
+ // On overflow 0x80000000 is returned to a lane.
+ // Below sets positive overflow lanes to 0x7FFFFFFF
+ // Keeps negative overflow lanes as is.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
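+                    // Net effect per lane (summary for reference): NaN -> 0; inputs above
+                    // i32::MAX -> 0x7FFF_FFFF; inputs below i32::MIN -> 0x8000_0000; everything
+                    // else -> the truncated value.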
+ } else if op == Opcode::FcvtToUintSat {
+ unimplemented!("f32x4.convert_i32x4_u");
+ } else {
+                    // Since this branch is only taken for vector types, and neither
+                    // Opcode::FcvtToUint nor Opcode::FcvtToSint has a vector variant,
+                    // they cannot reach here. The first two branches cover all
+                    // reachable cases.
+ unreachable!();
+ }
+ }
+ }
+
+ Opcode::Bitcast => {
+ let input_ty = ctx.input_ty(insn, 0);
+ let output_ty = ctx.output_ty(insn, 0);
+ match (input_ty, output_ty) {
+ (types::F32, types::I32) => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Movd,
+ src,
+ dst,
+ OperandSize::Size32,
+ ));
+ }
+ (types::I32, types::F32) => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ src,
+ OperandSize::Size32,
+ dst,
+ ));
+ }
+ (types::F64, types::I64) => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Movq,
+ src,
+ dst,
+ OperandSize::Size64,
+ ));
+ }
+ (types::I64, types::F64) => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ src,
+ OperandSize::Size64,
+ dst,
+ ));
+ }
+ _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
+ }
+ }
+
+ Opcode::Fabs | Opcode::Fneg => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // In both cases, generate a constant and apply a single binary instruction:
+ // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
+ // src with it.
+ // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
+ // src with it.
+ let output_ty = ty.unwrap();
+ if !output_ty.is_vector() {
+ let (val, opcode) = match output_ty {
+ types::F32 => match op {
+ Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
+ Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
+ _ => unreachable!(),
+ },
+ types::F64 => match op {
+ Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
+ Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
+ _ => unreachable!(),
+ },
+ _ => panic!("unexpected type {:?} for Fabs", output_ty),
+ };
+
+ for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+
+ ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
+ } else {
+ // Eventually vector constants should be available in `gen_constant` and this block
+ // can be merged with the one above (TODO).
+ if output_ty.bits() == 128 {
+ // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, output_ty));
+
+ // Generate an all 1s constant in an XMM register. This uses CMPPS but could
+ // have used CMPPD with the same effect.
+ let tmp = ctx.alloc_tmp(RegClass::V128, output_ty);
+ let cond = FcmpImm::from(FloatCC::Equal);
+ let cmpps = Inst::xmm_rm_r_imm(
+ SseOpcode::Cmpps,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ cond.encode(),
+ false,
+ );
+ ctx.emit(cmpps);
+
+ // Shift the all 1s constant to generate the mask.
+ let lane_bits = output_ty.lane_bits();
+ let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
+ (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
+ (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
+ (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
+ (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
+ _ => unreachable!(
+ "unexpected opcode and lane size: {:?}, {} bits",
+ op, lane_bits
+ ),
+ };
+ let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
+ ctx.emit(shift);
+
+ // Apply shifted mask (XOR or AND).
+ let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
+ ctx.emit(mask);
+ } else {
+ panic!("unexpected type {:?} for Fabs", output_ty);
+ }
+ }
+ }
+
+ Opcode::Fcopysign => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+
+ let ty = ty.unwrap();
+
+ // We're going to generate the following sequence:
+ //
+ // movabs $INT_MIN, tmp_gpr1
+ // mov{d,q} tmp_gpr1, tmp_xmm1
+ // movap{s,d} tmp_xmm1, dst
+ // andnp{s,d} src_1, dst
+ // movap{s,d} src_2, tmp_xmm2
+ // andp{s,d} tmp_xmm1, tmp_xmm2
+ // orp{s,d} tmp_xmm2, dst
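+            //
+            // i.e., bitwise: dst = (lhs & !SIGN_MASK) | (rhs & SIGN_MASK), where SIGN_MASK
+            // has only the sign bit set.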
+
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32);
+
+ let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
+ types::F32 => (
+ 0x8000_0000,
+ SseOpcode::Movaps,
+ SseOpcode::Andnps,
+ SseOpcode::Andps,
+ SseOpcode::Orps,
+ ),
+ types::F64 => (
+ 0x8000_0000_0000_0000,
+ SseOpcode::Movapd,
+ SseOpcode::Andnpd,
+ SseOpcode::Andpd,
+ SseOpcode::Orpd,
+ ),
+ _ => {
+ panic!("unexpected type {:?} for copysign", ty);
+ }
+ };
+
+ for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2));
+ ctx.emit(Inst::xmm_rm_r(
+ and_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ tmp_xmm2,
+ ));
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
+ }
+
+ Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
+ // TODO use ROUNDSS/ROUNDSD after sse4.1.
+
+ // Lower to VM calls when there's no access to SSE4.1.
+ let ty = ty.unwrap();
+ let libcall = match (ty, op) {
+ (types::F32, Opcode::Ceil) => LibCall::CeilF32,
+ (types::F64, Opcode::Ceil) => LibCall::CeilF64,
+ (types::F32, Opcode::Floor) => LibCall::FloorF32,
+ (types::F64, Opcode::Floor) => LibCall::FloorF64,
+ (types::F32, Opcode::Nearest) => LibCall::NearestF32,
+ (types::F64, Opcode::Nearest) => LibCall::NearestF64,
+ (types::F32, Opcode::Trunc) => LibCall::TruncF32,
+ (types::F64, Opcode::Trunc) => LibCall::TruncF64,
+ _ => panic!(
+ "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
+ ty, op
+ ),
+ };
+
+ emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
+ }
+
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32
+ | Opcode::LoadComplex
+ | Opcode::Uload8Complex
+ | Opcode::Sload8Complex
+ | Opcode::Uload16Complex
+ | Opcode::Sload16Complex
+ | Opcode::Uload32Complex
+ | Opcode::Sload32Complex => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+
+ let elem_ty = match op {
+ Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
+ types::I8
+ }
+ Opcode::Sload16
+ | Opcode::Uload16
+ | Opcode::Sload16Complex
+ | Opcode::Uload16Complex => types::I16,
+ Opcode::Sload32
+ | Opcode::Uload32
+ | Opcode::Sload32Complex
+ | Opcode::Uload32Complex => types::I32,
+ Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
+ _ => unimplemented!(),
+ };
+
+ let ext_mode = ExtMode::new(elem_ty.bits(), 64);
+
+ let sign_extend = match op {
+ Opcode::Sload8
+ | Opcode::Sload8Complex
+ | Opcode::Sload16
+ | Opcode::Sload16Complex
+ | Opcode::Sload32
+ | Opcode::Sload32Complex => true,
+ _ => false,
+ };
+
+ let amode = match op {
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32 => {
+ assert_eq!(inputs.len(), 1, "only one input for load operands");
+ lower_to_amode(ctx, inputs[0], offset)
+ }
+
+ Opcode::LoadComplex
+ | Opcode::Uload8Complex
+ | Opcode::Sload8Complex
+ | Opcode::Uload16Complex
+ | Opcode::Sload16Complex
+ | Opcode::Uload32Complex
+ | Opcode::Sload32Complex => {
+ assert_eq!(
+ inputs.len(),
+ 2,
+ "can't handle more than two inputs in complex load"
+ );
+ let base = put_input_in_reg(ctx, inputs[0]);
+ let index = put_input_in_reg(ctx, inputs[1]);
+ let shift = 0;
+ let flags = ctx.memflags(insn).expect("load should have memflags");
+ Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
+ }
+
+ _ => unreachable!(),
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+ let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
+ match (sign_extend, is_xmm) {
+ (true, false) => {
+ // The load is sign-extended only when the output size is lower than 64 bits,
+ // so ext-mode is defined in this case.
+ ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
+ }
+ (false, false) => {
+ if elem_ty.bytes() == 8 {
+ // Use a plain load.
+ ctx.emit(Inst::mov64_m_r(amode, dst))
+ } else {
+ // Use a zero-extended load.
+ ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
+ }
+ }
+ (_, true) => {
+ ctx.emit(match elem_ty {
+ types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
+ types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
+ _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
+ Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
+ } // TODO Specialize for different types: MOVUPD, MOVDQU
+ _ => unreachable!("unexpected type for load: {:?}", elem_ty),
+ });
+ }
+ }
+ }
+
+ Opcode::Store
+ | Opcode::Istore8
+ | Opcode::Istore16
+ | Opcode::Istore32
+ | Opcode::StoreComplex
+ | Opcode::Istore8Complex
+ | Opcode::Istore16Complex
+ | Opcode::Istore32Complex => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+
+ let elem_ty = match op {
+ Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
+ Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
+ Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
+ Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
+ _ => unreachable!(),
+ };
+
+ let addr = match op {
+ Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
+                    assert_eq!(inputs.len(), 2, "expected exactly two inputs for store: value and address");
+ lower_to_amode(ctx, inputs[1], offset)
+ }
+
+ Opcode::StoreComplex
+ | Opcode::Istore8Complex
+ | Opcode::Istore16Complex
+ | Opcode::Istore32Complex => {
+ assert_eq!(
+ inputs.len(),
+ 3,
+                        "can't handle more than two address inputs in complex store"
+ );
+ let base = put_input_in_reg(ctx, inputs[1]);
+ let index = put_input_in_reg(ctx, inputs[2]);
+ let shift = 0;
+ let flags = ctx.memflags(insn).expect("store should have memflags");
+ Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
+ }
+
+ _ => unreachable!(),
+ };
+
+ let src = put_input_in_reg(ctx, inputs[0]);
+
+ ctx.emit(match elem_ty {
+ types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr),
+ types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr),
+ _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
+ // TODO Specialize for different types: MOVUPD, MOVDQU, etc.
+ Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr)
+ }
+ _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr),
+ });
+ }
+
+ Opcode::AtomicRmw => {
+ // This is a simple, general-case atomic update, based on a loop involving
+ // `cmpxchg`. Note that we could do much better than this in the case where the old
+ // value at the location (that is to say, the SSA `Value` computed by this CLIF
+ // instruction) is not required. In that case, we could instead implement this
+ // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
+ // the case where the old value is required, for the `add` and `sub` cases, we can
+ // use the single instruction `lock xadd`. However, those improvements have been
+ // left for another day.
+ // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
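+            //
+            // Conceptually, the AtomicRmwSeq pseudo-instruction expands to roughly (sketch):
+            //
+            //     loop {
+            //         old = *addr;
+            //         new = op(old, arg2);
+            //         if lock_cmpxchg(addr, expected: old, new) succeeded { break }
+            //     }
+            //
+            // with the final `old` value left in %rax.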
+ let dst = get_output_reg(ctx, outputs[0]);
+ let mut addr = put_input_in_reg(ctx, inputs[0]);
+ let mut arg2 = put_input_in_reg(ctx, inputs[1]);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ // Make sure that both args are in virtual regs, since in effect we have to do a
+ // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
+ // guaranteed safe if either is in a real reg.
+ addr = ctx.ensure_in_vreg(addr, types::I64);
+ arg2 = ctx.ensure_in_vreg(arg2, types::I64);
+
+ // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
+ // operates at whatever width is specified by `ty`, so there's no need to
+ // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::r9()),
+ addr,
+ types::I64,
+ ));
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::r10()),
+ arg2,
+ types::I64,
+ ));
+
+ // Now the AtomicRmwSeq (pseudo-) instruction itself
+ let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
+ ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op });
+
+ // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+ }
+
+ Opcode::AtomicCas => {
+ // This is very similar to, but not identical to, the `AtomicRmw` case. As with
+ // `AtomicRmw`, there's no need to zero-extend narrow values here.
+ let dst = get_output_reg(ctx, outputs[0]);
+ let addr = lower_to_amode(ctx, inputs[0], 0);
+ let expected = put_input_in_reg(ctx, inputs[1]);
+ let replacement = put_input_in_reg(ctx, inputs[2]);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ // Move the expected value into %rax. Because there's only one fixed register on
+ // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
+ // `AtomicRmw` case.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ expected,
+ types::I64,
+ ));
+ ctx.emit(Inst::LockCmpxchg {
+ ty: ty_access,
+ src: replacement,
+ dst: addr.into(),
+ });
+ // And finally, copy the old value at the location to its destination reg.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+ }
+
+ Opcode::AtomicLoad => {
+ // This is a normal load. The x86-TSO memory model provides sufficient sequencing
+ // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
+ // need for any fence instructions.
+ let data = get_output_reg(ctx, outputs[0]);
+ let addr = lower_to_amode(ctx, inputs[0], 0);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ let rm = RegMem::mem(addr);
+ if ty_access == types::I64 {
+ ctx.emit(Inst::mov64_rm_r(rm, data));
+ } else {
+ let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!(
+ "invalid extension during AtomicLoad: {} -> {}",
+ ty_access.bits(),
+ 64
+ ));
+ ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data));
+ }
+ }
+
+ Opcode::AtomicStore => {
+ // This is a normal store, followed by an `mfence` instruction.
+ let data = put_input_in_reg(ctx, inputs[0]);
+ let addr = lower_to_amode(ctx, inputs[1], 0);
+ let ty_access = ctx.input_ty(insn, 0);
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr));
+ ctx.emit(Inst::Fence {
+ kind: FenceKind::MFence,
+ });
+ }
+
+ Opcode::Fence => {
+ ctx.emit(Inst::Fence {
+ kind: FenceKind::MFence,
+ });
+ }
+
+ Opcode::FuncAddr => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let (extname, _) = ctx.call_target(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ dst,
+ name: Box::new(extname),
+ offset: 0,
+ });
+ }
+
+ Opcode::SymbolValue => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ dst,
+ name: Box::new(extname),
+ offset,
+ });
+ }
+
+ Opcode::StackAddr => {
+ let (stack_slot, offset) = match *ctx.data(insn) {
+ InstructionData::StackLoad {
+ opcode: Opcode::StackAddr,
+ stack_slot,
+ offset,
+ } => (stack_slot, offset),
+ _ => unreachable!(),
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+ let offset: i32 = offset.into();
+ let inst = ctx
+ .abi()
+ .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
+ ctx.emit(inst);
+ }
+
+ Opcode::Select => {
+ let flag_input = inputs[0];
+ if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
+ let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
+
+ // For equal, we flip the operands, because we can't test a conjunction of
+ // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment.
+ let (lhs_input, rhs_input) = match cond_code {
+ FloatCC::Equal => (inputs[2], inputs[1]),
+ _ => (inputs[1], inputs[2]),
+ };
+
+ let ty = ctx.output_ty(insn, 0);
+ let rhs = put_input_in_reg(ctx, rhs_input);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 {
+                    // Special case: since the higher bits are undefined per CLIF semantics, we
+                    // can just apply a 32-bit cmove here. However, force the input into a
+                    // register rather than memory, so that the widened 4-byte cmove access
+                    // cannot read out of bounds past a narrower in-memory value.
+ RegMem::reg(put_input_in_reg(ctx, lhs_input))
+ } else {
+ input_to_reg_mem(ctx, lhs_input)
+ };
+
+ // We request inversion of Equal to NotEqual here: taking LHS if equal would mean
+ // take it if both CC::NP and CC::Z are set, the conjunction of which can't be
+ // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
+ // select operation, and invert the equal to a not-equal here.
+ let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
+
+ if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
+ // Keep this sync'd with the lowering of the select inputs above.
+ assert_eq!(cond_code, FloatCC::Equal);
+ }
+
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+
+ match fcmp_results {
+ FcmpCondResult::Condition(cc) => {
+ if is_int_or_ref_ty(ty) {
+ let size = u8::max(ty.bytes() as u8, 4);
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ } else {
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+ FcmpCondResult::AndConditions(_, _) => {
+ unreachable!(
+ "can't AND with select; see above comment about inverting equal"
+ );
+ }
+ FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
+ | FcmpCondResult::OrConditions(cc1, cc2) => {
+ if is_int_or_ref_ty(ty) {
+ let size = u8::max(ty.bytes() as u8, 4);
+ ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
+ ctx.emit(Inst::cmove(size, cc2, lhs, dst));
+ } else {
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst));
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst));
+ }
+ }
+ }
+ } else {
+ let ty = ty.unwrap();
+
+ let mut size = ty.bytes() as u8;
+ let lhs = if is_int_or_ref_ty(ty) {
+ if size < 4 {
+                        // Special case: since the higher bits are undefined per CLIF semantics, we
+                        // can just apply a 32-bit cmove here. However, force the input into a
+                        // register rather than memory, so that the widened 4-byte cmove access
+                        // cannot read out of bounds past a narrower in-memory value.
+ size = 4;
+ RegMem::reg(put_input_in_reg(ctx, inputs[1]))
+ } else {
+ input_to_reg_mem(ctx, inputs[1])
+ }
+ } else {
+ input_to_reg_mem(ctx, inputs[1])
+ };
+
+ let rhs = put_input_in_reg(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
+ emit_cmp(ctx, icmp);
+ let cond_code = ctx.data(icmp).cond_code().unwrap();
+ CC::from_intcc(cond_code)
+ } else {
+ // The input is a boolean value, compare it against zero.
+ let size = ctx.input_ty(insn, 0).bytes() as u8;
+ let test = put_input_in_reg(ctx, flag_input);
+ ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test));
+ CC::NZ
+ };
+
+ // This doesn't affect the flags.
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+
+ if is_int_or_ref_ty(ty) {
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ } else {
+ debug_assert!(ty == types::F32 || ty == types::F64);
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+ }
+
+ Opcode::Selectif | Opcode::SelectifSpectreGuard => {
+ let lhs = input_to_reg_mem(ctx, inputs[1]);
+ let rhs = put_input_in_reg(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.output_ty(insn, 0);
+
+ // Verification ensures that the input is always a single-def ifcmp.
+ let cmp_insn = ctx
+ .get_input(inputs[0].insn, inputs[0].input)
+ .inst
+ .unwrap()
+ .0;
+ debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
+ emit_cmp(ctx, cmp_insn);
+
+ let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap());
+
+ if is_int_or_ref_ty(ty) {
+ let size = ty.bytes() as u8;
+ if size == 1 {
+ // Sign-extend operands to 32, then do a cmove of size 4.
+ let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se));
+ ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst));
+ ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
+ } else {
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ }
+ } else {
+ debug_assert!(ty == types::F32 || ty == types::F64);
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+
+ Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
+ let kind = match op {
+ Opcode::Udiv => DivOrRemKind::UnsignedDiv,
+ Opcode::Sdiv => DivOrRemKind::SignedDiv,
+ Opcode::Urem => DivOrRemKind::UnsignedRem,
+ Opcode::Srem => DivOrRemKind::SignedRem,
+ _ => unreachable!(),
+ };
+ let is_div = kind.is_div();
+
+ let input_ty = ctx.input_ty(insn, 0);
+ let size = input_ty.bytes() as u8;
+
+ let dividend = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ dividend,
+ input_ty,
+ ));
+
+ if flags.avoid_div_traps() {
+ // A vcode meta-instruction is used to lower the inline checks, since they embed
+ // pc-relative offsets that must not change, thus requiring regalloc to not
+ // interfere by introducing spills and reloads.
+ //
+ // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
+ // regalloc is aware of the coalescing opportunity between rax/rdx and the
+ // destination register.
+ let divisor = put_input_in_reg(ctx, inputs[1]);
+
+ let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
+ ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));
+
+ let tmp = if op == Opcode::Sdiv && size == 8 {
+ Some(ctx.alloc_tmp(RegClass::I64, types::I64))
+ } else {
+ None
+ };
+ // TODO use xor
+ ctx.emit(Inst::imm(
+ OperandSize::Size32,
+ 0,
+ Writable::from_reg(regs::rdx()),
+ ));
+ ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
+ } else {
+ let divisor = input_to_reg_mem(ctx, inputs[1]);
+
+ // Fill in the high parts:
+ if kind.is_signed() {
+ // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
+ // signed opcodes.
+ ctx.emit(Inst::sign_extend_data(size));
+ } else if input_ty == types::I8 {
+ ctx.emit(Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::reg(regs::rax()),
+ Writable::from_reg(regs::rax()),
+ ));
+ } else {
+ // zero for unsigned opcodes.
+ ctx.emit(Inst::imm(
+ OperandSize::Size64,
+ 0,
+ Writable::from_reg(regs::rdx()),
+ ));
+ }
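+                // (For reference: in their 16/32/64-bit forms, `div`/`idiv` read the double-width
+                // dividend from rdx:rax and leave the quotient in rax and the remainder in rdx,
+                // hence the moves above and below.)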
+
+ // Emit the actual idiv.
+ ctx.emit(Inst::div(size, kind.is_signed(), divisor));
+ }
+
+ // Move the result back into the destination reg.
+ if is_div {
+ // The quotient is in rax.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
+ } else {
+ // The remainder is in rdx.
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
+ }
+ }
+
+ Opcode::Umulhi | Opcode::Smulhi => {
+ let input_ty = ctx.input_ty(insn, 0);
+ let size = input_ty.bytes() as u8;
+
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Move lhs in %rax.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ lhs,
+ input_ty,
+ ));
+
+ // Emit the actual mul or imul.
+ let signed = op == Opcode::Smulhi;
+ ctx.emit(Inst::mul_hi(size, signed, rhs));
+
+ // Read the result from the high part (stored in %rdx).
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
+ }
+
+ Opcode::GetPinnedReg => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
+ }
+
+ Opcode::SetPinnedReg => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::pinned_reg()),
+ src,
+ types::I64,
+ ));
+ }
+
+ Opcode::Vconst => {
+ let used_constant = if let &InstructionData::UnaryConst {
+ constant_handle, ..
+ } = ctx.data(insn)
+ {
+ ctx.use_constant(VCodeConstantData::Pool(
+ constant_handle,
+ ctx.get_constant_data(constant_handle).clone(),
+ ))
+ } else {
+ unreachable!("vconst should always have unary_const format")
+ };
+ // TODO use Inst::gen_constant() instead.
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::xmm_load_const(used_constant, dst, ty));
+ }
+
+ Opcode::RawBitcast => {
+ // A raw_bitcast is just a mechanism for correcting the type of V128 values (see
+ // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
+ // instruction should emit no machine code, but a move is still necessary to give the
+ // register allocator a definition for the output virtual register.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ }
+
+ Opcode::Shuffle => {
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs_ty = ctx.input_ty(insn, 0);
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let mask = match ctx.get_immediate(insn) {
+ Some(DataValue::V128(bytes)) => bytes.to_vec(),
+ _ => unreachable!("shuffle should always have a 16-byte immediate"),
+ };
+
+ // A mask-building helper: in 128-bit SIMD, mask bytes 0-15 indicate which lane to read
+ // from, and a 1 in the most significant bit zeroes the lane.
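+ // For example, shuffle index 20 selects lane 4 of the second input: below it becomes 4
+ // when both inputs are the same register, or 0x80 (zero this lane) in the lhs mask and 4
+ // in the rhs mask when they differ.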
+ let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
+
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ if rhs == lhs {
+ // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
+ // register. We statically build `constructed_mask` to zero out any unknown lane
+ // indices (this may not be strictly necessary: the verifier should reject invalid
+ // mask values) and to remap the indexes so they all point into the `dst` vector.
+ let constructed_mask = mask
+ .iter()
+ // An index greater than 15 may still refer to a lane of the second operand, which
+ // here is the same register, so wrap it back into the 0-15 range.
+ .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+ .map(zero_unknown_lane_index)
+ .collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
+ // After loading the constructed mask into a temporary register, we use it to
+ // shuffle the `dst` register (which, in this case, already holds the same value as
+ // `lhs` and `rhs`, so no second shuffle is needed).
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
+ } else {
+ // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
+ // them together. This is necessary due to PSHUFB semantics. As in the case above,
+ // we build the `constructed_mask` for each case statically.
+
+ // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
+ let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty);
+ ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
+ let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
+
+ // PSHUFB the second argument, placing zeroes for unused lanes.
+ let constructed_mask = mask
+ .iter()
+ .map(|b| b.wrapping_sub(16))
+ .map(zero_unknown_lane_index)
+ .collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
+
+ // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
+ // is not important).
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
+
+ // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
+ }
+ }
+
+ Opcode::Swizzle => {
+ // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
+ // requiring mask indexes greater than 15 to zero the corresponding output lane. For
+ // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
+ // semantics match the Wasm SIMD semantics for this instruction.
+ // The instruction format maps to variables like: %dst = swizzle %src, %mask
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
+
+ // Inform the register allocator that `src` and `dst` should be in the same register.
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ // Create a mask which, when added to the swizzle mask with unsigned saturation, forces
+ // every out-of-bounds lane index to have its top bit set.
+ let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ static ZERO_MASK_VALUE: [u8; 16] = [
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+ 0x70, 0x70,
+ ];
+ let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
+ ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
+
+ // Use the `zero_mask` on a writable `swizzle_mask`.
+ let swizzle_mask = Writable::from_reg(swizzle_mask);
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddusb,
+ RegMem::from(zero_mask),
+ swizzle_mask,
+ ));
+
+ // Shuffle `dst` using the fixed-up `swizzle_mask`.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pshufb,
+ RegMem::from(swizzle_mask),
+ dst,
+ ));
+ }
+
+ Opcode::Insertlane => {
+ // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let in_vec = put_input_in_reg(ctx, inputs[0]);
+ let src_ty = ctx.input_ty(insn, 1);
+ debug_assert!(!src_ty.is_vector());
+ let src = input_to_reg_mem(ctx, inputs[1]);
+ let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ debug_assert!(lane < ty.lane_count() as u8);
+
+ ctx.emit(Inst::gen_move(dst, in_vec, ty));
+ emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
+ }
+
+ Opcode::Extractlane => {
+ // The instruction format maps to variables like: %dst = extractlane %src, %lane
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ debug_assert!(lane < src_ty.lane_count() as u8);
+
+ if !ty.is_float() {
+ let (sse_op, w_bit) = match ty.lane_bits() {
+ 8 => (SseOpcode::Pextrb, false),
+ 16 => (SseOpcode::Pextrw, false),
+ 32 => (SseOpcode::Pextrd, false),
+ 64 => (SseOpcode::Pextrd, true),
+ _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
+ };
+ let src = RegMem::reg(src);
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
+ } else {
+ if lane == 0 {
+ // Nothing to extract for lane 0: leave the float where it is and only emit a move
+ // to define `dst`. The upper bits will remain unchanged; for correctness, this
+ // relies on Cranelift type checking to avoid using those bits.
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ } else {
+ // Otherwise, shuffle the bits in `lane` to the lowest lane.
+ let sse_op = SseOpcode::Pshufd;
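+ // The PSHUFD immediate is four 2-bit fields, one per destination 32-bit lane (bits
+ // 1:0 for lane 0, 3:2 for lane 1, and so on), each selecting a source 32-bit lane;
+ // only the low lane(s) matter here since the result is consumed as a scalar.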
+ let mask = match src_ty {
+ // Move the value at `lane` to lane 0, copying existing value at lane 0 to
+ // other lanes. Again, this relies on Cranelift type checking to avoid
+ // using those bits.
+ types::F32X4 => 0b00_00_00_00 | lane,
+ // Move the value at lane 1 (we know `lane` must be 1 because of the `if`
+ // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
+ // checking assumption also applies here.
+ types::F64X2 => 0b11_10_11_10,
+ _ => unreachable!(),
+ };
+ let src = RegMem::reg(src);
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+ }
+ }
+ }
+
+ Opcode::Splat | Opcode::LoadSplat => {
+ let ty = ty.unwrap();
+ assert_eq!(ty.bits(), 128);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert!(src_ty.bits() < 128);
+
+ let src = match op {
+ Opcode::Splat => input_to_reg_mem(ctx, inputs[0]),
+ Opcode::LoadSplat => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+ let amode = lower_to_amode(ctx, inputs[0], offset);
+ RegMem::mem(amode)
+ }
+ _ => unreachable!(),
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // We know that splat will overwrite all of the lanes of `dst` but it takes several
+ // instructions to do so. Because of the multiple instructions, there is no good way to
+ // declare `dst` a `def` except with the following pseudo-instruction.
+ ctx.emit(Inst::xmm_uninit_value(dst));
+
+ // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
+ // and VPBROADCAST*.
+ match ty.lane_bits() {
+ 8 => {
+ emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+ // Initialize a register with all 0s.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
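+ // With an all-zero mask, PSHUFB copies source byte 0 into every destination byte,
+ // which broadcasts the lane inserted above.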
+ // Shuffle the lowest byte lane to all other lanes.
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
+ }
+ 16 => {
+ emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
+ emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+ // Shuffle the lowest two lanes to all other lanes.
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Pshufd,
+ RegMem::from(dst),
+ dst,
+ 0,
+ false,
+ ))
+ }
+ 32 => {
+ emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+ // Shuffle the lowest lane to all other lanes.
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Pshufd,
+ RegMem::from(dst),
+ dst,
+ 0,
+ false,
+ ))
+ }
+ 64 => {
+ emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
+ emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+ }
+ _ => panic!("Invalid type to splat: {}", ty),
+ }
+ }
+
+ Opcode::VanyTrue => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ // Set the ZF if the result is all zeroes.
+ ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
+ // If the ZF is not set, place a 1 in `dst`.
+ ctx.emit(Inst::setcc(CC::NZ, dst));
+ }
+
+ Opcode::VallTrue => {
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = input_to_reg_mem(ctx, inputs[0]);
+
+ let eq = |ty: Type| match ty.lane_bits() {
+ 8 => SseOpcode::Pcmpeqb,
+ 16 => SseOpcode::Pcmpeqw,
+ 32 => SseOpcode::Pcmpeqd,
+ 64 => SseOpcode::Pcmpeqq,
+ _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
+ };
+
+ // Initialize a register with all 0s.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+ // Compare against the zeroed register: lanes of `src` that are all zero become all 1s in `tmp`.
+ ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
+ // Set the ZF if the result is all zeroes.
+ ctx.emit(Inst::xmm_cmp_rm_r(
+ SseOpcode::Ptest,
+ RegMem::from(tmp),
+ tmp.to_reg(),
+ ));
+ // If the ZF is set, place a 1 in `dst`.
+ ctx.emit(Inst::setcc(CC::Z, dst));
+ }
+
+ Opcode::VhighBits => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
+ let dst = get_output_reg(ctx, outputs[0]);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+
+ // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
+ // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
+ // the instruction can access additional registers when used with a REX.R prefix. The
+ // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
+ // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
+ // unnecessary (`OperandSize` is used for setting/clearing REX.W).
+ let size = OperandSize::Size32;
+
+ match src_ty {
+ types::I8X16 | types::B8X16 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
+ }
+ types::I32X4 | types::B32X4 | types::F32X4 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
+ }
+ types::I64X2 | types::B64X2 | types::F64X2 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
+ }
+ types::I16X8 | types::B16X8 => {
+ // There is no x86 instruction for extracting the high bit of 16-bit lanes so
+ // here we:
+ // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
+ // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+ // - use PMOVMSKB to gather the high bits; the low and high bytes of the 16-bit result
+ // now duplicate each other
+ // - shift right by 8 to drop one copy of the duplicated mask.
+ let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
+ ctx.emit(Inst::gen_move(tmp, src, src_ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Pmovmskb,
+ tmp.to_reg(),
+ dst,
+ size,
+ ));
+ ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
+ }
+ _ => unimplemented!("unknown input type {} for {}", src_ty, op),
+ }
+ }
+
+ Opcode::IaddImm
+ | Opcode::ImulImm
+ | Opcode::UdivImm
+ | Opcode::SdivImm
+ | Opcode::UremImm
+ | Opcode::SremImm
+ | Opcode::IrsubImm
+ | Opcode::IaddCin
+ | Opcode::IaddIfcin
+ | Opcode::IaddCout
+ | Opcode::IaddCarry
+ | Opcode::IaddIfcarry
+ | Opcode::IsubBin
+ | Opcode::IsubIfbin
+ | Opcode::IsubBout
+ | Opcode::IsubIfbout
+ | Opcode::IsubBorrow
+ | Opcode::IsubIfborrow
+ | Opcode::BandImm
+ | Opcode::BorImm
+ | Opcode::BxorImm
+ | Opcode::RotlImm
+ | Opcode::RotrImm
+ | Opcode::IshlImm
+ | Opcode::UshrImm
+ | Opcode::SshrImm => {
+ panic!("ALU+imm and ALU+carry ops should not appear here!");
+ }
+ _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
+ }
+
+ Ok(())
+}
+
+//=============================================================================
+// Lowering-backend trait implementation.
+
+impl LowerBackend for X64Backend {
+ type MInst = Inst;
+
+ fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
+ lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple)
+ }
+
+ fn lower_branch_group<C: LowerCtx<I = Inst>>(
+ &self,
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+ ) -> CodegenResult<()> {
+ // A block should end with at most two branches. The first may be a
+ // conditional branch; a conditional branch can be followed only by an
+ // unconditional branch or fallthrough. Otherwise, if only one branch,
+ // it may be an unconditional branch, a fallthrough, a return, or a
+ // trap. These conditions are verified by `is_ebb_basic()` during the
+ // verifier pass.
+ assert!(branches.len() <= 2);
+
+ if branches.len() == 2 {
+ // Must be a conditional branch followed by an unconditional branch.
+ let op0 = ctx.data(branches[0]).opcode();
+ let op1 = ctx.data(branches[1]).opcode();
+
+ trace!(
+ "lowering two-branch group: opcodes are {:?} and {:?}",
+ op0,
+ op1
+ );
+ assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
+
+ let taken = targets[0];
+ let not_taken = match op1 {
+ Opcode::Jump => targets[1],
+ Opcode::Fallthrough => fallthrough.unwrap(),
+ _ => unreachable!(), // assert above.
+ };
+
+ match op0 {
+ Opcode::Brz | Opcode::Brnz => {
+ let flag_input = InsnInput {
+ insn: branches[0],
+ input: 0,
+ };
+
+ let src_ty = ctx.input_ty(branches[0], 0);
+
+ if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
+ emit_cmp(ctx, icmp);
+
+ let cond_code = ctx.data(icmp).cond_code().unwrap();
+ let cond_code = if op0 == Opcode::Brz {
+ cond_code.inverse()
+ } else {
+ cond_code
+ };
+
+ let cc = CC::from_intcc(cond_code);
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
+ let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
+ let cond_code = if op0 == Opcode::Brz {
+ cond_code.inverse()
+ } else {
+ cond_code
+ };
+ match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
+ ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ ctx.emit(Inst::jmp_if(cc1, taken));
+ ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ }
+ } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
+ let src = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ );
+ let cc = match op0 {
+ Opcode::Brz => CC::Z,
+ Opcode::Brnz => CC::NZ,
+ _ => unreachable!(),
+ };
+ let size_bytes = src_ty.bytes() as u8;
+ ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src));
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else {
+ unimplemented!("brz/brnz with non-int type {:?}", src_ty);
+ }
+ }
+
+ Opcode::BrIcmp => {
+ let src_ty = ctx.input_ty(branches[0], 0);
+ if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
+ let lhs = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ );
+ let rhs = input_to_reg_mem_imm(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 1,
+ },
+ );
+ let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap());
+ let byte_size = src_ty.bytes() as u8;
+ // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
+ // us dst - src at the machine instruction level, so invert operands.
+ ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs));
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else {
+ unimplemented!("bricmp with non-int type {:?}", src_ty);
+ }
+ }
+
+ _ => panic!("unexpected branch opcode: {:?}", op0),
+ }
+ } else {
+ assert_eq!(branches.len(), 1);
+
+ // Must be an unconditional branch or trap.
+ let op = ctx.data(branches[0]).opcode();
+ match op {
+ Opcode::Jump | Opcode::Fallthrough => {
+ ctx.emit(Inst::jmp_known(targets[0]));
+ }
+
+ Opcode::BrTable => {
+ let jt_size = targets.len() - 1;
+ assert!(jt_size <= u32::max_value() as usize);
+ let jt_size = jt_size as u32;
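+ // `targets[0]` is the default target; the remaining entries form the jump table
+ // proper, hence the `- 1` above and the `skip(1)` below.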
+
+ let idx = extend_input_to_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ ExtSpec::ZeroExtendTo32,
+ );
+
+ // Bounds-check (compute flags from idx - jt_size) and branch to default.
+ ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx));
+
+ // Emit the compound instruction that does:
+ //
+ // lea $jt, %rA
+ // movsbl [%rA, %rIndex, 2], %rB
+ // add %rB, %rA
+ // j *%rA
+ // [jt entries]
+ //
+ // This must be *one* instruction in the vcode because we cannot allow regalloc
+ // to insert any spills/fills in the middle of the sequence; otherwise, the
+ // lea PC-rel offset to the jumptable would be incorrect. (The alternative
+ // is to introduce a relocation pass for inlined jumptables, which is much
+ // worse.)
+
+ // This temporary is used as a signed integer of 64-bits (to hold addresses).
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ // This temporary is used as a signed integer of 32-bits (for the wasm-table
+ // index) and then 64-bits (address addend). The small lie about the I64 type
+ // is benign, since the temporary is dead after this instruction (and its
+ // Cranelift type is thus unused).
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ let targets_for_term: Vec<MachLabel> = targets.to_vec();
+ let default_target = targets[0];
+
+ let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect();
+
+ ctx.emit(Inst::JmpTableSeq {
+ idx,
+ tmp1,
+ tmp2,
+ default_target,
+ targets: jt_targets,
+ targets_for_term,
+ });
+ }
+
+ _ => panic!("Unknown branch type {:?}", op),
+ }
+ }
+
+ Ok(())
+ }
+
+ fn maybe_pinned_reg(&self) -> Option<Reg> {
+ Some(regs::pinned_reg())
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs
new file mode 100644
index 0000000000..fd4444498d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs
@@ -0,0 +1,149 @@
+//! x86-64 Instruction Set Architecture.
+
+use self::inst::EmitInfo;
+
+use super::TargetIsa;
+use crate::ir::{condcodes::IntCC, Function};
+use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings};
+use crate::isa::Builder as IsaBuilder;
+use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
+use crate::result::CodegenResult;
+use crate::settings::{self as shared_settings, Flags};
+use alloc::boxed::Box;
+use regalloc::{PrettyPrint, RealRegUniverse};
+use target_lexicon::Triple;
+
+mod abi;
+mod inst;
+mod lower;
+mod settings;
+
+/// An X64 backend.
+pub(crate) struct X64Backend {
+ triple: Triple,
+ flags: Flags,
+ x64_flags: x64_settings::Flags,
+ reg_universe: RealRegUniverse,
+}
+
+impl X64Backend {
+ /// Create a new X64 backend with the given (shared) flags.
+ fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self {
+ let reg_universe = create_reg_universe_systemv(&flags);
+ Self {
+ triple,
+ flags,
+ x64_flags,
+ reg_universe,
+ }
+ }
+
+ fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> {
+ // This performs lowering to VCode, register-allocates the code, computes
+ // block layout and finalizes branches. The result is ready for binary emission.
+ let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone());
+ let abi = Box::new(abi::X64ABICallee::new(&func, flags)?);
+ compile::compile::<Self>(&func, self, abi, emit_info)
+ }
+}
+
+impl MachBackend for X64Backend {
+ fn compile_function(
+ &self,
+ func: &Function,
+ want_disasm: bool,
+ ) -> CodegenResult<MachCompileResult> {
+ let flags = self.flags();
+ let vcode = self.compile_vcode(func, flags.clone())?;
+
+ let buffer = vcode.emit();
+ let buffer = buffer.finish();
+ let frame_size = vcode.frame_size();
+ let unwind_info = vcode.unwind_info()?;
+
+ let disasm = if want_disasm {
+ Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags))))
+ } else {
+ None
+ };
+
+ Ok(MachCompileResult {
+ buffer,
+ frame_size,
+ disasm,
+ unwind_info,
+ })
+ }
+
+ fn flags(&self) -> &Flags {
+ &self.flags
+ }
+
+ fn name(&self) -> &'static str {
+ "x64"
+ }
+
+ fn triple(&self) -> Triple {
+ self.triple.clone()
+ }
+
+ fn reg_universe(&self) -> &RealRegUniverse {
+ &self.reg_universe
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> IntCC {
+ // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
+ // overflow of an add.
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> IntCC {
+ // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
+ // underflow of a subtract (carry is borrow for subtract).
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ #[cfg(feature = "unwind")]
+ fn emit_unwind_info(
+ &self,
+ result: &MachCompileResult,
+ kind: crate::machinst::UnwindInfoKind,
+ ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+ use crate::isa::unwind::UnwindInfo;
+ use crate::machinst::UnwindInfoKind;
+ Ok(match (result.unwind_info.as_ref(), kind) {
+ (Some(info), UnwindInfoKind::SystemV) => {
+ inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV)
+ }
+ (Some(_info), UnwindInfoKind::Windows) => {
+ //TODO inst::unwind::winx64::create_unwind_info(info.clone())?.map(|u| UnwindInfo::WindowsX64(u))
+ None
+ }
+ _ => None,
+ })
+ }
+
+ #[cfg(feature = "unwind")]
+ fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+ Some(inst::unwind::systemv::create_cie())
+ }
+}
+
+/// Create a new `isa::Builder`.
+pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
+ IsaBuilder {
+ triple,
+ setup: x64_settings::builder(),
+ constructor: isa_constructor,
+ }
+}
+
+fn isa_constructor(
+ triple: Triple,
+ shared_flags: Flags,
+ builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+ let isa_flags = x64_settings::Flags::new(&shared_flags, builder);
+ let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
+ Box::new(TargetIsaAdapter::new(backend))
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs
new file mode 100644
index 0000000000..c5371bb132
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs
@@ -0,0 +1,9 @@
+//! x86 Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/x86/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs"));
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs
new file mode 100644
index 0000000000..5119bb3241
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs
@@ -0,0 +1,1093 @@
+//! x86 ABI implementation.
+
+use super::super::settings as shared_settings;
+use super::registers::{FPR, GPR, RU};
+use super::settings as isa_settings;
+use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
+use crate::cursor::{Cursor, CursorPosition, EncCursor};
+use crate::ir;
+use crate::ir::immediates::Imm64;
+use crate::ir::stackslot::{StackOffset, StackSize};
+use crate::ir::types;
+use crate::ir::{
+ get_probestack_funcref, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, InstBuilder,
+ ValueLoc,
+};
+use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa};
+use crate::regalloc::RegisterSet;
+use crate::result::CodegenResult;
+use crate::stack_layout::layout_stack;
+use alloc::borrow::Cow;
+use core::i32;
+use target_lexicon::{PointerWidth, Triple};
+
+/// Argument registers for x86-64
+static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9];
+
+/// Return value registers.
+static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx];
+
+/// Argument registers for x86-64, when using windows fastcall
+static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9];
+
+/// Return value registers for x86-64, when using windows fastcall
+static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
+
+/// The win64 fastcall ABI uses some shadow stack space, allocated by the caller, that can be used
+/// by the callee for temporary values.
+///
+/// [1] "Space is allocated on the call stack as a shadow store for callees to save" This shadow
+/// store contains the parameters which are passed through registers (ARG_GPRS) and is eventually
+/// used by the callee to save & restore the values of the arguments.
+///
+/// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
+/// convention reserves spill space for parameters, you don’t have to use them as such"
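+///
+/// (The 32 bytes correspond to the four register-passed argument slots, rcx/rdx/r8/r9, at 8
+/// bytes each.)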
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
+
+/// Stack alignment requirement for functions.
+///
+/// 16 bytes is the perfect stack alignment, because:
+///
+/// - On Win64, "The primary exceptions are the stack pointer and malloc or alloca memory, which
+/// are aligned to 16 bytes in order to aid performance".
+/// - The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but newer versions use a
+/// 16-byte aligned stack pointer.
+/// - This allows using aligned loads and stores on SIMD vectors of 16 bytes that are located
+/// higher up in the stack.
+const STACK_ALIGNMENT: u32 = 16;
+
+#[derive(Clone)]
+struct Args {
+ pointer_bytes: u8,
+ pointer_bits: u8,
+ pointer_type: ir::Type,
+ gpr: &'static [RU],
+ gpr_used: usize,
+ fpr_limit: usize,
+ fpr_used: usize,
+ offset: u32,
+ call_conv: CallConv,
+ shared_flags: shared_settings::Flags,
+ #[allow(dead_code)]
+ isa_flags: isa_settings::Flags,
+ assigning_returns: bool,
+}
+
+impl Args {
+ fn new(
+ bits: u8,
+ gpr: &'static [RU],
+ fpr_limit: usize,
+ call_conv: CallConv,
+ shared_flags: &shared_settings::Flags,
+ isa_flags: &isa_settings::Flags,
+ assigning_returns: bool,
+ ) -> Self {
+ let offset = if call_conv.extends_windows_fastcall() {
+ WIN_SHADOW_STACK_SPACE
+ } else {
+ 0
+ };
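+ // Under fastcall, stack arguments are laid out after the caller-allocated 32-byte shadow
+ // space, so start assigning stack offsets there.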
+
+ Self {
+ pointer_bytes: bits / 8,
+ pointer_bits: bits,
+ pointer_type: ir::Type::int(u16::from(bits)).unwrap(),
+ gpr,
+ gpr_used: 0,
+ fpr_limit,
+ fpr_used: 0,
+ offset,
+ call_conv,
+ shared_flags: shared_flags.clone(),
+ isa_flags: isa_flags.clone(),
+ assigning_returns,
+ }
+ }
+}
+
+impl ArgAssigner for Args {
+ fn assign(&mut self, arg: &AbiParam) -> ArgAction {
+ if let ArgumentPurpose::StructArgument(size) = arg.purpose {
+ if self.call_conv != CallConv::SystemV {
+ panic!(
+ "The sarg argument purpose is not yet implemented for non-systemv call conv {:?}",
+ self.call_conv,
+ );
+ }
+ let loc = ArgumentLoc::Stack(self.offset as i32);
+ self.offset += size;
+ debug_assert!(self.offset <= i32::MAX as u32);
+ return ArgAction::AssignAndChangeType(loc, types::SARG_T);
+ }
+
+ let ty = arg.value_type;
+
+ if ty.bits() > u16::from(self.pointer_bits) {
+ if !self.assigning_returns && self.call_conv.extends_windows_fastcall() {
+ // "Any argument that doesn't fit in 8 bytes, or isn't
+ // 1, 2, 4, or 8 bytes, must be passed by reference"
+ return ValueConversion::Pointer(self.pointer_type).into();
+ } else if !ty.is_vector() && !ty.is_float() {
+ // On SystemV large integers and booleans are broken down to fit in a register.
+ return ValueConversion::IntSplit.into();
+ }
+ }
+
+ // Vectors should stay in vector registers unless SIMD is disabled, in which case they are split.
+ if ty.is_vector() {
+ if self.shared_flags.enable_simd() {
+ let reg = FPR.unit(self.fpr_used);
+ self.fpr_used += 1;
+ return ArgumentLoc::Reg(reg).into();
+ }
+ return ValueConversion::VectorSplit.into();
+ }
+
+ // Small integers are extended to the size of a pointer register.
+ if ty.is_int() && ty.bits() < u16::from(self.pointer_bits) {
+ match arg.extension {
+ ArgumentExtension::None => {}
+ ArgumentExtension::Uext => return ValueConversion::Uext(self.pointer_type).into(),
+ ArgumentExtension::Sext => return ValueConversion::Sext(self.pointer_type).into(),
+ }
+ }
+
+ // Handle special-purpose arguments.
+ if ty.is_int() && self.call_conv.extends_baldrdash() {
+ match arg.purpose {
+ // This is SpiderMonkey's `WasmTlsReg`.
+ ArgumentPurpose::VMContext => {
+ return ArgumentLoc::Reg(if self.pointer_bits == 64 {
+ RU::r14
+ } else {
+ RU::rsi
+ } as RegUnit)
+ .into();
+ }
+ // This is SpiderMonkey's `WasmTableCallSigReg`.
+ ArgumentPurpose::SignatureId => {
+ return ArgumentLoc::Reg(if self.pointer_bits == 64 {
+ RU::r10
+ } else {
+ RU::rcx
+ } as RegUnit)
+ .into()
+ }
+ _ => {}
+ }
+ }
+
+ // Try to use a GPR.
+ if !ty.is_float() && self.gpr_used < self.gpr.len() {
+ let reg = self.gpr[self.gpr_used] as RegUnit;
+ self.gpr_used += 1;
+ return ArgumentLoc::Reg(reg).into();
+ }
+
+ // Try to use an FPR.
+ let fpr_offset = if self.call_conv.extends_windows_fastcall() {
+ // Float and general registers on windows share the same parameter index.
+ // The used register depends entirely on the parameter index: Even if XMM0
+ // is not used for the first parameter, it cannot be used for the second parameter.
+ debug_assert_eq!(self.fpr_limit, self.gpr.len());
+ &mut self.gpr_used
+ } else {
+ &mut self.fpr_used
+ };
+
+ if ty.is_float() && *fpr_offset < self.fpr_limit {
+ let reg = FPR.unit(*fpr_offset);
+ *fpr_offset += 1;
+ return ArgumentLoc::Reg(reg).into();
+ }
+
+ // Assign a stack location.
+ let loc = ArgumentLoc::Stack(self.offset as i32);
+ self.offset += u32::from(self.pointer_bytes);
+ debug_assert!(self.offset <= i32::MAX as u32);
+ loc.into()
+ }
+}
+
+/// Legalize `sig`.
+pub fn legalize_signature(
+ sig: &mut Cow<ir::Signature>,
+ triple: &Triple,
+ _current: bool,
+ shared_flags: &shared_settings::Flags,
+ isa_flags: &isa_settings::Flags,
+) {
+ let bits;
+ let mut args;
+
+ match triple.pointer_width().unwrap() {
+ PointerWidth::U16 => panic!(),
+ PointerWidth::U32 => {
+ bits = 32;
+ args = Args::new(bits, &[], 0, sig.call_conv, shared_flags, isa_flags, false);
+ }
+ PointerWidth::U64 => {
+ bits = 64;
+ args = if sig.call_conv.extends_windows_fastcall() {
+ Args::new(
+ bits,
+ &ARG_GPRS_WIN_FASTCALL_X64[..],
+ 4,
+ sig.call_conv,
+ shared_flags,
+ isa_flags,
+ false,
+ )
+ } else {
+ Args::new(
+ bits,
+ &ARG_GPRS[..],
+ 8,
+ sig.call_conv,
+ shared_flags,
+ isa_flags,
+ false,
+ )
+ };
+ }
+ }
+
+ let (ret_regs, ret_fpr_limit) = if sig.call_conv.extends_windows_fastcall() {
+ // windows-x64 calling convention only uses XMM0 or RAX for return values
+ (&RET_GPRS_WIN_FASTCALL_X64[..], 1)
+ } else {
+ (&RET_GPRS[..], 2)
+ };
+
+ let mut rets = Args::new(
+ bits,
+ ret_regs,
+ ret_fpr_limit,
+ sig.call_conv,
+ shared_flags,
+ isa_flags,
+ true,
+ );
+
+ // If we don't have enough available return registers
+ // to fit all of the return values, we need to backtrack and start
+ // assigning locations all over again with a different strategy. In order to
+ // do that, we need a copy of the original assigner for the returns.
+ let mut backup_rets = rets.clone();
+
+ if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
+ if new_returns
+ .iter()
+ .filter(|r| r.purpose == ArgumentPurpose::Normal)
+ .any(|r| !r.location.is_reg())
+ {
+ // The return values couldn't all fit into available return
+ // registers. Introduce the use of a struct-return parameter.
+ debug_assert!(!sig.uses_struct_return_param());
+
+ // We're using the first register for the return pointer parameter.
+ let mut ret_ptr_param = AbiParam {
+ value_type: args.pointer_type,
+ purpose: ArgumentPurpose::StructReturn,
+ extension: ArgumentExtension::None,
+ location: ArgumentLoc::Unassigned,
+ legalized_to_pointer: false,
+ };
+ match args.assign(&ret_ptr_param) {
+ ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
+ ret_ptr_param.location = ArgumentLoc::Reg(reg);
+ sig.to_mut().params.push(ret_ptr_param);
+ }
+ _ => unreachable!("return pointer should always get a register assignment"),
+ }
+
+ // We're using the first return register for the return pointer (like
+ // sys v does).
+ let mut ret_ptr_return = AbiParam {
+ value_type: args.pointer_type,
+ purpose: ArgumentPurpose::StructReturn,
+ extension: ArgumentExtension::None,
+ location: ArgumentLoc::Unassigned,
+ legalized_to_pointer: false,
+ };
+ match backup_rets.assign(&ret_ptr_return) {
+ ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
+ ret_ptr_return.location = ArgumentLoc::Reg(reg);
+ sig.to_mut().returns.push(ret_ptr_return);
+ }
+ _ => unreachable!("return pointer should always get a register assignment"),
+ }
+
+ sig.to_mut().returns.retain(|ret| {
+ // Either this is the return pointer, in which case we want to keep
+ // it, or else assume that it is assigned for a reason and doesn't
+ // conflict with our return-pointer legalization.
+ debug_assert_eq!(
+ ret.location.is_assigned(),
+ ret.purpose != ArgumentPurpose::Normal
+ );
+ ret.location.is_assigned()
+ });
+
+ if let Some(new_returns) = legalize_args(&sig.returns, &mut backup_rets) {
+ sig.to_mut().returns = new_returns;
+ }
+ } else {
+ sig.to_mut().returns = new_returns;
+ }
+ }
+
+ if let Some(new_params) = legalize_args(&sig.params, &mut args) {
+ sig.to_mut().params = new_params;
+ }
+}
+
+/// Get register class for a type appearing in a legalized signature.
+pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass {
+ if ty.is_int() || ty.is_bool() || ty.is_ref() {
+ GPR
+ } else {
+ FPR
+ }
+}
+
+/// Get the set of allocatable registers for `func`.
+pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> RegisterSet {
+ let mut regs = RegisterSet::new();
+ regs.take(GPR, RU::rsp as RegUnit);
+ regs.take(GPR, RU::rbp as RegUnit);
+
+ // 32-bit arch only has 8 registers.
+ if triple.pointer_width().unwrap() != PointerWidth::U64 {
+ for i in 8..16 {
+ regs.take(GPR, GPR.unit(i));
+ regs.take(FPR, FPR.unit(i));
+ }
+ if flags.enable_pinned_reg() {
+ unimplemented!("Pinned register not implemented on x86-32.");
+ }
+ } else {
+ // Choose r15 as the pinned register on 64-bits: it is non-volatile on native ABIs and
+ // isn't the fixed output register of any instruction.
+ if flags.enable_pinned_reg() {
+ regs.take(GPR, RU::r15 as RegUnit);
+ }
+ }
+
+ regs
+}
+
+/// Get the set of callee-saved general-purpose registers.
+fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] {
+ match isa.triple().pointer_width().unwrap() {
+ PointerWidth::U16 => panic!(),
+ PointerWidth::U32 => &[RU::rbx, RU::rsi, RU::rdi],
+ PointerWidth::U64 => {
+ if call_conv.extends_windows_fastcall() {
+ // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 are
+ // considered nonvolatile and must be saved and restored by a function that uses
+ // them."
+ // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
+ // RSP & RBP are not listed below, since they are restored automatically during
+ // a function call. If that wasn't the case, function calls (RET) would not work.
+ &[
+ RU::rbx,
+ RU::rdi,
+ RU::rsi,
+ RU::r12,
+ RU::r13,
+ RU::r14,
+ RU::r15,
+ ]
+ } else {
+ &[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15]
+ }
+ }
+ }
+}
+
+/// Get the set of callee-saved floating-point (SIMD) registers.
+fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] {
+ match isa.triple().pointer_width().unwrap() {
+ PointerWidth::U16 => panic!(),
+ PointerWidth::U32 => &[],
+ PointerWidth::U64 => {
+ if call_conv.extends_windows_fastcall() {
+ // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved
+ // and restored by a function that uses them."
+ // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of
+ // February 5th, 2020.
+ &[
+ RU::xmm6,
+ RU::xmm7,
+ RU::xmm8,
+ RU::xmm9,
+ RU::xmm10,
+ RU::xmm11,
+ RU::xmm12,
+ RU::xmm13,
+ RU::xmm14,
+ RU::xmm15,
+ ]
+ } else {
+ &[]
+ }
+ }
+ }
+}
+
+/// Get the set of callee-saved registers that are used.
+fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet {
+ let mut all_callee_saved = RegisterSet::empty();
+ for reg in callee_saved_gprs(isa, func.signature.call_conv) {
+ all_callee_saved.free(GPR, *reg as RegUnit);
+ }
+ for reg in callee_saved_fprs(isa, func.signature.call_conv) {
+ all_callee_saved.free(FPR, *reg as RegUnit);
+ }
+
+ let mut used = RegisterSet::empty();
+ for value_loc in func.locations.values() {
+ // Note that `value_loc` here contains only a single unit of a potentially multi-unit
+ // register. We don't use registers that overlap each other in the x86 ISA, but in others
+ // we do. So this should not be blindly reused.
+ if let ValueLoc::Reg(ru) = *value_loc {
+ if GPR.contains(ru) {
+ if !used.is_avail(GPR, ru) {
+ used.free(GPR, ru);
+ }
+ } else if FPR.contains(ru) {
+ if !used.is_avail(FPR, ru) {
+ used.free(FPR, ru);
+ }
+ }
+ }
+ }
+
+ // regmove and regfill instructions may temporarily divert values into other registers,
+ // and these are not reflected in `func.locations`. Scan the function for such instructions
+ // and note which callee-saved registers they use.
+ //
+ // TODO: Consider re-evaluating how regmove/regfill/regspill work and whether it's possible
+ // to avoid this step.
+ for block in &func.layout {
+ for inst in func.layout.block_insts(block) {
+ match func.dfg[inst] {
+ ir::instructions::InstructionData::RegMove { dst, .. }
+ | ir::instructions::InstructionData::RegFill { dst, .. } => {
+ if GPR.contains(dst) {
+ if !used.is_avail(GPR, dst) {
+ used.free(GPR, dst);
+ }
+ } else if FPR.contains(dst) {
+ if !used.is_avail(FPR, dst) {
+ used.free(FPR, dst);
+ }
+ }
+ }
+ _ => (),
+ }
+ }
+ }
+
+ used.intersect(&all_callee_saved);
+ used
+}
+
+pub fn prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
+ match func.signature.call_conv {
+ // For now, just translate fast and cold as system_v.
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
+ system_v_prologue_epilogue(func, isa)
+ }
+ CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa),
+ CallConv::BaldrdashSystemV | CallConv::BaldrdashWindows => {
+ baldrdash_prologue_epilogue(func, isa)
+ }
+ CallConv::Probestack => unimplemented!("probestack calling convention"),
+ CallConv::Baldrdash2020 => unimplemented!("Baldrdash ABI 2020"),
+ }
+}
+
+fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
+ debug_assert!(
+ !isa.flags().enable_probestack(),
+ "baldrdash does not expect cranelift to emit stack probes"
+ );
+
+ let word_size = StackSize::from(isa.pointer_bytes());
+ let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
+ WIN_SHADOW_STACK_SPACE
+ } else {
+ 0
+ };
+
+ let bytes =
+ StackSize::from(isa.flags().baldrdash_prologue_words()) * word_size + shadow_store_size;
+
+ let mut ss = ir::StackSlotData::new(ir::StackSlotKind::IncomingArg, bytes);
+ ss.offset = Some(-(bytes as StackOffset));
+ func.stack_slots.push(ss);
+
+ let is_leaf = func.is_leaf();
+ layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)?;
+ Ok(())
+}
+
+/// Implementation of the fastcall-based Win64 calling convention described at [1]
+/// [1] https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
+fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
+ if isa.triple().pointer_width().unwrap() != PointerWidth::U64 {
+ panic!("TODO: windows-fastcall: x86-32 not implemented yet");
+ }
+
+ // The reserved stack area is composed of:
+ // return address + frame pointer + all callee-saved registers
+ //
+ // Pushing the return address is an implicit function of the `call`
+ // instruction. Each of the others we will then push explicitly. Then we
+ // will adjust the stack pointer to make room for the rest of the required
+ // space for this frame.
+ let csrs = callee_saved_regs_used(isa, func);
+ let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
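+ // The `+ 2` accounts for the return address pushed by `call` and the frame pointer pushed
+ // in the prologue, as described above.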
+ let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+ let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
+
+ // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+ if fpsr_stack_size > 0 {
+ csr_stack_size = (csr_stack_size + 15) & !15;
+ }
+
+ func.create_stack_slot(ir::StackSlotData {
+ kind: ir::StackSlotKind::IncomingArg,
+ size: csr_stack_size,
+ offset: Some(-(csr_stack_size as StackOffset)),
+ });
+
+ let is_leaf = func.is_leaf();
+
+ // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+ if !is_leaf {
+ // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+ func.create_stack_slot(ir::StackSlotData {
+ kind: ir::StackSlotKind::ExplicitSlot,
+ size: WIN_SHADOW_STACK_SPACE,
+ offset: None,
+ });
+ }
+
+ let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
+
+ // Subtract the GPR saved register size from the local size because pushes are used for the saves
+ let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
+
+ // Add CSRs to function signature
+ let reg_type = isa.pointer_type();
+ let sp_arg_index = if fpsr_stack_size > 0 {
+ let sp_arg = ir::AbiParam::special_reg(
+ reg_type,
+ ir::ArgumentPurpose::CalleeSaved,
+ RU::rsp as RegUnit,
+ );
+ let index = func.signature.params.len();
+ func.signature.params.push(sp_arg);
+ Some(index)
+ } else {
+ None
+ };
+ let fp_arg = ir::AbiParam::special_reg(
+ reg_type,
+ ir::ArgumentPurpose::FramePointer,
+ RU::rbp as RegUnit,
+ );
+ func.signature.params.push(fp_arg);
+ func.signature.returns.push(fp_arg);
+
+ for gp_csr in csrs.iter(GPR) {
+ let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, gp_csr);
+ func.signature.params.push(csr_arg);
+ func.signature.returns.push(csr_arg);
+ }
+
+ for fp_csr in csrs.iter(FPR) {
+ // The calling convention described in
+ // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires
+ // preserving the low 128 bits of XMM6-XMM15.
+ let csr_arg =
+ ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr);
+ func.signature.params.push(csr_arg);
+ func.signature.returns.push(csr_arg);
+ }
+
+ // Set up the cursor and insert the prologue
+ let entry_block = func.layout.entry_block().expect("missing entry block");
+ let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
+ insert_common_prologue(
+ &mut pos,
+ local_stack_size,
+ reg_type,
+ &csrs,
+ sp_arg_index.is_some(),
+ isa,
+ );
+
+ // Reset the cursor and insert the epilogue
+ let mut pos = pos.at_position(CursorPosition::Nowhere);
+ insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
+
+ Ok(())
+}
+
+/// Insert a System V-compatible prologue and epilogue.
+fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
+ let pointer_width = isa.triple().pointer_width().unwrap();
+ let word_size = pointer_width.bytes() as usize;
+
+ let csrs = callee_saved_regs_used(isa, func);
+ assert!(
+ csrs.iter(FPR).len() == 0,
+ "SysV ABI does not have callee-save SIMD registers"
+ );
+
+ // The reserved stack area is composed of:
+ // return address + frame pointer + all callee-saved registers
+ //
+ // Pushing the return address is an implicit function of the `call`
+ // instruction. Each of the others we will then push explicitly. Then we
+ // will adjust the stack pointer to make room for the rest of the required
+ // space for this frame.
+ let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+ func.create_stack_slot(ir::StackSlotData {
+ kind: ir::StackSlotKind::IncomingArg,
+ size: csr_stack_size as u32,
+ offset: Some(-csr_stack_size),
+ });
+
+ let is_leaf = func.is_leaf();
+ let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
+ let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+ // Add CSRs to function signature
+ let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
+ // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+ // to extract vmctx from the stack before we can save the frame pointer.
+ let sp_arg_index = if isa.pointer_bits() == 32 {
+ let sp_arg = ir::AbiParam::special_reg(
+ reg_type,
+ ir::ArgumentPurpose::CalleeSaved,
+ RU::rsp as RegUnit,
+ );
+ let index = func.signature.params.len();
+ func.signature.params.push(sp_arg);
+ Some(index)
+ } else {
+ None
+ };
+ let fp_arg = ir::AbiParam::special_reg(
+ reg_type,
+ ir::ArgumentPurpose::FramePointer,
+ RU::rbp as RegUnit,
+ );
+ func.signature.params.push(fp_arg);
+ func.signature.returns.push(fp_arg);
+
+ for csr in csrs.iter(GPR) {
+ let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr);
+ func.signature.params.push(csr_arg);
+ func.signature.returns.push(csr_arg);
+ }
+
+ // Set up the cursor and insert the prologue
+ let entry_block = func.layout.entry_block().expect("missing entry block");
+ let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
+ insert_common_prologue(
+ &mut pos,
+ local_stack_size,
+ reg_type,
+ &csrs,
+ sp_arg_index.is_some(),
+ isa,
+ );
+
+ // Reset the cursor and insert the epilogue
+ let mut pos = pos.at_position(CursorPosition::Nowhere);
+ insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
+
+ Ok(())
+}
+
+/// Insert the prologue for a given function.
+/// This is used by common calling conventions such as System V.
+fn insert_common_prologue(
+ pos: &mut EncCursor,
+ stack_size: i64,
+ reg_type: ir::types::Type,
+ csrs: &RegisterSet,
+ has_sp_param: bool,
+ isa: &dyn TargetIsa,
+) {
+ let sp = if has_sp_param {
+ let block = pos.current_block().expect("missing block under cursor");
+ let sp = pos.func.dfg.append_block_param(block, reg_type);
+ pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
+ Some(sp)
+ } else {
+ None
+ };
+
+ // If this is a leaf function with zero stack size, then there's no need to
+ // insert a stack check since it can't overflow anything and
+ // forward progress is guaranteed so long as loops are handled anyway.
+ //
+ // If this has a stack size it could stack overflow, or if it isn't a leaf
+ // it could be part of a long call chain which we need to check anyway.
+ //
+ // First we look for the stack limit as a special argument to the function;
+ // failing that, we check whether a stack-limit global value has been provided,
+ // which is then interpreted to compute the stack limit from the arguments or
+ // perhaps constants.
+ if stack_size > 0 || !pos.func.is_leaf() {
+ let scratch = ir::ValueLoc::Reg(RU::rax as RegUnit);
+ let stack_limit_arg = match pos.func.special_param(ArgumentPurpose::StackLimit) {
+ Some(arg) => {
+ let copy = pos.ins().copy(arg);
+ pos.func.locations[copy] = scratch;
+ Some(copy)
+ }
+ None => pos
+ .func
+ .stack_limit
+ .map(|gv| interpret_gv(pos, gv, sp, scratch)),
+ };
+ if let Some(stack_limit_arg) = stack_limit_arg {
+ insert_stack_check(pos, stack_size, stack_limit_arg);
+ }
+ }
+
+ // Append param to entry block
+ let block = pos.current_block().expect("missing block under cursor");
+ let fp = pos.func.dfg.append_block_param(block, reg_type);
+ pos.func.locations[fp] = ir::ValueLoc::Reg(RU::rbp as RegUnit);
+
+ pos.ins().x86_push(fp);
+
+ let mov_sp_inst = pos
+ .ins()
+ .copy_special(RU::rsp as RegUnit, RU::rbp as RegUnit);
+
+ let mut last_csr_push = None;
+ for reg in csrs.iter(GPR) {
+ // Append param to entry block
+ let csr_arg = pos.func.dfg.append_block_param(block, reg_type);
+
+ // Assign it a location
+ pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
+ last_csr_push = Some(pos.ins().x86_push(csr_arg));
+ }
+
+ // Allocate stack frame storage.
+ let mut adjust_sp_inst = None;
+ if stack_size > 0 {
+ if isa.flags().enable_probestack() && stack_size > (1 << isa.flags().probestack_size_log2())
+ {
+ // Emit a stack probe.
+ let rax = RU::rax as RegUnit;
+ let rax_val = ir::ValueLoc::Reg(rax);
+
+ // The probestack function expects its input in %rax.
+ let arg = pos.ins().iconst(reg_type, stack_size);
+ pos.func.locations[arg] = rax_val;
+
+ // Call the probestack function.
+ let callee = get_probestack_funcref(pos.func, reg_type, rax, isa);
+
+ // Make the call.
+ let call = if !isa.flags().is_pic()
+ && isa.triple().pointer_width().unwrap() == PointerWidth::U64
+ && !pos.func.dfg.ext_funcs[callee].colocated
+ {
+ // 64-bit non-PIC non-colocated calls need to be legalized to call_indirect.
+ // Use r11 as it may be clobbered under all supported calling conventions.
+ let r11 = RU::r11 as RegUnit;
+ let sig = pos.func.dfg.ext_funcs[callee].signature;
+ let addr = pos.ins().func_addr(reg_type, callee);
+ pos.func.locations[addr] = ir::ValueLoc::Reg(r11);
+ pos.ins().call_indirect(sig, addr, &[arg])
+ } else {
+ // Otherwise just do a normal call.
+ pos.ins().call(callee, &[arg])
+ };
+
+ // If the probestack function doesn't adjust sp, do it ourselves.
+ if !isa.flags().probestack_func_adjusts_sp() {
+ let result = pos.func.dfg.inst_results(call)[0];
+ pos.func.locations[result] = rax_val;
+ adjust_sp_inst = Some(pos.ins().adjust_sp_down(result));
+ }
+ } else {
+ // Simply decrement the stack pointer.
+ adjust_sp_inst = Some(pos.ins().adjust_sp_down_imm(Imm64::new(stack_size)));
+ }
+ }
+
+ // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+ // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
+ let mut last_fpr_save = None;
+
+ for (i, reg) in csrs.iter(FPR).enumerate() {
+ // Append param to entry block
+ let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
+
+ // Since regalloc has already run, we must assign a location.
+ pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
+
+ // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+ let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ + (stack_size % types::F64X2.bytes() as i64);
+
+ last_fpr_save = Some(pos.ins().store(
+ ir::MemFlags::trusted(),
+ csr_arg,
+ sp.expect("FPR save requires SP param"),
+ (stack_size - offset) as i32,
+ ));
+ }
+
+ pos.func.prologue_end = Some(
+ last_fpr_save
+ .or(adjust_sp_inst)
+ .or(last_csr_push)
+ .unwrap_or(mov_sp_inst),
+ );
+}
+
+/// Inserts code necessary to calculate `gv`.
+///
+/// Note that this is typically done with `ins().global_value(...)` but that
+/// requires legalization to run to encode it, and we're running super late
+/// here in the backend where legalization isn't possible. To get around this
+/// we manually interpret the `gv` specified and do register allocation for
+/// intermediate values.
+///
+/// This is an incomplete implementation of loading `GlobalValue` values to get
+/// compared to the stack pointer, but currently it serves enough functionality
+/// to get this implemented in `wasmtime` itself. This'll likely get expanded a
+/// bit over time!
+fn interpret_gv(
+ pos: &mut EncCursor,
+ gv: ir::GlobalValue,
+ sp: Option<ir::Value>,
+ scratch: ir::ValueLoc,
+) -> ir::Value {
+ match pos.func.global_values[gv] {
+ ir::GlobalValueData::VMContext => {
+ let vmctx_index = pos
+ .func
+ .signature
+ .special_param_index(ir::ArgumentPurpose::VMContext)
+ .expect("no vmcontext parameter found");
+ match pos.func.signature.params[vmctx_index] {
+ AbiParam {
+ location: ArgumentLoc::Reg(_),
+ ..
+ } => {
+ let entry = pos.func.layout.entry_block().unwrap();
+ pos.func.dfg.block_params(entry)[vmctx_index]
+ }
+ AbiParam {
+ location: ArgumentLoc::Stack(offset),
+ value_type,
+ ..
+ } => {
+ let offset =
+ offset + i32::from(pos.isa.pointer_bytes() * (1 + vmctx_index as u8));
+ // The following access can be marked `trusted` because it is a load of an argument. We
+ // know it is safe because it was safe to write it in preparing this function call.
+ let ret =
+ pos.ins()
+ .load(value_type, ir::MemFlags::trusted(), sp.unwrap(), offset);
+ pos.func.locations[ret] = scratch;
+ return ret;
+ }
+ AbiParam {
+ location: ArgumentLoc::Unassigned,
+ ..
+ } => unreachable!(),
+ }
+ }
+ ir::GlobalValueData::Load {
+ base,
+ offset,
+ global_type,
+ readonly: _,
+ } => {
+ let base = interpret_gv(pos, base, sp, scratch);
+ let ret = pos
+ .ins()
+ .load(global_type, ir::MemFlags::trusted(), base, offset);
+ pos.func.locations[ret] = scratch;
+ return ret;
+ }
+ ref other => panic!("global value for stack limit not supported: {}", other),
+ }
+}
+
+/// Insert a check that generates a trap if the stack pointer goes
+/// below a value in `stack_limit_arg`.
+fn insert_stack_check(pos: &mut EncCursor, stack_size: i64, stack_limit_arg: ir::Value) {
+ use crate::ir::condcodes::IntCC;
+
+ // Our stack pointer, after subtracting `stack_size`, must not be below
+ // `stack_limit_arg`. To do this we're going to add `stack_size` to
+ // `stack_limit_arg` and see if the stack pointer is below that. The
+ // `stack_size + stack_limit_arg` computation might overflow, however, due
+ // to how stack limits may be loaded and set externally to trigger a trap.
+ //
+ // To handle this we'll need an extra comparison to see if the stack
+ // pointer is already below `stack_limit_arg`. Most of the time this
+ // isn't necessary though since the stack limit which triggers a trap is
+ // likely a sentinel somewhere around `usize::max_value()`. In that case we
+ // only conditionally emit this pre-flight check. That way most functions
+ // have only the one comparison, and are also guaranteed that adding
+ // `stack_size` to `stack_limit_arg` won't overflow.
+ //
+ // This does mean that code generators which use this stack check
+ // functionality need to ensure that values stored into the stack limit
+ // will never overflow if this threshold is added.
+ if stack_size >= 32 * 1024 {
+ let cflags = pos.ins().ifcmp_sp(stack_limit_arg);
+ pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit);
+ pos.ins().trapif(
+ IntCC::UnsignedGreaterThanOrEqual,
+ cflags,
+ ir::TrapCode::StackOverflow,
+ );
+ }
+
+ // Copy `stack_limit_arg` into %rax and use it to calculate
+ // an SP threshold.
+ let sp_threshold = pos.ins().iadd_imm(stack_limit_arg, stack_size);
+ pos.func.locations[sp_threshold] = ir::ValueLoc::Reg(RU::rax as RegUnit);
+
+ // If the stack pointer is currently at or below the SP threshold, then after
+ // allocating the current stack frame the stack pointer will reach the limit.
+ let cflags = pos.ins().ifcmp_sp(sp_threshold);
+ pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit);
+ pos.ins().trapif(
+ IntCC::UnsignedGreaterThanOrEqual,
+ cflags,
+ ir::TrapCode::StackOverflow,
+ );
+}
+
+/// Find all `return` instructions and insert epilogues before them.
+fn insert_common_epilogues(
+ pos: &mut EncCursor,
+ stack_size: i64,
+ reg_type: ir::types::Type,
+ csrs: &RegisterSet,
+ sp_arg_index: Option<usize>,
+) {
+ while let Some(block) = pos.next_block() {
+ pos.goto_last_inst(block);
+ if let Some(inst) = pos.current_inst() {
+ if pos.func.dfg[inst].opcode().is_return() {
+ insert_common_epilogue(inst, block, stack_size, pos, reg_type, csrs, sp_arg_index);
+ }
+ }
+ }
+}
+
+/// Insert an epilogue given a specific `return` instruction.
+/// This is used by common calling conventions such as System V.
+fn insert_common_epilogue(
+ inst: ir::Inst,
+ block: ir::Block,
+ stack_size: i64,
+ pos: &mut EncCursor,
+ reg_type: ir::types::Type,
+ csrs: &RegisterSet,
+ sp_arg_index: Option<usize>,
+) {
+ // Insert the pop of the frame pointer
+ let fp_pop = pos.ins().x86_pop(reg_type);
+ let fp_pop_inst = pos.prev_inst().unwrap();
+ pos.func.locations[fp_pop] = ir::ValueLoc::Reg(RU::rbp as RegUnit);
+ pos.func.dfg.append_inst_arg(inst, fp_pop);
+
+ // Insert the CSR pops
+ let mut first_csr_pop_inst = None;
+ for reg in csrs.iter(GPR) {
+ let csr_pop = pos.ins().x86_pop(reg_type);
+ first_csr_pop_inst = pos.prev_inst();
+ assert!(first_csr_pop_inst.is_some());
+ pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
+ pos.func.dfg.append_inst_arg(inst, csr_pop);
+ }
+
+ // Insert the adjustment of SP
+ let mut sp_adjust_inst = None;
+ if stack_size > 0 {
+ pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+ sp_adjust_inst = pos.prev_inst();
+ assert!(sp_adjust_inst.is_some());
+ }
+
+ let mut first_fpr_load = None;
+ if let Some(index) = sp_arg_index {
+ let sp = pos
+ .func
+ .dfg
+ .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+ // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+ for (i, reg) in csrs.iter(FPR).enumerate() {
+ // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+ let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ + (stack_size % types::F64X2.bytes() as i64);
+
+ let value = pos.ins().load(
+ types::F64X2,
+ ir::MemFlags::trusted(),
+ sp,
+ (stack_size - offset) as i32,
+ );
+
+ first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+ pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+ pos.func.dfg.append_inst_arg(inst, value);
+ }
+ } else {
+ assert!(csrs.iter(FPR).len() == 0);
+ }
+
+ pos.func.epilogues_start.push((
+ first_fpr_load
+ .or(sp_adjust_inst)
+ .or(first_csr_pop_inst)
+ .unwrap_or(fp_pop_inst),
+ block,
+ ));
+}
+
+#[cfg(feature = "unwind")]
+pub fn create_unwind_info(
+ func: &ir::Function,
+ isa: &dyn TargetIsa,
+) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+ use crate::isa::unwind::UnwindInfo;
+
+ // Assumption: RBP is being used as the frame pointer for both calling conventions
+ // In the future, we should be omitting frame pointer as an optimization, so this will change
+ Ok(match func.signature.call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
+ super::unwind::systemv::create_unwind_info(func, isa)?.map(|u| UnwindInfo::SystemV(u))
+ }
+ CallConv::WindowsFastcall => {
+ super::unwind::winx64::create_unwind_info(func, isa)?.map(|u| UnwindInfo::WindowsX64(u))
+ }
+ _ => None,
+ })
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs b/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs
new file mode 100644
index 0000000000..90ed8b7ef8
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs
@@ -0,0 +1,576 @@
+//! Emitting binary x86 machine code.
+
+use super::enc_tables::{needs_offset, needs_sib_byte};
+use super::registers::RU;
+use crate::binemit::{bad_encoding, CodeSink, Reloc};
+use crate::ir::condcodes::{CondCode, FloatCC, IntCC};
+use crate::ir::{
+ Block, Constant, ExternalName, Function, Inst, InstructionData, JumpTable, LibCall, Opcode,
+ TrapCode,
+};
+use crate::isa::{RegUnit, StackBase, StackBaseMask, StackRef, TargetIsa};
+use crate::regalloc::RegDiversions;
+use cranelift_codegen_shared::isa::x86::EncodingBits;
+
+include!(concat!(env!("OUT_DIR"), "/binemit-x86.rs"));
+
+// Convert a stack base to the corresponding register.
+fn stk_base(base: StackBase) -> RegUnit {
+ let ru = match base {
+ StackBase::SP => RU::rsp,
+ StackBase::FP => RU::rbp,
+ StackBase::Zone => unimplemented!(),
+ };
+ ru as RegUnit
+}
+
+// Mandatory prefix bytes for Mp* opcodes.
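+// (Added note) Indexed below as `PREFIX[pp - 1]`: pp=1 selects 0x66, pp=2
+// selects 0xf3, pp=3 selects 0xf2, and pp=0 means no mandatory prefix.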
+const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2];
+
+// Second byte for three-byte opcodes for mm=0b10 and mm=0b11.
+const OP3_BYTE2: [u8; 2] = [0x38, 0x3a];
+
+// A REX prefix with no bits set: 0b0100WRXB.
+const BASE_REX: u8 = 0b0100_0000;
+
+// Create a single-register REX prefix, setting the B bit to bit 3 of the register.
+// This is used for instructions that encode a register in the low 3 bits of the opcode and for
+// instructions that use the ModR/M `reg` field for something else.
+fn rex1(reg_b: RegUnit) -> u8 {
+ let b = ((reg_b >> 3) & 1) as u8;
+ BASE_REX | b
+}
+
+// Create a dual-register REX prefix, setting:
+//
+// REX.B = bit 3 of r/m register, or SIB base register when a SIB byte is present.
+// REX.R = bit 3 of reg register.
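+//
+// Illustrative example (added; register unit numbers assume the x86 `RU`
+// enumeration where %rax..%r15 map to 0..15): `rex2(RU::r9 as RegUnit,
+// RU::rax as RegUnit)` sets only REX.B, yielding 0b0100_0001 = 0x41.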
+fn rex2(rm: RegUnit, reg: RegUnit) -> u8 {
+ let b = ((rm >> 3) & 1) as u8;
+ let r = ((reg >> 3) & 1) as u8;
+ BASE_REX | b | (r << 2)
+}
+
+// Create a three-register REX prefix, setting:
+//
+// REX.B = bit 3 of r/m register, or SIB base register when a SIB byte is present.
+// REX.R = bit 3 of reg register.
+// REX.X = bit 3 of SIB index register.
+fn rex3(rm: RegUnit, reg: RegUnit, index: RegUnit) -> u8 {
+ let b = ((rm >> 3) & 1) as u8;
+ let r = ((reg >> 3) & 1) as u8;
+ let x = ((index >> 3) & 1) as u8;
+ BASE_REX | b | (x << 1) | (r << 2)
+}
+
+/// Encode the RXBR' bits of the EVEX P0 byte. For an explanation of these bits, see section 2.6.1
+/// in the Intel Software Development Manual, volume 2A. These bits can be used by different
+/// addressing modes (see section 2.6.2), requiring different `vex*` functions than this one.
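+///
+/// Note (added for clarity): unlike the REX bits, the EVEX R/X/B/R' bits are
+/// stored in inverted (ones' complement) form, which is why each register bit
+/// is negated before being packed below.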
+fn evex2(rm: RegUnit, reg: RegUnit) -> u8 {
+ let b = (!(rm >> 3) & 1) as u8;
+ let x = (!(rm >> 4) & 1) as u8;
+ let r = (!(reg >> 3) & 1) as u8;
+ let r_ = (!(reg >> 4) & 1) as u8;
+ 0x00 | r_ | (b << 1) | (x << 2) | (r << 3)
+}
+
+/// Determines whether a REX prefix should be emitted. A REX byte always has 0100 in bits 7:4; bits
+/// 3:0 correspond to WRXB. W allows certain instructions to declare a 64-bit operand size; because
+/// [needs_rex] is only used by [infer_rex] and we prevent [infer_rex] from using [w] in
+/// [Template::build], we do not need to check again whether [w] forces an inferred REX prefix--it
+/// always does and should be encoded like `.rex().w()`. The R, X, and B bits are extensions of the
+/// ModR/M or SIB fields; see section 2.2.1.2 in the Intel Software Development Manual.
+#[inline]
+fn needs_rex(rex: u8) -> bool {
+ rex != BASE_REX
+}
+
+// Emit a REX prefix.
+//
+// The R, X, and B bits are computed from registers using the functions above. The W bit is
+// extracted from `bits`.
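+//
+// For example (added note): a RexOp1 encoding with REX.W set, combined with
+// `rex2(RU::rax as RegUnit, RU::r8 as RegUnit)`, emits 0x4C (REX.W | REX.R).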
+fn rex_prefix<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(rex & 0xf8, BASE_REX);
+ let w = EncodingBits::from(bits).rex_w();
+ sink.put1(rex | (w << 3));
+}
+
+// Emit a single-byte opcode with no REX prefix.
+fn put_op1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x8f00, 0, "Invalid encoding bits for Op1*");
+ debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op1 encoding");
+ sink.put1(bits as u8);
+}
+
+// Emit a single-byte opcode with REX prefix.
+fn put_rexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0f00, 0, "Invalid encoding bits for RexOp1*");
+ rex_prefix(bits, rex, sink);
+ sink.put1(bits as u8);
+}
+
+/// Emit a single-byte opcode with inferred REX prefix.
+fn put_dynrexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0f00, 0, "Invalid encoding bits for DynRexOp1*");
+ if needs_rex(rex) {
+ rex_prefix(bits, rex, sink);
+ }
+ sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode: 0F XX
+fn put_op2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x8f00, 0x0400, "Invalid encoding bits for Op2*");
+ debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op2 encoding");
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode: 0F XX with REX prefix.
+fn put_rexop2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0f00, 0x0400, "Invalid encoding bits for RexOp2*");
+ rex_prefix(bits, rex, sink);
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+/// Emit two-byte opcode: 0F XX with inferred REX prefix.
+fn put_dynrexop2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(
+ bits & 0x0f00,
+ 0x0400,
+ "Invalid encoding bits for DynRexOp2*"
+ );
+ if needs_rex(rex) {
+ rex_prefix(bits, rex, sink);
+ }
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+// Emit single-byte opcode with mandatory prefix.
+fn put_mp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x8c00, 0, "Invalid encoding bits for Mp1*");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp1 encoding");
+ sink.put1(bits as u8);
+}
+
+// Emit single-byte opcode with mandatory prefix and REX.
+fn put_rexmp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for RexMp1*");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ rex_prefix(bits, rex, sink);
+ sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode (0F XX) with mandatory prefix.
+fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp2 encoding");
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+// Emit two-byte opcode (0F XX) with mandatory prefix and REX.
+fn put_rexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0c00, 0x0400, "Invalid encoding bits for RexMp2*");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ rex_prefix(bits, rex, sink);
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+/// Emit two-byte opcode (0F XX) with mandatory prefix and inferred REX.
+fn put_dynrexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(
+ bits & 0x0c00,
+ 0x0400,
+ "Invalid encoding bits for DynRexMp2*"
+ );
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ if needs_rex(rex) {
+ rex_prefix(bits, rex, sink);
+ }
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+}
+
+/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix.
+fn put_mp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x8800, 0x0800, "Invalid encoding bits for Mp3*");
+ debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp3 encoding");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ sink.put1(0x0f);
+ sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]);
+ sink.put1(bits as u8);
+}
+
+/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX
+fn put_rexmp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(bits & 0x0800, 0x0800, "Invalid encoding bits for RexMp3*");
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ rex_prefix(bits, rex, sink);
+ sink.put1(0x0f);
+ sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]);
+ sink.put1(bits as u8);
+}
+
+/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and an inferred REX prefix.
+fn put_dynrexmp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
+ debug_assert_eq!(
+ bits & 0x0800,
+ 0x0800,
+ "Invalid encoding bits for DynRexMp3*"
+ );
+ let enc = EncodingBits::from(bits);
+ sink.put1(PREFIX[(enc.pp() - 1) as usize]);
+ if needs_rex(rex) {
+ rex_prefix(bits, rex, sink);
+ }
+ sink.put1(0x0f);
+ sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]);
+ sink.put1(bits as u8);
+}
+
+/// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in
+/// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be
+/// used together for certain classes of instructions; i.e., special care should be taken to ensure
+/// that instructions use a correct, applicable `EvexContext`. Table 2-39 contains cases where
+/// opcodes can result in an #UD.
+#[allow(dead_code)]
+enum EvexContext {
+ RoundingRegToRegFP {
+ rc: EvexRoundingControl,
+ },
+ NoRoundingFP {
+ sae: bool,
+ length: EvexVectorLength,
+ },
+ MemoryOp {
+ broadcast: bool,
+ length: EvexVectorLength,
+ },
+ Other {
+ length: EvexVectorLength,
+ },
+}
+
+impl EvexContext {
+ /// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte.
+ fn bits(&self) -> u8 {
+ match self {
+ Self::RoundingRegToRegFP { rc } => 0b001 | rc.bits() << 1,
+ Self::NoRoundingFP { sae, length } => (*sae as u8) | length.bits() << 1,
+ Self::MemoryOp { broadcast, length } => (*broadcast as u8) | length.bits() << 1,
+ Self::Other { length } => length.bits() << 1,
+ }
+ }
+}
+
+/// The EVEX format allows choosing a vector length in the `L'` and `L` bits; see `EvexContext`.
+enum EvexVectorLength {
+ V128,
+ V256,
+ V512,
+}
+
+impl EvexVectorLength {
+ /// Encode the `L'` and `L` bits for merging with the P2 byte.
+ fn bits(&self) -> u8 {
+ match self {
+ Self::V128 => 0b00,
+ Self::V256 => 0b01,
+ Self::V512 => 0b10,
+ // 0b11 is reserved (#UD).
+ }
+ }
+}
+
+/// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`.
+enum EvexRoundingControl {
+ RNE,
+ RD,
+ RU,
+ RZ,
+}
+
+impl EvexRoundingControl {
+ /// Encode the `L'` and `L` bits for merging with the P2 byte.
+ fn bits(&self) -> u8 {
+ match self {
+ Self::RNE => 0b00,
+ Self::RD => 0b01,
+ Self::RU => 0b10,
+ Self::RZ => 0b11,
+ }
+ }
+}
+
+/// Defines the EVEX masking behavior; masking support is described in section 2.6.4 of the Intel
+/// Software Development Manual, volume 2A.
+#[allow(dead_code)]
+enum EvexMasking {
+ None,
+ Merging { k: u8 },
+ Zeroing { k: u8 },
+}
+
+impl EvexMasking {
+ /// Encode the `z` bit for merging with the P2 byte.
+ fn z_bit(&self) -> u8 {
+ match self {
+ Self::None | Self::Merging { .. } => 0,
+ Self::Zeroing { .. } => 1,
+ }
+ }
+
+ /// Encode the `aaa` bits for merging with the P2 byte.
+ fn aaa_bits(&self) -> u8 {
+ match self {
+ Self::None => 0b000,
+ Self::Merging { k } | Self::Zeroing { k } => {
+ debug_assert!(*k <= 7);
+ *k
+ }
+ }
+ }
+}
+
+/// Encode an EVEX prefix, including the instruction opcode. To match the current recipe
+/// convention, the ModR/M byte is written separately in the recipe. This EVEX encoding function
+/// only encodes the `reg` (operand 1), `vvvv` (operand 2), `rm` (operand 3) form; other forms are
+/// possible (see section 2.6.2, Intel Software Development Manual, volume 2A), requiring
+/// refactoring of this function or separate functions for each form (e.g. as for the REX prefix).
+fn put_evex<CS: CodeSink + ?Sized>(
+ bits: u16,
+ reg: RegUnit,
+ vvvvv: RegUnit,
+ rm: RegUnit,
+ context: EvexContext,
+ masking: EvexMasking,
+ sink: &mut CS,
+) {
+ let enc = EncodingBits::from(bits);
+
+ // EVEX prefix.
+ sink.put1(0x62);
+
+ debug_assert!(enc.mm() < 0b100);
+ let mut p0 = enc.mm() & 0b11;
+ p0 |= evex2(rm, reg) << 4; // bits 3:2 are always unset
+ sink.put1(p0);
+
+ let mut p1 = enc.pp() | 0b100; // bit 2 is always set
+ p1 |= (!(vvvvv as u8) & 0b1111) << 3;
+ p1 |= (enc.rex_w() & 0b1) << 7;
+ sink.put1(p1);
+
+ let mut p2 = masking.aaa_bits();
+ p2 |= (!(vvvvv as u8 >> 4) & 0b1) << 3;
+ p2 |= context.bits() << 4;
+ p2 |= masking.z_bit() << 7;
+ sink.put1(p2);
+
+ // Opcode
+ sink.put1(enc.opcode_byte());
+
+ // ModR/M byte placed in recipe
+}
+
+/// Emit a ModR/M byte for reg-reg operands.
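+///
+/// For example (added note, assuming %rax=0 and %rcx=1 in the `RU` numbering):
+/// `modrm_rr(1, 0, sink)` emits 0b11_000_001 = 0xc1, i.e. mod=11, reg=%rax,
+/// r/m=%rcx.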
+fn modrm_rr<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) {
+ let reg = reg as u8 & 7;
+ let rm = rm as u8 & 7;
+ let mut b = 0b11000000;
+ b |= reg << 3;
+ b |= rm;
+ sink.put1(b);
+}
+
+/// Emit a ModR/M byte where the reg bits are part of the opcode.
+fn modrm_r_bits<CS: CodeSink + ?Sized>(rm: RegUnit, bits: u16, sink: &mut CS) {
+ let reg = (bits >> 12) as u8 & 7;
+ let rm = rm as u8 & 7;
+ let mut b = 0b11000000;
+ b |= reg << 3;
+ b |= rm;
+ sink.put1(b);
+}
+
+/// Emit a mode 00 ModR/M byte. This is a register-indirect addressing mode with no offset.
+/// Registers %rsp and %rbp are invalid for `rm`: %rsp indicates a SIB byte, and %rbp indicates an
+/// absolute immediate 32-bit address (a RIP-relative address in 64-bit mode; see `modrm_riprel`).
+fn modrm_rm<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) {
+ let reg = reg as u8 & 7;
+ let rm = rm as u8 & 7;
+ let mut b = 0b00000000;
+ b |= reg << 3;
+ b |= rm;
+ sink.put1(b);
+}
+
+/// Emit a mode 00 ModR/M byte with a RIP-relative displacement in 64-bit mode. The effective
+/// address is calculated by adding the displacement to the 64-bit RIP of the next instruction. See
+/// section 2.2.1.6 in the Intel Software Development Manual.
+fn modrm_riprel<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+ modrm_rm(0b101, reg, sink)
+}
+
+/// Emit a mode 01 ModR/M byte. This is a register-indirect addressing mode with 8-bit
+/// displacement.
+/// Register %rsp is invalid for `rm`. It indicates the presence of a SIB byte.
+fn modrm_disp8<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) {
+ let reg = reg as u8 & 7;
+ let rm = rm as u8 & 7;
+ let mut b = 0b01000000;
+ b |= reg << 3;
+ b |= rm;
+ sink.put1(b);
+}
+
+/// Emit a mode 10 ModR/M byte. This is a register-indirect addressing mode with 32-bit
+/// displacement.
+/// Register %rsp is invalid for `rm`. It indicates the presence of a SIB byte.
+fn modrm_disp32<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) {
+ let reg = reg as u8 & 7;
+ let rm = rm as u8 & 7;
+ let mut b = 0b10000000;
+ b |= reg << 3;
+ b |= rm;
+ sink.put1(b);
+}
+
+/// Emit a mode 00 ModR/M with a 100 RM indicating a SIB byte is present.
+fn modrm_sib<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+ modrm_rm(0b100, reg, sink);
+}
+
+/// Emit a mode 01 ModR/M with a 100 RM indicating a SIB byte and 8-bit
+/// displacement are present.
+fn modrm_sib_disp8<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+ modrm_disp8(0b100, reg, sink);
+}
+
+/// Emit a mode 10 ModR/M with a 100 RM indicating a SIB byte and 32-bit
+/// displacement are present.
+fn modrm_sib_disp32<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+ modrm_disp32(0b100, reg, sink);
+}
+
+/// Emit a SIB byte with a base register and no scale+index.
+fn sib_noindex<CS: CodeSink + ?Sized>(base: RegUnit, sink: &mut CS) {
+ let base = base as u8 & 7;
+ // SIB SS_III_BBB.
+ let mut b = 0b00_100_000;
+ b |= base;
+ sink.put1(b);
+}
+
+/// Emit a SIB byte with a scale, base, and index.
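+///
+/// For example (added note, assuming %rax=0 and %rcx=1 in the `RU` numbering):
+/// `sib(2, 1, 0, sink)` encodes `[%rax + %rcx*4]` as 0b10_001_000 = 0x88.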
+fn sib<CS: CodeSink + ?Sized>(scale: u8, index: RegUnit, base: RegUnit, sink: &mut CS) {
+ // SIB SS_III_BBB.
+ debug_assert_eq!(scale & !0x03, 0, "Scale out of range");
+ let scale = scale & 3;
+ let index = index as u8 & 7;
+ let base = base as u8 & 7;
+ let b: u8 = (scale << 6) | (index << 3) | base;
+ sink.put1(b);
+}
+
+/// Get the low 4 bits of an opcode for an integer condition code.
+///
+/// Add this offset to a base opcode for:
+///
+/// ---- 0x70: Short conditional branch.
+/// 0x0f 0x80: Long conditional branch.
+/// 0x0f 0x90: SetCC.
+///
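+/// For example (added note): `Equal` maps to 0x4, so the short branch is 0x74
+/// (je), the long branch is 0x0f 0x84, and the SetCC form is 0x0f 0x94 (sete).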
+fn icc2opc(cond: IntCC) -> u16 {
+ use crate::ir::condcodes::IntCC::*;
+ match cond {
+ Overflow => 0x0,
+ NotOverflow => 0x1,
+ UnsignedLessThan => 0x2,
+ UnsignedGreaterThanOrEqual => 0x3,
+ Equal => 0x4,
+ NotEqual => 0x5,
+ UnsignedLessThanOrEqual => 0x6,
+ UnsignedGreaterThan => 0x7,
+ // 0x8 = Sign.
+ // 0x9 = !Sign.
+ // 0xa = Parity even.
+ // 0xb = Parity odd.
+ SignedLessThan => 0xc,
+ SignedGreaterThanOrEqual => 0xd,
+ SignedLessThanOrEqual => 0xe,
+ SignedGreaterThan => 0xf,
+ }
+}
+
+/// Get the low 4 bits of an opcode for a floating point condition code.
+///
+/// The ucomiss/ucomisd instructions set the FLAGS bits ZF/PF/CF like this:
+///
+/// ZPC OSA
+/// UN 111 000
+/// GT 000 000
+/// LT 001 000
+/// EQ 100 000
+///
+/// Not all floating point condition codes are supported.
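+///
+/// For example (added note): `GreaterThan` maps to 0x7 (`ja`/`seta`), which is
+/// taken only when CF=0 and ZF=0, i.e. exactly the ordered greater-than row in
+/// the table above.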
+fn fcc2opc(cond: FloatCC) -> u16 {
+ use crate::ir::condcodes::FloatCC::*;
+ match cond {
+ Ordered => 0xb, // EQ|LT|GT => *np (P=0)
+ Unordered => 0xa, // UN => *p (P=1)
+ OrderedNotEqual => 0x5, // LT|GT => *ne (Z=0),
+ UnorderedOrEqual => 0x4, // UN|EQ => *e (Z=1)
+ GreaterThan => 0x7, // GT => *a (C=0&Z=0)
+ GreaterThanOrEqual => 0x3, // GT|EQ => *ae (C=0)
+ UnorderedOrLessThan => 0x2, // UN|LT => *b (C=1)
+ UnorderedOrLessThanOrEqual => 0x6, // UN|LT|EQ => *be (Z=1|C=1)
+ Equal | // EQ
+ NotEqual | // UN|LT|GT
+ LessThan | // LT
+ LessThanOrEqual | // LT|EQ
+ UnorderedOrGreaterThan | // UN|GT
+ UnorderedOrGreaterThanOrEqual // UN|GT|EQ
+ => panic!("{} not supported", cond),
+ }
+}
+
+/// Emit a single-byte branch displacement to `destination`.
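+///
+/// (Added note) The `+ 1` below, like the `+ 4` in `disp4`, accounts for the
+/// displacement being relative to the end of the displacement field, i.e. the
+/// address of the next instruction.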
+fn disp1<CS: CodeSink + ?Sized>(destination: Block, func: &Function, sink: &mut CS) {
+ let delta = func.offsets[destination].wrapping_sub(sink.offset() + 1);
+ sink.put1(delta as u8);
+}
+
+/// Emit a four-byte branch displacement to `destination`.
+fn disp4<CS: CodeSink + ?Sized>(destination: Block, func: &Function, sink: &mut CS) {
+ let delta = func.offsets[destination].wrapping_sub(sink.offset() + 4);
+ sink.put4(delta);
+}
+
+/// Emit a four-byte displacement to jump table `jt`.
+fn jt_disp4<CS: CodeSink + ?Sized>(jt: JumpTable, func: &Function, sink: &mut CS) {
+ let delta = func.jt_offsets[jt].wrapping_sub(sink.offset() + 4);
+ sink.put4(delta);
+ sink.reloc_jt(Reloc::X86PCRelRodata4, jt);
+}
+
+/// Emit a four-byte displacement to `constant`.
+fn const_disp4<CS: CodeSink + ?Sized>(constant: Constant, func: &Function, sink: &mut CS) {
+ let offset = func.dfg.constants.get_offset(constant);
+ let delta = offset.wrapping_sub(sink.offset() + 4);
+ sink.put4(delta);
+ sink.reloc_constant(Reloc::X86PCRelRodata4, offset);
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs
new file mode 100644
index 0000000000..976f1581e3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs
@@ -0,0 +1,1922 @@
+//! Encoding tables for x86 ISAs.
+
+use super::registers::*;
+use crate::bitset::BitSet;
+use crate::cursor::{Cursor, FuncCursor};
+use crate::flowgraph::ControlFlowGraph;
+use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::types::*;
+use crate::ir::{self, Function, Inst, InstBuilder, MemFlags};
+use crate::isa::constraints::*;
+use crate::isa::enc_tables::*;
+use crate::isa::encoding::base_size;
+use crate::isa::encoding::{Encoding, RecipeSizing};
+use crate::isa::RegUnit;
+use crate::isa::{self, TargetIsa};
+use crate::legalizer::expand_as_libcall;
+use crate::predicates;
+use crate::regalloc::RegDiversions;
+
+include!(concat!(env!("OUT_DIR"), "/encoding-x86.rs"));
+include!(concat!(env!("OUT_DIR"), "/legalize-x86.rs"));
+
+/// Whether the REX prefix is needed for encoding extended registers (via REX.RXB).
+///
+/// Normal x86 instructions have only 3 bits for encoding a register.
+/// The REX prefix adds the REX.R, REX.X, and REX.B bits, which supply a fourth bit for each register field.
+pub fn is_extended_reg(reg: RegUnit) -> bool {
+ // Extended registers have the fourth bit set.
+ reg as u8 & 0b1000 != 0
+}
+
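+// (Added note) In the ModR/M r/m field the low three bits of %rsp/%r12 (0b100)
+// select a SIB byte rather than a base register, and %rbp/%r13 (0b101) in mode
+// 00 select RIP-relative/disp32 addressing, so addressing them directly needs
+// an extra SIB byte or a zero displacement byte, respectively.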
+pub fn needs_sib_byte(reg: RegUnit) -> bool {
+ reg == RU::r12 as RegUnit || reg == RU::rsp as RegUnit
+}
+pub fn needs_offset(reg: RegUnit) -> bool {
+ reg == RU::r13 as RegUnit || reg == RU::rbp as RegUnit
+}
+pub fn needs_sib_byte_or_offset(reg: RegUnit) -> bool {
+ needs_sib_byte(reg) || needs_offset(reg)
+}
+
+fn test_input(
+ op_index: usize,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+ condition_func: fn(RegUnit) -> bool,
+) -> bool {
+ let in_reg = divert.reg(func.dfg.inst_args(inst)[op_index], &func.locations);
+ condition_func(in_reg)
+}
+
+fn test_result(
+ result_index: usize,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+ condition_func: fn(RegUnit) -> bool,
+) -> bool {
+ let out_reg = divert.reg(func.dfg.inst_results(inst)[result_index], &func.locations);
+ condition_func(out_reg)
+}
+
+fn size_plus_maybe_offset_for_inreg_0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_offset = test_input(0, inst, divert, func, needs_offset);
+ sizing.base_size + if needs_offset { 1 } else { 0 }
+}
+fn size_plus_maybe_offset_for_inreg_1(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_offset = test_input(1, inst, divert, func, needs_offset);
+ sizing.base_size + if needs_offset { 1 } else { 0 }
+}
+fn size_plus_maybe_sib_for_inreg_0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_sib = test_input(0, inst, divert, func, needs_sib_byte);
+ sizing.base_size + if needs_sib { 1 } else { 0 }
+}
+fn size_plus_maybe_sib_for_inreg_1(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_sib = test_input(1, inst, divert, func, needs_sib_byte);
+ sizing.base_size + if needs_sib { 1 } else { 0 }
+}
+fn size_plus_maybe_sib_or_offset_for_inreg_0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_sib_or_offset = test_input(0, inst, divert, func, needs_sib_byte_or_offset);
+ sizing.base_size + if needs_sib_or_offset { 1 } else { 0 }
+}
+fn size_plus_maybe_sib_or_offset_for_inreg_1(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ let needs_sib_or_offset = test_input(1, inst, divert, func, needs_sib_byte_or_offset);
+ sizing.base_size + if needs_sib_or_offset { 1 } else { 0 }
+}
+
+/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1)
+/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB or offset.
+fn size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1(
+ sizing: &RecipeSizing,
+ enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_input(1, inst, divert, func, is_extended_reg);
+ size_plus_maybe_sib_or_offset_for_inreg_1(sizing, enc, inst, divert, func)
+ + if needs_rex { 1 } else { 0 }
+}
+
+/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1)
+/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB.
+fn size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1(
+ sizing: &RecipeSizing,
+ enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_input(1, inst, divert, func, is_extended_reg);
+ size_plus_maybe_sib_for_inreg_1(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 }
+}
+
+/// Calculates the size while inferring if the first input register (inreg0) and first output
+/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a
+/// SIB or offset.
+fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
+ sizing: &RecipeSizing,
+ enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_result(0, inst, divert, func, is_extended_reg);
+ size_plus_maybe_sib_or_offset_for_inreg_0(sizing, enc, inst, divert, func)
+ + if needs_rex { 1 } else { 0 }
+}
+
+/// Calculates the size while inferring if the first input register (inreg0) and first output
+/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a
+/// SIB.
+fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
+ sizing: &RecipeSizing,
+ enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_result(0, inst, divert, func, is_extended_reg);
+ size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, for use with one input reg.
+///
+/// A REX prefix is known to be emitted if either:
+/// 1. The EncodingBits specify that REX.W is to be set.
+/// 2. Registers are used that require REX.R or REX.B bits for encoding.
+fn size_with_inferred_rex_for_inreg0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, based on the second operand.
+fn size_with_inferred_rex_for_inreg1(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(1, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, based on the third operand.
+fn size_with_inferred_rex_for_inreg2(
+ sizing: &RecipeSizing,
+ _: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(2, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, for use with two input registers.
+///
+/// A REX prefix is known to be emitted if either:
+/// 1. The EncodingBits specify that REX.W is to be set.
+/// 2. Registers are used that require REX.R or REX.B bits for encoding.
+fn size_with_inferred_rex_for_inreg0_inreg1(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_input(1, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, based on second and third operand.
+fn size_with_inferred_rex_for_inreg1_inreg2(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
+ || test_input(2, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, based on a single
+/// input register and a single output register.
+fn size_with_inferred_rex_for_inreg0_outreg0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
+ || test_result(0, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, based on a single output register.
+fn size_with_inferred_rex_for_outreg0(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_result(0, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// Infers whether a dynamic REX prefix will be emitted, for use with CMOV.
+///
+/// CMOV uses 3 inputs, with the REX prefix inferred from reg1 and reg2.
+fn size_with_inferred_rex_for_cmov(
+ sizing: &RecipeSizing,
+ _enc: Encoding,
+ inst: Inst,
+ divert: &RegDiversions,
+ func: &Function,
+) -> u8 {
+ // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+ let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
+ || test_input(2, inst, divert, func, is_extended_reg);
+ sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
+/// If the value's definition is a constant immediate, returns its unpacked value, or None
+/// otherwise.
+fn maybe_iconst_imm(pos: &FuncCursor, value: ir::Value) -> Option<i64> {
+ if let ir::ValueDef::Result(inst, _) = &pos.func.dfg.value_def(value) {
+ if let ir::InstructionData::UnaryImm {
+ opcode: ir::Opcode::Iconst,
+ imm,
+ } = &pos.func.dfg[*inst]
+ {
+ let value: i64 = (*imm).into();
+ Some(value)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
+}
+
+/// Expand the `sdiv` and `srem` instructions using `x86_sdivmodx`.
+fn expand_sdivrem(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let (x, y, is_srem) = match func.dfg[inst] {
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Sdiv,
+ args,
+ } => (args[0], args[1], false),
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Srem,
+ args,
+ } => (args[0], args[1], true),
+ _ => panic!("Need sdiv/srem: {}", func.dfg.display_inst(inst, None)),
+ };
+
+ let old_block = func.layout.pp_block(inst);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+ pos.func.dfg.clear_results(inst);
+
+ let avoid_div_traps = isa.flags().avoid_div_traps();
+
+ // If we can tolerate native division traps, sdiv doesn't need branching.
+ if !avoid_div_traps && !is_srem {
+ let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
+ pos.ins().with_result(result).x86_sdivmodx(x, xhi, y);
+ pos.remove_inst();
+ return;
+ }
+
+ // Try to remove checks if the input value is an immediate other than 0 or -1. For these two
+ // immediates, we'd ideally replace conditional traps by traps, but this requires more
+ // manipulation of the dfg/cfg, which is out of scope here.
+ let (could_be_zero, could_be_minus_one) = if let Some(imm) = maybe_iconst_imm(&pos, y) {
+ (imm == 0, imm == -1)
+ } else {
+ (true, true)
+ };
+
+ // Put in an explicit division-by-zero trap if the environment requires it.
+ if avoid_div_traps && could_be_zero {
+ pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero);
+ }
+
+ if !could_be_minus_one {
+ let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
+ let reuse = if is_srem {
+ [None, Some(result)]
+ } else {
+ [Some(result), None]
+ };
+ pos.ins().with_results(reuse).x86_sdivmodx(x, xhi, y);
+ pos.remove_inst();
+ return;
+ }
+
+ // block handling the nominal case.
+ let nominal = pos.func.dfg.make_block();
+
+ // block handling the -1 divisor case.
+ let minus_one = pos.func.dfg.make_block();
+
+ // Final block with one argument representing the final result value.
+ let done = pos.func.dfg.make_block();
+
+ // Move the `inst` result value onto the `done` block.
+ pos.func.dfg.attach_block_param(done, result);
+
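+ // (Added note) The control flow constructed below is:
+ //   entry:     if y == -1 goto minus_one else goto nominal
+ //   nominal:   x86_sdivmodx (still traps on division by zero), jump done
+ //   minus_one: srem -> result 0; sdiv -> trap on INT_MIN overflow, else -x
+ //   done:      a single block parameter carries the result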
+ // Start by checking for a -1 divisor which needs to be handled specially.
+ let is_m1 = pos.ins().ifcmp_imm(y, -1);
+ pos.ins().brif(IntCC::Equal, is_m1, minus_one, &[]);
+ pos.ins().jump(nominal, &[]);
+
+ // Now it is safe to execute the `x86_sdivmodx` instruction which will still trap on division
+ // by zero.
+ pos.insert_block(nominal);
+ let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
+ let (quot, rem) = pos.ins().x86_sdivmodx(x, xhi, y);
+ let divres = if is_srem { rem } else { quot };
+ pos.ins().jump(done, &[divres]);
+
+ // Now deal with the -1 divisor case.
+ pos.insert_block(minus_one);
+ let m1_result = if is_srem {
+ // x % -1 = 0.
+ pos.ins().iconst(ty, 0)
+ } else {
+ // Explicitly check for overflow: Trap when x == INT_MIN.
+ debug_assert!(avoid_div_traps, "Native trapping divide handled above");
+ let f = pos.ins().ifcmp_imm(x, -1 << (ty.lane_bits() - 1));
+ pos.ins()
+ .trapif(IntCC::Equal, f, ir::TrapCode::IntegerOverflow);
+ // x / -1 = -x.
+ pos.ins().irsub_imm(x, 0)
+ };
+
+ // Recycle the original instruction as a jump.
+ pos.func.dfg.replace(inst).jump(done, &[m1_result]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, nominal);
+ cfg.recompute_block(pos.func, minus_one);
+ cfg.recompute_block(pos.func, done);
+}
+
+/// Expand the `udiv` and `urem` instructions using `x86_udivmodx`.
+fn expand_udivrem(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let (x, y, is_urem) = match func.dfg[inst] {
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Udiv,
+ args,
+ } => (args[0], args[1], false),
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Urem,
+ args,
+ } => (args[0], args[1], true),
+ _ => panic!("Need udiv/urem: {}", func.dfg.display_inst(inst, None)),
+ };
+ let avoid_div_traps = isa.flags().avoid_div_traps();
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+ pos.func.dfg.clear_results(inst);
+
+ // Put in an explicit division-by-zero trap if the environment requires it.
+ if avoid_div_traps {
+ let zero_check = if let Some(imm) = maybe_iconst_imm(&pos, y) {
+ // Ideally, we'd just replace the conditional trap with a trap when the immediate is
+ // zero, but this requires more manipulation of the dfg/cfg, which is out of scope
+ // here.
+ imm == 0
+ } else {
+ true
+ };
+ if zero_check {
+ pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero);
+ }
+ }
+
+ // Now it is safe to execute the `x86_udivmodx` instruction.
+ let xhi = pos.ins().iconst(ty, 0);
+ let reuse = if is_urem {
+ [None, Some(result)]
+ } else {
+ [Some(result), None]
+ };
+ pos.ins().with_results(reuse).x86_udivmodx(x, xhi, y);
+ pos.remove_inst();
+}
+
+/// Expand the `fmin` and `fmax` instructions using the x86 `x86_fmin` and `x86_fmax`
+/// instructions.
+fn expand_minmax(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] {
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Fmin,
+ args,
+ } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor),
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Fmax,
+ args,
+ } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band),
+ _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
+ };
+ let old_block = func.layout.pp_block(inst);
+
+ // We need to handle the following conditions, depending on how x and y compare:
+ //
+ // 1. LT or GT: The native `x86_opc` min/max instruction does what we need.
+ // 2. EQ: We need to use `bitwise_opc` to make sure that
+ // fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0.
+ // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical.
+
+ // block handling case 1) where operands are ordered but not equal.
+ let one_block = func.dfg.make_block();
+
+ // block handling case 3) where one operand is NaN.
+ let uno_block = func.dfg.make_block();
+
+ // block that handles the unordered or equal cases 2) and 3).
+ let ueq_block = func.dfg.make_block();
+
+ // block handling case 2) where operands are ordered and equal.
+ let eq_block = func.dfg.make_block();
+
+ // Final block with one argument representing the final result value.
+ let done = func.dfg.make_block();
+
+ // The basic blocks are laid out to minimize branching for the common cases:
+ //
+ // 1) One branch not taken, one jump.
+ // 2) One branch taken.
+ // 3) Two branches taken, one jump.
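+ //
+ // (Added note) Concretely, the control flow constructed below is:
+ //   entry:     if UEQ(x, y) goto ueq_block else goto one_block
+ //   one_block: x86 min/max, jump done
+ //   ueq_block: if UN(x, y) goto uno_block else goto eq_block
+ //   uno_block: fadd (propagates the NaN), jump done
+ //   eq_block:  bor/band to fix the sign of zero, jump done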
+
+ // Move the `inst` result value onto the `done` block.
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+ func.dfg.clear_results(inst);
+ func.dfg.attach_block_param(done, result);
+
+ // Test for case 1) ordered and not equal.
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+ let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y);
+ pos.ins().brnz(cmp_ueq, ueq_block, &[]);
+ pos.ins().jump(one_block, &[]);
+
+ // Handle the common ordered, not equal (LT|GT) case.
+ pos.insert_block(one_block);
+ let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0;
+ let one_result = pos.func.dfg.first_result(one_inst);
+ pos.ins().jump(done, &[one_result]);
+
+ // Case 3) Unordered.
+ // We know that at least one operand is a NaN that needs to be propagated. We simply use an
+ // `fadd` instruction which has the same NaN propagation semantics.
+ pos.insert_block(uno_block);
+ let uno_result = pos.ins().fadd(x, y);
+ pos.ins().jump(done, &[uno_result]);
+
+ // Case 2) or 3).
+ pos.insert_block(ueq_block);
+ // Test for case 3) (UN) one value is NaN.
+ // TODO: When we get support for flag values, we can reuse the above comparison.
+ let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y);
+ pos.ins().brnz(cmp_uno, uno_block, &[]);
+ pos.ins().jump(eq_block, &[]);
+
+ // We are now in case 2) where x and y compare EQ.
+ // We need a bitwise operation to get the sign right.
+ pos.insert_block(eq_block);
+ let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0;
+ let bw_result = pos.func.dfg.first_result(bw_inst);
+ // This should become a fall-through for this second most common case.
+ // Recycle the original instruction as a jump.
+ pos.func.dfg.replace(inst).jump(done, &[bw_result]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, one_block);
+ cfg.recompute_block(pos.func, uno_block);
+ cfg.recompute_block(pos.func, ueq_block);
+ cfg.recompute_block(pos.func, eq_block);
+ cfg.recompute_block(pos.func, done);
+}
+
+/// This legalization converts a minimum/maximum operation into a sequence that matches the
+/// non-x86-friendly WebAssembly semantics of NaN handling. This logic is kept separate from
+/// [expand_minmax] above (the scalar version) for code clarity.
+fn expand_minmax_vector(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let ty = func.dfg.ctrl_typevar(inst);
+ debug_assert!(ty.is_vector());
+ let (x, y, x86_opcode, is_max) = match func.dfg[inst] {
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Fmin,
+ args,
+ } => (args[0], args[1], ir::Opcode::X86Fmin, false),
+ ir::InstructionData::Binary {
+ opcode: ir::Opcode::Fmax,
+ args,
+ } => (args[0], args[1], ir::Opcode::X86Fmax, true),
+ _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
+ };
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ // This sequence is complex due to how x86 handles NaNs and +0/-0. If x86 finds a NaN in
+ // either lane it returns the second operand; likewise, if both operands are in {+0.0, -0.0}
+ // it returns the second operand. To match the behavior of "return the minimum of the
+ // operands or a canonical NaN if either operand is NaN," we must compare in both
+ // directions.
+ let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y);
+ let forward = dfg.first_result(forward_inst);
+ let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x);
+ let backward = dfg.first_result(backward_inst);
+
+ let (value, mask) = if is_max {
+ // For maximum:
+ // Find any differences between the forward and backward `max` operation.
+ let difference = pos.ins().bxor(forward, backward);
+ // Merge in the differences.
+ let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference);
+ let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference);
+ // Discover which lanes have NaNs in them.
+ let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value);
+ (value, find_nan_lanes_mask)
+ } else {
+ // For minimum:
+ // If either lane is a NaN, we want to use these bits, not the second operand bits.
+ let propagate_nans = pos.ins().bor(backward, forward);
+ // Find which lanes contain a NaN with an unordered comparison, filling the mask with
+ // 1s.
+ let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans);
+ let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask);
+ // Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs
+ // along this code path to be quieted and negative: after the upcoming shift and and_not,
+ // all upper bits (sign, exponent, and payload MSB) will be 1s.
+ let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask);
+ (tmp, bitcast_find_nan_lanes_mask)
+ };
+
+ // During this lowering we will need to know how many bits to shift by and what type to
+ // convert to when using an integer shift. Recall that an IEEE754 number looks like:
+ // `[sign bit] [exponent bits] [significand bits]`
+ // A quiet NaN has all exponent bits set to 1 and the most significant bit of the
+ // significand set to 1; a signaling NaN has the same exponent but the MSB of the
+ // significand is set to 0. The payload of the NaN is the remaining significand bits, and
+ // WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this
+ // canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB
+ // sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all
+ // NaNs produced with the mask will be negative (`-NaN`) which is allowed by the sign
+ // non-determinism in the spec: https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0
+ let (shift_by, ty_as_int) = match ty {
+ F32X4 => (10, I32X4),
+ F64X2 => (13, I64X2),
+ _ => unimplemented!("this legalization only understands 128-bit floating point types"),
+ };
+
+ // In order to clear the NaN payload for canonical NaNs, we shift right the NaN lanes (all
+ // 1s) leaving 0s in the top bits. Remember that non-NaN lanes are all 0s so this has
+ // little effect.
+ let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask);
+ let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by);
+ let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask);
+
+ // Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes, this is
+ // equivalent to `... & 1111...` but for NaN lanes this will only have 1s in the top bits,
+ // clearing the payload.
+ pos.func
+ .dfg
+ .replace(inst)
+ .band_not(value, shift_mask_as_float);
+}
+
+/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
+/// i64 with a pattern; the rest needs more code.
+///
+/// Note that this is the scalar implementation; for the vector implementation see
+/// [expand_fcvt_from_uint_vector].
+fn expand_fcvt_from_uint(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let x;
+ match func.dfg[inst] {
+ ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtFromUint,
+ arg,
+ } => x = arg,
+ _ => panic!("Need fcvt_from_uint: {}", func.dfg.display_inst(inst, None)),
+ }
+ let xty = func.dfg.value_type(x);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ // Conversion from an unsigned int smaller than 64bit is easy on x86-64.
+ match xty {
+ ir::types::I8 | ir::types::I16 | ir::types::I32 => {
+ // TODO: This should be guarded by an ISA check.
+ let wide = pos.ins().uextend(ir::types::I64, x);
+ pos.func.dfg.replace(inst).fcvt_from_sint(ty, wide);
+ return;
+ }
+ ir::types::I64 => {}
+ _ => unimplemented!(),
+ }
+
+ let old_block = pos.func.layout.pp_block(inst);
+
+ // block handling the case where x >= 0.
+ let poszero_block = pos.func.dfg.make_block();
+
+ // block handling the case where x < 0.
+ let neg_block = pos.func.dfg.make_block();
+
+ // Final block with one argument representing the final result value.
+ let done = pos.func.dfg.make_block();
+
+ // Move the `inst` result value onto the `done` block.
+ pos.func.dfg.clear_results(inst);
+ pos.func.dfg.attach_block_param(done, result);
+
+ // If x as a signed int is not negative, we can use the existing `fcvt_from_sint` instruction.
+ let is_neg = pos.ins().icmp_imm(IntCC::SignedLessThan, x, 0);
+ pos.ins().brnz(is_neg, neg_block, &[]);
+ pos.ins().jump(poszero_block, &[]);
+
+ // Easy case: just use a signed conversion.
+ pos.insert_block(poszero_block);
+ let posres = pos.ins().fcvt_from_sint(ty, x);
+ pos.ins().jump(done, &[posres]);
+
+ // Now handle the negative case.
+ pos.insert_block(neg_block);
+
+ // Divide x by two to get it in range for the signed conversion, keep the LSB, and scale it
+ // back up on the FP side.
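+ // (Added note) Or-ing the LSB back in acts as a sticky bit, so the final
+ // doubling rounds the same way a direct 64-bit unsigned conversion would;
+ // e.g. x = u64::MAX halves to 2^63 - 1 (LSB already set), converts to 2^63
+ // as an f64, and doubles to 2^64, the correctly rounded result.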
+ let ihalf = pos.ins().ushr_imm(x, 1);
+ let lsb = pos.ins().band_imm(x, 1);
+ let ifinal = pos.ins().bor(ihalf, lsb);
+ let fhalf = pos.ins().fcvt_from_sint(ty, ifinal);
+ let negres = pos.ins().fadd(fhalf, fhalf);
+
+ // Recycle the original instruction as a jump.
+ pos.func.dfg.replace(inst).jump(done, &[negres]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, poszero_block);
+ cfg.recompute_block(pos.func, neg_block);
+ cfg.recompute_block(pos.func, done);
+}
+
+/// To convert packed unsigned integers to their float equivalents, we must legalize to a special
+/// AVX512 instruction (using MCSR rounding) or use a long sequence of instructions. This logic is
+/// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform
+/// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be
+/// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
+fn expand_fcvt_from_uint_vector(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtFromUint,
+ arg,
+ } = pos.func.dfg[inst]
+ {
+ let controlling_type = pos.func.dfg.ctrl_typevar(inst);
+ if controlling_type == F32X4 {
+ debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4);
+ let x86_isa = isa
+ .as_any()
+ .downcast_ref::<isa::x86::Isa>()
+ .expect("the target ISA must be x86 at this point");
+ if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() {
+ // If we have certain AVX512 features, we can lower this instruction simply.
+ pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg);
+ } else {
+ // Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR,
+ // PBLENDW, PSUB, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS
+ let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg);
+ let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into());
+ let zero = pos.ins().vconst(I16X8, zero_constant);
+ let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55);
+ let bitcast_low = pos.ins().raw_bitcast(I32X4, low);
+ let high = pos.ins().isub(arg, bitcast_low);
+ let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low);
+ let shift_high = pos.ins().ushr_imm(high, 1);
+ let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high);
+ let double_high = pos.ins().fadd(convert_high, convert_high);
+ pos.func.dfg.replace(inst).fadd(double_high, convert_low);
+ }
+ } else {
+ unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
+ }
+ }
+}
+
+fn expand_fcvt_to_sint(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ use crate::ir::immediates::{Ieee32, Ieee64};
+
+ let x = match func.dfg[inst] {
+ ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToSint,
+ arg,
+ } => arg,
+ _ => panic!("Need fcvt_to_sint: {}", func.dfg.display_inst(inst, None)),
+ };
+ let old_block = func.layout.pp_block(inst);
+ let xty = func.dfg.value_type(x);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ // Final block after the bad value checks.
+ let done = func.dfg.make_block();
+
+ // block for checking failure cases.
+ let maybe_trap_block = func.dfg.make_block();
+
+ // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or overflow.
+ // It produces an INT_MIN result instead.
+ func.dfg.replace(inst).x86_cvtt2si(ty, x);
+
+ let mut pos = FuncCursor::new(func).after_inst(inst);
+ pos.use_srcloc(inst);
+
+ let is_done = pos
+ .ins()
+ .icmp_imm(IntCC::NotEqual, result, 1 << (ty.lane_bits() - 1));
+ pos.ins().brnz(is_done, done, &[]);
+ pos.ins().jump(maybe_trap_block, &[]);
+
+ // We now have the following possibilities:
+ //
+ // 1. INT_MIN was actually the correct conversion result.
+ // 2. The input was NaN -> trap bad_toint
+ // 3. The input was out of range -> trap int_ovf
+ //
+ pos.insert_block(maybe_trap_block);
+
+ // Check for NaN.
+ let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x);
+ pos.ins()
+ .trapnz(is_nan, ir::TrapCode::BadConversionToInteger);
+
+ // Check for case 1: INT_MIN is the correct result.
+ // Determine the smallest floating point number that would convert to INT_MIN.
+ let mut overflow_cc = FloatCC::LessThan;
+ let output_bits = ty.lane_bits();
+ let flimit = match xty {
+ ir::types::F32 =>
+ // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so
+ // there are values less than -2^(N-1) that convert correctly to INT_MIN.
+ {
+ pos.ins().f32const(if output_bits < 32 {
+ overflow_cc = FloatCC::LessThanOrEqual;
+ Ieee32::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee32::pow2(output_bits - 1).neg()
+ })
+ }
+ ir::types::F64 =>
+ // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so
+ // there are values less than -2^(N-1) that convert correctly to INT_MIN.
+ {
+ pos.ins().f64const(if output_bits < 64 {
+ overflow_cc = FloatCC::LessThanOrEqual;
+ Ieee64::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee64::pow2(output_bits - 1).neg()
+ })
+ }
+ _ => panic!("Can't convert {}", xty),
+ };
+ let overflow = pos.ins().fcmp(overflow_cc, x, flimit);
+ pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow);
+
+ // Finally, we could have a positive value that is too large.
+ let fzero = match xty {
+ ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)),
+ ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)),
+ _ => panic!("Can't convert {}", xty),
+ };
+ let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero);
+ pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow);
+
+ pos.ins().jump(done, &[]);
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, maybe_trap_block);
+ cfg.recompute_block(pos.func, done);
+}
+
+fn expand_fcvt_to_sint_sat(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ use crate::ir::immediates::{Ieee32, Ieee64};
+
+ let x = match func.dfg[inst] {
+ ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToSintSat,
+ arg,
+ } => arg,
+ _ => panic!(
+ "Need fcvt_to_sint_sat: {}",
+ func.dfg.display_inst(inst, None)
+ ),
+ };
+
+ let old_block = func.layout.pp_block(inst);
+ let xty = func.dfg.value_type(x);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ // Final block after the bad value checks.
+ let done_block = func.dfg.make_block();
+ let intmin_block = func.dfg.make_block();
+ let minsat_block = func.dfg.make_block();
+ let maxsat_block = func.dfg.make_block();
+ func.dfg.clear_results(inst);
+ func.dfg.attach_block_param(done_block, result);
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or
+ // overflow. It produces an INT_MIN result instead.
+ let cvtt2si = pos.ins().x86_cvtt2si(ty, x);
+
+ let is_done = pos
+ .ins()
+ .icmp_imm(IntCC::NotEqual, cvtt2si, 1 << (ty.lane_bits() - 1));
+ pos.ins().brnz(is_done, done_block, &[cvtt2si]);
+ pos.ins().jump(intmin_block, &[]);
+
+ // We now have the following possibilities:
+ //
+ // 1. INT_MIN was actually the correct conversion result.
+ // 2. The input was NaN -> replace the result value with 0.
+ // 3. The input was out of range -> saturate the result to the min/max value.
+ pos.insert_block(intmin_block);
+
+ // Check for NaN, which is truncated to 0.
+ let zero = pos.ins().iconst(ty, 0);
+ let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x);
+ pos.ins().brnz(is_nan, done_block, &[zero]);
+ pos.ins().jump(minsat_block, &[]);
+
+ // Check for case 1: INT_MIN is the correct result.
+ // Determine the smallest floating point number that would convert to INT_MIN.
+ pos.insert_block(minsat_block);
+ let mut overflow_cc = FloatCC::LessThan;
+ let output_bits = ty.lane_bits();
+ let flimit = match xty {
+ ir::types::F32 =>
+ // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so
+ // there are values less than -2^(N-1) that convert correctly to INT_MIN.
+ {
+ pos.ins().f32const(if output_bits < 32 {
+ overflow_cc = FloatCC::LessThanOrEqual;
+ Ieee32::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee32::pow2(output_bits - 1).neg()
+ })
+ }
+ ir::types::F64 =>
+ // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so
+ // there are values less than -2^(N-1) that convert correctly to INT_MIN.
+ {
+ pos.ins().f64const(if output_bits < 64 {
+ overflow_cc = FloatCC::LessThanOrEqual;
+ Ieee64::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee64::pow2(output_bits - 1).neg()
+ })
+ }
+ _ => panic!("Can't convert {}", xty),
+ };
+
+ let overflow = pos.ins().fcmp(overflow_cc, x, flimit);
+ let min_imm = match ty {
+ ir::types::I32 => i32::min_value() as i64,
+ ir::types::I64 => i64::min_value(),
+ _ => panic!("Don't know the min value for {}", ty),
+ };
+ let min_value = pos.ins().iconst(ty, min_imm);
+ pos.ins().brnz(overflow, done_block, &[min_value]);
+ pos.ins().jump(maxsat_block, &[]);
+
+ // Finally, we could have a positive value that is too large.
+ pos.insert_block(maxsat_block);
+ let fzero = match xty {
+ ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)),
+ ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)),
+ _ => panic!("Can't convert {}", xty),
+ };
+
+ let max_imm = match ty {
+ ir::types::I32 => i32::max_value() as i64,
+ ir::types::I64 => i64::max_value(),
+ _ => panic!("Don't know the max value for {}", ty),
+ };
+ let max_value = pos.ins().iconst(ty, max_imm);
+
+ let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero);
+ pos.ins().brnz(overflow, done_block, &[max_value]);
+
+ // Recycle the original instruction.
+ pos.func.dfg.replace(inst).jump(done_block, &[cvtt2si]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done_block);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, intmin_block);
+ cfg.recompute_block(pos.func, minsat_block);
+ cfg.recompute_block(pos.func, maxsat_block);
+ cfg.recompute_block(pos.func, done_block);
+}
+
+/// This legalization converts a vector of 32-bit floating point lanes to signed integer lanes
+/// using CVTTPS2DQ (see encoding of `x86_cvtt2si`). This logic is separate from [expand_fcvt_to_sint_sat]
+/// above (the scalar version) only because of how the transform groups are set up; TODO: if we
+/// change the SIMD legalization groups, this logic could be merged into [expand_fcvt_to_sint_sat]
+/// (see https://github.com/bytecodealliance/wasmtime/issues/1745).
+fn expand_fcvt_to_sint_sat_vector(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToSintSat,
+ arg,
+ } = pos.func.dfg[inst]
+ {
+ let controlling_type = pos.func.dfg.ctrl_typevar(inst);
+ if controlling_type == I32X4 {
+ debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
+ // We must both quiet any NaNs--setting that lane to 0--and saturate any
+ // lanes that might overflow during conversion to the highest/lowest signed integer
+ // allowed in that lane.
+
+ // Zero out NaN lanes: `fcmp eq` does not match a lane containing a NaN, so ANDing the
+ // comparison mask with the input clears those lanes; we also reuse the mask below to find
+ // differences, avoiding a second comparison.
+ let zeroed_nans = pos.ins().fcmp(FloatCC::Equal, arg, arg);
+ let zeroed_nans_bitcast = pos.ins().raw_bitcast(F32X4, zeroed_nans);
+ let zeroed_nans_copy = pos.ins().band(arg, zeroed_nans_bitcast);
+
+ // Find differences with the zeroed lanes (we will only use the MSB: 1 if positive or
+ // NaN, 0 otherwise).
+ let differences = pos.ins().bxor(zeroed_nans_bitcast, arg);
+ let differences_bitcast = pos.ins().raw_bitcast(I32X4, differences);
+
+ // Convert the numeric lanes. CVTTPS2DQ will mark overflows with 0x80000000 (MSB set).
+ let converted = pos.ins().x86_cvtt2si(I32X4, zeroed_nans_copy);
+
+ // Create a mask of all 1s only on positive overflow, 0s otherwise. This uses the MSB
+ // of `differences` (1 when positive or NaN) and the MSB of `converted` (1 on positive
+ // overflow).
+ let tmp = pos.ins().band(differences_bitcast, converted);
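+ // The arithmetic right shift by 31 broadcasts each lane's MSB across the whole lane.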
+ let mask = pos.ins().sshr_imm(tmp, 31);
+
+ // Apply the mask to create 0x7FFFFFFF for positive overflow. XOR of all 0s (all other
+ // cases) has no effect.
+ pos.func.dfg.replace(inst).bxor(converted, mask);
+ } else {
+ unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
+ }
+ }
+}
+
+fn expand_fcvt_to_uint(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ use crate::ir::immediates::{Ieee32, Ieee64};
+
+ let x = match func.dfg[inst] {
+ ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToUint,
+ arg,
+ } => arg,
+ _ => panic!("Need fcvt_to_uint: {}", func.dfg.display_inst(inst, None)),
+ };
+
+ let old_block = func.layout.pp_block(inst);
+ let xty = func.dfg.value_type(x);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ // block handling numbers < 2^(N-1).
+ let below_uint_max_block = func.dfg.make_block();
+
+ // block handling numbers < 0.
+ let below_zero_block = func.dfg.make_block();
+
+ // block handling numbers >= 2^(N-1).
+ let large = func.dfg.make_block();
+
+ // Final block after the bad value checks.
+ let done = func.dfg.make_block();
+
+ // Move the `inst` result value onto the `done` block.
+ func.dfg.clear_results(inst);
+ func.dfg.attach_block_param(done, result);
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in
+ // the destination integer type.
+ let pow2nm1 = match xty {
+ ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)),
+ ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)),
+ _ => panic!("Can't convert {}", xty),
+ };
+ let is_large = pos.ins().ffcmp(x, pow2nm1);
+ pos.ins()
+ .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]);
+ pos.ins().jump(below_uint_max_block, &[]);
+
+ // We need to generate a specific trap code when `x` is NaN, so reuse the flags from the
+ // previous comparison.
+ pos.insert_block(below_uint_max_block);
+ pos.ins().trapff(
+ FloatCC::Unordered,
+ is_large,
+ ir::TrapCode::BadConversionToInteger,
+ );
+
+ // Now we know that x < 2^(N-1) and not NaN.
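+ // A non-negative signed conversion result is therefore also the correct unsigned
+ // result; a negative result means the truncated input was below zero, which is out of
+ // range for an unsigned conversion.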
+ let sres = pos.ins().x86_cvtt2si(ty, x);
+ let is_neg = pos.ins().ifcmp_imm(sres, 0);
+ pos.ins()
+ .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]);
+ pos.ins().jump(below_zero_block, &[]);
+
+ pos.insert_block(below_zero_block);
+ pos.ins().trap(ir::TrapCode::IntegerOverflow);
+
+ // Handle the case where x >= 2^(N-1) and not NaN.
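+ // Bias the input down by 2^(N-1) so it fits in the signed range, convert it, and add
+ // 2^(N-1) back as an integer; the wrapping add restores the unsigned bit pattern.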
+ pos.insert_block(large);
+ let adjx = pos.ins().fsub(x, pow2nm1);
+ let lres = pos.ins().x86_cvtt2si(ty, adjx);
+ let is_neg = pos.ins().ifcmp_imm(lres, 0);
+ pos.ins()
+ .trapif(IntCC::SignedLessThan, is_neg, ir::TrapCode::IntegerOverflow);
+ let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1));
+
+ // Recycle the original instruction as a jump.
+ pos.func.dfg.replace(inst).jump(done, &[lfinal]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, below_uint_max_block);
+ cfg.recompute_block(pos.func, below_zero_block);
+ cfg.recompute_block(pos.func, large);
+ cfg.recompute_block(pos.func, done);
+}
+
+fn expand_fcvt_to_uint_sat(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ use crate::ir::immediates::{Ieee32, Ieee64};
+
+ let x = match func.dfg[inst] {
+ ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToUintSat,
+ arg,
+ } => arg,
+ _ => panic!(
+ "Need fcvt_to_uint_sat: {}",
+ func.dfg.display_inst(inst, None)
+ ),
+ };
+
+ let old_block = func.layout.pp_block(inst);
+ let xty = func.dfg.value_type(x);
+ let result = func.dfg.first_result(inst);
+ let ty = func.dfg.value_type(result);
+
+ // block handling numbers < 2^(N-1).
+ let below_pow2nm1_or_nan_block = func.dfg.make_block();
+ let below_pow2nm1_block = func.dfg.make_block();
+
+ // block handling numbers >= 2^(N-1).
+ let large = func.dfg.make_block();
+
+ // block handling numbers < 2^N.
+ let uint_large_block = func.dfg.make_block();
+
+ // Final block after the bad value checks.
+ let done = func.dfg.make_block();
+
+ // Move the `inst` result value onto the `done` block.
+ func.dfg.clear_results(inst);
+ func.dfg.attach_block_param(done, result);
+
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in
+ // the destination integer type.
+ let pow2nm1 = match xty {
+ ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)),
+ ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)),
+ _ => panic!("Can't convert {}", xty),
+ };
+ let zero = pos.ins().iconst(ty, 0);
+ let is_large = pos.ins().ffcmp(x, pow2nm1);
+ pos.ins()
+ .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]);
+ pos.ins().jump(below_pow2nm1_or_nan_block, &[]);
+
+ // We need to generate zero when `x` is NaN, so reuse the flags from the previous comparison.
+ pos.insert_block(below_pow2nm1_or_nan_block);
+ pos.ins().brff(FloatCC::Unordered, is_large, done, &[zero]);
+ pos.ins().jump(below_pow2nm1_block, &[]);
+
+ // Now we know that x < 2^(N-1) and not NaN. If the result of the cvtt2si is positive, we're
+ // done; otherwise saturate to the minimum unsigned value, that is 0.
+ pos.insert_block(below_pow2nm1_block);
+ let sres = pos.ins().x86_cvtt2si(ty, x);
+ let is_neg = pos.ins().ifcmp_imm(sres, 0);
+ pos.ins()
+ .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]);
+ pos.ins().jump(done, &[zero]);
+
+ // Handle the case where x >= 2^(N-1) and not NaN.
+ pos.insert_block(large);
+ let adjx = pos.ins().fsub(x, pow2nm1);
+ let lres = pos.ins().x86_cvtt2si(ty, adjx);
+ let max_value = pos.ins().iconst(
+ ty,
+ match ty {
+ ir::types::I32 => u32::max_value() as i64,
+ ir::types::I64 => u64::max_value() as i64,
+ _ => panic!("Can't convert {}", ty),
+ },
+ );
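+ // If the adjusted conversion still came back negative, the input was >= 2^N, so
+ // saturate to the unsigned maximum.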
+ let is_neg = pos.ins().ifcmp_imm(lres, 0);
+ pos.ins()
+ .brif(IntCC::SignedLessThan, is_neg, done, &[max_value]);
+ pos.ins().jump(uint_large_block, &[]);
+
+ pos.insert_block(uint_large_block);
+ let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1));
+
+ // Recycle the original instruction as a jump.
+ pos.func.dfg.replace(inst).jump(done, &[lfinal]);
+
+ // Finally insert a label for the completion.
+ pos.next_inst();
+ pos.insert_block(done);
+
+ cfg.recompute_block(pos.func, old_block);
+ cfg.recompute_block(pos.func, below_pow2nm1_or_nan_block);
+ cfg.recompute_block(pos.func, below_pow2nm1_block);
+ cfg.recompute_block(pos.func, large);
+ cfg.recompute_block(pos.func, uint_large_block);
+ cfg.recompute_block(pos.func, done);
+}
+
+// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4.
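+// Each little-endian lane below is 0x4f000000, i.e. 2147483648.0 (2^31), the value that
+// `i32::MAX as f32` rounds up to.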
+static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [
+ 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f,
+];
+
+/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes
+/// using a long sequence of NaN quieting and truncation. This logic is separate from
+/// [expand_fcvt_to_uint_sat] above (the scalar version) only because of how the transform groups
+/// are set up; TODO: if we change the SIMD legalization groups, this logic could be merged into
+/// [expand_fcvt_to_uint_sat] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
+fn expand_fcvt_to_uint_sat_vector(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Unary {
+ opcode: ir::Opcode::FcvtToUintSat,
+ arg,
+ } = pos.func.dfg[inst]
+ {
+ let controlling_type = pos.func.dfg.ctrl_typevar(inst);
+ if controlling_type == I32X4 {
+ debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
+ // We must both quiet any NaNs--setting that lane to 0--and saturate any
+ // lanes that might overflow during conversion to the highest/lowest integer
+ // allowed in that lane.
+ let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into());
+ let max_signed_constant = pos
+ .func
+ .dfg
+ .constants
+ .insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into());
+ let zeroes = pos.ins().vconst(F32X4, zeroes_constant);
+ let max_signed = pos.ins().vconst(F32X4, max_signed_constant);
+ // Clamp the input to 0 for negative floating point numbers. TODO we need to
+ // convert NaNs to 0 but this doesn't do that?
+ let ge_zero = pos.ins().x86_fmax(arg, zeroes);
+ // Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert.
+ // For floating point numbers above this, CVTTPS2DQ returns the undefined value
+ // 0x80000000.
+ let minus_max_signed = pos.ins().fsub(ge_zero, max_signed);
+ let le_max_signed =
+ pos.ins()
+ .fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed);
+ // Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0.
+ // These lanes have the MSB set to 1 after the XOR. We are trying to calculate a
+ // valid, in-range addend.
+ let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed);
+ let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed);
+ let difference = pos
+ .ins()
+ .bxor(minus_max_signed_as_int, le_max_signed_as_int);
+ // Calculate the amount to add above 0x7FFFFFFF, zeroing out any lanes identified
+ // previously (MSB set to 1).
+ let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes);
+ let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int);
+ // Convert the original clamped number to an integer and add back in the addend
+ // (the part of the value above 0x7FFFFFFF, since CVTTPS2DQ overflows with these).
+ let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero);
+ pos.func.dfg.replace(inst).iadd(converted, addend);
+ } else {
+ unreachable!(
+ "{} should not be legalized in expand_fcvt_to_uint_sat_vector",
+ pos.func.dfg.display_inst(inst, None)
+ )
+ }
+ }
+}
+
+/// Convert shuffle instructions.
+fn convert_shuffle(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] {
+ // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a 1
+ // in the most significant position zeroes the lane.
+ let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
+
+ // We only have to worry about aliasing here because copies will be introduced later (in
+ // regalloc).
+ let a = pos.func.dfg.resolve_aliases(args[0]);
+ let b = pos.func.dfg.resolve_aliases(args[1]);
+ let mask = pos
+ .func
+ .dfg
+ .immediates
+ .get(mask)
+ .expect("The shuffle immediate should have been recorded before this point")
+ .clone();
+ if a == b {
+ // PSHUFB the first argument (since it is the same as the second).
+ let constructed_mask = mask
+ .iter()
+ // If the mask is greater than 15 it still may be referring to a lane in b.
+ .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+ .map(zero_unknown_lane_index)
+ .collect();
+ let handle = pos.func.dfg.constants.insert(constructed_mask);
+ // Move the built mask into another XMM register.
+ let a_type = pos.func.dfg.value_type(a);
+ let mask_value = pos.ins().vconst(a_type, handle);
+ // Shuffle the single incoming argument.
+ pos.func.dfg.replace(inst).x86_pshufb(a, mask_value);
+ } else {
+ // PSHUFB the first argument, placing zeroes for unused lanes.
+ let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
+ let handle = pos.func.dfg.constants.insert(constructed_mask);
+ // Move the built mask into another XMM register.
+ let a_type = pos.func.dfg.value_type(a);
+ let mask_value = pos.ins().vconst(a_type, handle);
+ // Shuffle the first argument.
+ let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value);
+
+ // PSHUFB the second argument, placing zeroes for unused lanes.
+ let constructed_mask = mask
+ .iter()
+ .map(|b| b.wrapping_sub(16))
+ .map(zero_unknown_lane_index)
+ .collect();
+ let handle = pos.func.dfg.constants.insert(constructed_mask);
+ // Move the built mask into another XMM register.
+ let b_type = pos.func.dfg.value_type(b);
+ let mask_value = pos.ins().vconst(b_type, handle);
+ // Shuffle the second argument.
+ let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value);
+
+ // OR the vectors together to form the final shuffled value.
+ pos.func
+ .dfg
+ .replace(inst)
+ .bor(shuffled_first_arg, shuffled_second_arg);
+
+ // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
+ };
+ }
+}
+
+/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
+/// extractlane instruction.
+fn convert_extractlane(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::BinaryImm8 {
+ opcode: ir::Opcode::Extractlane,
+ arg,
+ imm: lane,
+ } = pos.func.dfg[inst]
+ {
+ // NOTE: the following legalization assumes that the upper bits of the XMM register do
+ // not need to be zeroed during extractlane.
+ let value_type = pos.func.dfg.value_type(arg);
+ if value_type.lane_type().is_float() {
+ // Floats are already in XMM registers and can stay there.
+ let shuffled = if lane != 0 {
+ // Replace the extractlane with a PSHUFD to get the float in the right place.
+ match value_type {
+ F32X4 => {
+ // Move the selected lane to the 0 lane.
+ let shuffle_mask: u8 = 0b00_00_00_00 | lane;
+ pos.ins().x86_pshufd(arg, shuffle_mask)
+ }
+ F64X2 => {
+ assert_eq!(lane, 1);
+ // Because we know the lane == 1, we move the upper 64 bits to the lower
+ // 64 bits, leaving the top 64 bits as-is.
+ let shuffle_mask = 0b11_10_11_10;
+ let bitcast = pos.ins().raw_bitcast(F32X4, arg);
+ pos.ins().x86_pshufd(bitcast, shuffle_mask)
+ }
+ _ => unreachable!(),
+ }
+ } else {
+ // Remove the extractlane instruction, leaving the float where it is.
+ arg
+ };
+ // Then we must bitcast to the right type.
+ pos.func
+ .dfg
+ .replace(inst)
+ .raw_bitcast(value_type.lane_type(), shuffled);
+ } else {
+ // For non-floats, lower with the usual PEXTR* instruction.
+ pos.func.dfg.replace(inst).x86_pextr(arg, lane);
+ }
+ }
+}
+
+/// Because floats exist in XMM registers, we can keep them there when executing a CLIF
+/// insertlane instruction.
+fn convert_insertlane(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::TernaryImm8 {
+ opcode: ir::Opcode::Insertlane,
+ args: [vector, replacement],
+ imm: lane,
+ } = pos.func.dfg[inst]
+ {
+ let value_type = pos.func.dfg.value_type(vector);
+ if value_type.lane_type().is_float() {
+ // Floats are already in XMM registers and can stay there.
+ match value_type {
+ F32X4 => {
+ assert!(lane <= 3);
+ let immediate = 0b00_00_00_00 | lane << 4;
+ // Insert 32 bits from replacement (source lane 00, immediate bits 7:6) into vector
+ // at the destination lane (shifted into immediate bits 5:4).
+ pos.func
+ .dfg
+ .replace(inst)
+ .x86_insertps(vector, replacement, immediate)
+ }
+ F64X2 => {
+ let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types
+ if lane == 0 {
+ // Move the lowest quadword in replacement to vector without changing
+ // the upper bits.
+ pos.func
+ .dfg
+ .replace(inst)
+ .x86_movsd(vector, replacement_as_vector)
+ } else {
+ assert_eq!(lane, 1);
+ // Move the low 64 bits of replacement vector to the high 64 bits of the
+ // vector.
+ pos.func
+ .dfg
+ .replace(inst)
+ .x86_movlhps(vector, replacement_as_vector)
+ }
+ }
+ _ => unreachable!(),
+ };
+ } else {
+ // For non-floats, lower with the usual PINSR* instruction.
+ pos.func
+ .dfg
+ .replace(inst)
+ .x86_pinsr(vector, replacement, lane);
+ }
+ }
+}
+
+/// For SIMD or scalar integer negation, convert `ineg` to `vconst + isub` or `iconst + isub`.
+fn convert_ineg(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Unary {
+ opcode: ir::Opcode::Ineg,
+ arg,
+ } = pos.func.dfg[inst]
+ {
+ let value_type = pos.func.dfg.value_type(arg);
+ let zero_value = if value_type.is_vector() && value_type.lane_type().is_int() {
+ let zero_immediate = pos.func.dfg.constants.insert(vec![0; 16].into());
+ pos.ins().vconst(value_type, zero_immediate) // this should be legalized to a PXOR
+ } else if value_type.is_int() {
+ pos.ins().iconst(value_type, 0)
+ } else {
+ panic!("Can't convert ineg of type {}", value_type)
+ };
+ pos.func.dfg.replace(inst).isub(zero_value, arg);
+ } else {
+ unreachable!()
+ }
+}
+
+fn expand_dword_to_xmm<'f>(
+ pos: &mut FuncCursor<'f>,
+ arg: ir::Value,
+ arg_type: ir::Type,
+) -> ir::Value {
+ if arg_type == I64 {
+ let (arg_lo, arg_hi) = pos.ins().isplit(arg);
+ let arg = pos.ins().scalar_to_vector(I32X4, arg_lo);
+ let arg = pos.ins().insertlane(arg, arg_hi, 1);
+ let arg = pos.ins().raw_bitcast(I64X2, arg);
+ arg
+ } else {
+ pos.ins().bitcast(I64X2, arg)
+ }
+}
+
+fn contract_dword_from_xmm<'f>(
+ pos: &mut FuncCursor<'f>,
+ inst: ir::Inst,
+ ret: ir::Value,
+ ret_type: ir::Type,
+) {
+ if ret_type == I64 {
+ let ret = pos.ins().raw_bitcast(I32X4, ret);
+ let ret_lo = pos.ins().extractlane(ret, 0);
+ let ret_hi = pos.ins().extractlane(ret, 1);
+ pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi);
+ } else {
+ let ret = pos.ins().extractlane(ret, 0);
+ pos.func.dfg.replace(inst).ireduce(ret_type, ret);
+ }
+}
+
+// Masks for i8x16 unsigned right shift.
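+// Row `s` (the 16 bytes at offset 16 * s) keeps only the low `8 - s` bits of every byte,
+// clearing the bits that the i16x8 shift below pulls in from the neighboring byte.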
+static USHR_MASKS: [u8; 128] = [
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+];
+
+// Convert a vector unsigned right shift. x86 has implementations for i16x8 and up (see `x86_psrl`),
+// but for i8x16 we translate the shift to an i16x8 shift and mask off the upper bits. This same
+// conversion could be provided in the CDSL if we could use varargs there (TODO); i.e. `load_complex`
+// has a varargs field that we can't modify with the CDSL in legalize.rs.
+fn convert_ushr(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Binary {
+ opcode: ir::Opcode::Ushr,
+ args: [arg0, arg1],
+ } = pos.func.dfg[inst]
+ {
+ // Note that for Wasm, the bounding of the shift index has happened during translation
+ let arg0_type = pos.func.dfg.value_type(arg0);
+ let arg1_type = pos.func.dfg.value_type(arg1);
+ assert!(!arg1_type.is_vector() && arg1_type.is_int());
+
+ // TODO it may be more clear to use scalar_to_vector here; the current issue is that
+ // scalar_to_vector has the restriction that the vector produced has a matching lane size
+ // (e.g. i32 -> i32x4) whereas bitcast allows moving any-to-any conversions (e.g. i32 ->
+ // i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift
+ // index type--this could be relaxed since it is not really meaningful.
+ let shift_index = pos.ins().bitcast(I64X2, arg1);
+
+ if arg0_type == I8X16 {
+ // First, shift the vector using an I16X8 shift.
+ let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
+ let shifted = pos.ins().x86_psrl(bitcasted, shift_index);
+ let shifted = pos.ins().raw_bitcast(I8X16, shifted);
+
+ // Then, fixup the even lanes that have incorrect upper bits. This uses the 128 mask
+ // bytes as a table that we index into. It is a substantial code-size increase but
+ // reduces the instruction count slightly.
+ let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into());
+ let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
+ let mask_offset = pos.ins().ishl_imm(arg1, 4);
+ let mask =
+ pos.ins()
+ .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
+ pos.func.dfg.replace(inst).band(shifted, mask);
+ } else if arg0_type.is_vector() {
+ // x86 has encodings for these shifts.
+ pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
+ } else if arg0_type == I64 {
+ // 64 bit shifts need to be legalized on x86_32.
+ let x86_isa = isa
+ .as_any()
+ .downcast_ref::<isa::x86::Isa>()
+ .expect("the target ISA must be x86 at this point");
+ if x86_isa.isa_flags.has_sse41() {
+ // if we have pinstrq/pextrq (SSE 4.1), legalize to that
+ let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+ let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+ let shifted = pos.ins().x86_psrl(value, amount);
+ contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
+ } else {
+ // otherwise legalize to libcall
+ expand_as_libcall(inst, func, isa);
+ }
+ } else {
+ // Everything else should be already legal.
+ unreachable!()
+ }
+ }
+}
+
+// Masks for i8x16 left shift.
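+// Row `s` (the 16 bytes at offset 16 * s) keeps only the high `8 - s` bits of every byte,
+// clearing the bits that the i16x8 shift below pushes in from the neighboring byte.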
+static SHL_MASKS: [u8; 128] = [
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
+ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
+ 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+];
+
+// Convert a vector left shift. x86 has implementations for i16x8 and up (see `x86_psll`),
+// but for i8x16 we translate the shift to an i16x8 shift and mask off the lower bits. This same
+// conversion could be provided in the CDSL if we could use varargs there (TODO); i.e. `load_complex`
+// has a varargs field that we can't modify with the CDSL in legalize.rs.
+fn convert_ishl(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Binary {
+ opcode: ir::Opcode::Ishl,
+ args: [arg0, arg1],
+ } = pos.func.dfg[inst]
+ {
+ // Note that for Wasm, the bounding of the shift index has happened during translation
+ let arg0_type = pos.func.dfg.value_type(arg0);
+ let arg1_type = pos.func.dfg.value_type(arg1);
+ assert!(!arg1_type.is_vector() && arg1_type.is_int());
+
+ // TODO it may be more clear to use scalar_to_vector here; the current issue is that
+ // scalar_to_vector has the restriction that the vector produced has a matching lane size
+ // (e.g. i32 -> i32x4) whereas bitcast allows moving any-to-any conversions (e.g. i32 ->
+ // i64x2). This matters because for some reason x86_psll only allows i64x2 as the shift
+ // index type--this could be relaxed since it is not really meaningful.
+ let shift_index = pos.ins().bitcast(I64X2, arg1);
+
+ if arg0_type == I8X16 {
+ // First, shift the vector using an I16X8 shift.
+ let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
+ let shifted = pos.ins().x86_psll(bitcasted, shift_index);
+ let shifted = pos.ins().raw_bitcast(I8X16, shifted);
+
+ // Then, fixup the odd lanes that have incorrect lower bits. This uses the 128 mask
+ // bytes as a table that we index into. It is a substantial code-size increase but
+ // reduces the instruction count slightly.
+ let masks = pos.func.dfg.constants.insert(SHL_MASKS.as_ref().into());
+ let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
+ let mask_offset = pos.ins().ishl_imm(arg1, 4);
+ let mask =
+ pos.ins()
+ .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
+ pos.func.dfg.replace(inst).band(shifted, mask);
+ } else if arg0_type.is_vector() {
+ // x86 has encodings for these shifts.
+ pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
+ } else if arg0_type == I64 {
+ // 64 bit shifts need to be legalized on x86_32.
+ let x86_isa = isa
+ .as_any()
+ .downcast_ref::<isa::x86::Isa>()
+ .expect("the target ISA must be x86 at this point");
+ if x86_isa.isa_flags.has_sse41() {
+ // if we have pinstrq/pextrq (SSE 4.1), legalize to that
+ let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+ let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+ let shifted = pos.ins().x86_psll(value, amount);
+ contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
+ } else {
+ // otherwise legalize to libcall
+ expand_as_libcall(inst, func, isa);
+ }
+ } else {
+ // Everything else should be already legal.
+ unreachable!()
+ }
+ }
+}
+
+/// Convert an imul.i64x2 to a valid code sequence on x86, first with AVX512 and then with SSE2.
+fn convert_i64x2_imul(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+ pos.use_srcloc(inst);
+
+ if let ir::InstructionData::Binary {
+ opcode: ir::Opcode::Imul,
+ args: [arg0, arg1],
+ } = pos.func.dfg[inst]
+ {
+ let ty = pos.func.dfg.ctrl_typevar(inst);
+ if ty == I64X2 {
+ let x86_isa = isa
+ .as_any()
+ .downcast_ref::<isa::x86::Isa>()
+ .expect("the target ISA must be x86 at this point");
+ if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() {
+ // If we have certain AVX512 features, we can lower this instruction simply.
+ pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1);
+ } else {
+ // Otherwise, we default to a very lengthy SSE2-compatible sequence. It splits each
+ // 64-bit lane into 32-bit high and low sections using shifting and then performs
+ // the following arithmetic per lane: with arg0 = concat(high0, low0) and arg1 =
+ // concat(high1, low1), calculate (((high0 * low1) + (high1 * low0)) << 32) + (low0 * low1).
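+ // `x86_pmuludq` multiplies only the low 32 bits of each 64-bit lane, so shifting a
+ // lane right by 32 first exposes its high half to the multiply.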
+ let high0 = pos.ins().ushr_imm(arg0, 32);
+ let mul0 = pos.ins().x86_pmuludq(high0, arg1);
+ let high1 = pos.ins().ushr_imm(arg1, 32);
+ let mul1 = pos.ins().x86_pmuludq(high1, arg0);
+ let addhigh = pos.ins().iadd(mul0, mul1);
+ let high = pos.ins().ishl_imm(addhigh, 32);
+ let low = pos.ins().x86_pmuludq(arg0, arg1);
+ pos.func.dfg.replace(inst).iadd(low, high);
+ }
+ } else {
+ unreachable!(
+ "{} should be encodable; it cannot be legalized by convert_i64x2_imul",
+ pos.func.dfg.display_inst(inst, None)
+ );
+ }
+ }
+}
+
+fn expand_tls_value(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ isa: &dyn TargetIsa,
+) {
+ use crate::settings::TlsModel;
+
+ assert!(
+ isa.triple().architecture == target_lexicon::Architecture::X86_64,
+ "Not yet implemented for {:?}",
+ isa.triple(),
+ );
+
+ if let ir::InstructionData::UnaryGlobalValue {
+ opcode: ir::Opcode::TlsValue,
+ global_value,
+ } = func.dfg[inst]
+ {
+ let ctrl_typevar = func.dfg.ctrl_typevar(inst);
+ assert_eq!(ctrl_typevar, ir::types::I64);
+
+ match isa.flags().tls_model() {
+ TlsModel::None => panic!("tls_model flag is not set."),
+ TlsModel::ElfGd => {
+ func.dfg.replace(inst).x86_elf_tls_get_addr(global_value);
+ }
+ TlsModel::Macho => {
+ func.dfg.replace(inst).x86_macho_tls_get_addr(global_value);
+ }
+ model => unimplemented!("tls_value for tls model {:?}", model),
+ }
+ } else {
+ unreachable!();
+ }
+}
+
+fn expand_load_splat(
+ inst: ir::Inst,
+ func: &mut ir::Function,
+ _cfg: &mut ControlFlowGraph,
+ _isa: &dyn TargetIsa,
+) {
+ let mut pos = FuncCursor::new(func).at_inst(inst);
+
+ pos.use_srcloc(inst);
+
+ let (ptr, offset, flags) = match pos.func.dfg[inst] {
+ ir::InstructionData::Load {
+ opcode: ir::Opcode::LoadSplat,
+ arg,
+ offset,
+ flags,
+ } => (arg, offset, flags),
+ _ => panic!(
+ "Expected load_splat: {}",
+ pos.func.dfg.display_inst(inst, None)
+ ),
+ };
+ let ty = pos.func.dfg.ctrl_typevar(inst);
+ let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
+
+ pos.func.dfg.replace(inst).splat(ty, load);
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs
new file mode 100644
index 0000000000..cbdeb3069d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs
@@ -0,0 +1,190 @@
+//! x86 Instruction Set Architectures.
+
+mod abi;
+mod binemit;
+mod enc_tables;
+mod registers;
+pub mod settings;
+#[cfg(feature = "unwind")]
+pub mod unwind;
+
+use super::super::settings as shared_settings;
+#[cfg(feature = "testing_hooks")]
+use crate::binemit::CodeSink;
+use crate::binemit::{emit_function, MemoryCodeSink};
+use crate::ir;
+use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encodings};
+use crate::isa::Builder as IsaBuilder;
+#[cfg(feature = "unwind")]
+use crate::isa::{unwind::systemv::RegisterMappingError, RegUnit};
+use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
+use crate::regalloc;
+use crate::result::CodegenResult;
+use crate::timing;
+use alloc::borrow::Cow;
+use alloc::boxed::Box;
+use core::any::Any;
+use core::fmt;
+use target_lexicon::{PointerWidth, Triple};
+
+#[allow(dead_code)]
+struct Isa {
+ triple: Triple,
+ shared_flags: shared_settings::Flags,
+ isa_flags: settings::Flags,
+ cpumode: &'static [shared_enc_tables::Level1Entry<u16>],
+}
+
+/// Get an ISA builder for creating x86 targets.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+ IsaBuilder {
+ triple,
+ setup: settings::builder(),
+ constructor: isa_constructor,
+ }
+}
+
+fn isa_constructor(
+ triple: Triple,
+ shared_flags: shared_settings::Flags,
+ builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+ let level1 = match triple.pointer_width().unwrap() {
+ PointerWidth::U16 => unimplemented!("x86-16"),
+ PointerWidth::U32 => &enc_tables::LEVEL1_I32[..],
+ PointerWidth::U64 => &enc_tables::LEVEL1_I64[..],
+ };
+
+ let isa_flags = settings::Flags::new(&shared_flags, builder);
+
+ Box::new(Isa {
+ triple,
+ isa_flags,
+ shared_flags,
+ cpumode: level1,
+ })
+}
+
+impl TargetIsa for Isa {
+ fn name(&self) -> &'static str {
+ "x86"
+ }
+
+ fn triple(&self) -> &Triple {
+ &self.triple
+ }
+
+ fn flags(&self) -> &shared_settings::Flags {
+ &self.shared_flags
+ }
+
+ fn uses_cpu_flags(&self) -> bool {
+ true
+ }
+
+ fn uses_complex_addresses(&self) -> bool {
+ true
+ }
+
+ fn register_info(&self) -> RegInfo {
+ registers::INFO.clone()
+ }
+
+ #[cfg(feature = "unwind")]
+ fn map_dwarf_register(&self, reg: RegUnit) -> Result<u16, RegisterMappingError> {
+ unwind::systemv::map_reg(self, reg).map(|r| r.0)
+ }
+
+ fn encoding_info(&self) -> EncInfo {
+ enc_tables::INFO.clone()
+ }
+
+ fn legal_encodings<'a>(
+ &'a self,
+ func: &'a ir::Function,
+ inst: &'a ir::InstructionData,
+ ctrl_typevar: ir::Type,
+ ) -> Encodings<'a> {
+ lookup_enclist(
+ ctrl_typevar,
+ inst,
+ func,
+ self.cpumode,
+ &enc_tables::LEVEL2[..],
+ &enc_tables::ENCLISTS[..],
+ &enc_tables::LEGALIZE_ACTIONS[..],
+ &enc_tables::RECIPE_PREDICATES[..],
+ &enc_tables::INST_PREDICATES[..],
+ self.isa_flags.predicate_view(),
+ )
+ }
+
+ fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
+ abi::legalize_signature(
+ sig,
+ &self.triple,
+ current,
+ &self.shared_flags,
+ &self.isa_flags,
+ )
+ }
+
+ fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass {
+ abi::regclass_for_abi_type(ty)
+ }
+
+ fn allocatable_registers(&self, _func: &ir::Function) -> regalloc::RegisterSet {
+ abi::allocatable_registers(&self.triple, &self.shared_flags)
+ }
+
+ #[cfg(feature = "testing_hooks")]
+ fn emit_inst(
+ &self,
+ func: &ir::Function,
+ inst: ir::Inst,
+ divert: &mut regalloc::RegDiversions,
+ sink: &mut dyn CodeSink,
+ ) {
+ binemit::emit_inst(func, inst, divert, sink, self)
+ }
+
+ fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) {
+ emit_function(func, binemit::emit_inst, sink, self)
+ }
+
+ fn prologue_epilogue(&self, func: &mut ir::Function) -> CodegenResult<()> {
+ let _tt = timing::prologue_epilogue();
+ abi::prologue_epilogue(func, self)
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC {
+ ir::condcodes::IntCC::UnsignedLessThan
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC {
+ ir::condcodes::IntCC::UnsignedLessThan
+ }
+
+ #[cfg(feature = "unwind")]
+ fn create_unwind_info(
+ &self,
+ func: &ir::Function,
+ ) -> CodegenResult<Option<super::unwind::UnwindInfo>> {
+ abi::create_unwind_info(func, self)
+ }
+
+ #[cfg(feature = "unwind")]
+ fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+ Some(unwind::systemv::create_cie())
+ }
+
+ fn as_any(&self) -> &dyn Any {
+ self as &dyn Any
+ }
+}
+
+impl fmt::Display for Isa {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "{}\n{}", self.shared_flags, self.isa_flags)
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs b/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs
new file mode 100644
index 0000000000..a7518b268b
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs
@@ -0,0 +1,86 @@
+//! x86 register descriptions.
+
+use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit};
+
+include!(concat!(env!("OUT_DIR"), "/registers-x86.rs"));
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::isa::RegUnit;
+ use alloc::string::{String, ToString};
+
+ #[test]
+ fn unit_encodings() {
+ fn gpr(unit: usize) -> Option<u16> {
+ Some(GPR.unit(unit))
+ }
+ // The encoding of integer registers is not alphabetical.
+ assert_eq!(INFO.parse_regunit("rax"), gpr(0));
+ assert_eq!(INFO.parse_regunit("rbx"), gpr(3));
+ assert_eq!(INFO.parse_regunit("rcx"), gpr(1));
+ assert_eq!(INFO.parse_regunit("rdx"), gpr(2));
+ assert_eq!(INFO.parse_regunit("rsi"), gpr(6));
+ assert_eq!(INFO.parse_regunit("rdi"), gpr(7));
+ assert_eq!(INFO.parse_regunit("rbp"), gpr(5));
+ assert_eq!(INFO.parse_regunit("rsp"), gpr(4));
+ assert_eq!(INFO.parse_regunit("r8"), gpr(8));
+ assert_eq!(INFO.parse_regunit("r15"), gpr(15));
+
+ fn fpr(unit: usize) -> Option<u16> {
+ Some(FPR.unit(unit))
+ }
+ assert_eq!(INFO.parse_regunit("xmm0"), fpr(0));
+ assert_eq!(INFO.parse_regunit("xmm15"), fpr(15));
+
+ // FIXME(#1306) Add these tests back in when FPR32 is re-added.
+ // fn fpr32(unit: usize) -> Option<u16> {
+ // Some(FPR32.unit(unit))
+ // }
+ // assert_eq!(INFO.parse_regunit("xmm0"), fpr32(0));
+ // assert_eq!(INFO.parse_regunit("xmm31"), fpr32(31));
+ }
+
+ #[test]
+ fn unit_names() {
+ fn gpr(ru: RegUnit) -> String {
+ INFO.display_regunit(GPR.first + ru).to_string()
+ }
+ assert_eq!(gpr(0), "%rax");
+ assert_eq!(gpr(3), "%rbx");
+ assert_eq!(gpr(1), "%rcx");
+ assert_eq!(gpr(2), "%rdx");
+ assert_eq!(gpr(6), "%rsi");
+ assert_eq!(gpr(7), "%rdi");
+ assert_eq!(gpr(5), "%rbp");
+ assert_eq!(gpr(4), "%rsp");
+ assert_eq!(gpr(8), "%r8");
+ assert_eq!(gpr(15), "%r15");
+
+ fn fpr(ru: RegUnit) -> String {
+ INFO.display_regunit(FPR.first + ru).to_string()
+ }
+ assert_eq!(fpr(0), "%xmm0");
+ assert_eq!(fpr(15), "%xmm15");
+
+ // FIXME(#1306) Add these tests back in when FPR32 is re-added.
+ // fn fpr32(ru: RegUnit) -> String {
+ // INFO.display_regunit(FPR32.first + ru).to_string()
+ // }
+ // assert_eq!(fpr32(0), "%xmm0");
+ // assert_eq!(fpr32(31), "%xmm31");
+ }
+
+ #[test]
+ fn regclasses() {
+ assert_eq!(GPR.intersect_index(GPR), Some(GPR.into()));
+ assert_eq!(GPR.intersect_index(ABCD), Some(ABCD.into()));
+ assert_eq!(GPR.intersect_index(FPR), None);
+ assert_eq!(ABCD.intersect_index(GPR), Some(ABCD.into()));
+ assert_eq!(ABCD.intersect_index(ABCD), Some(ABCD.into()));
+ assert_eq!(ABCD.intersect_index(FPR), None);
+ assert_eq!(FPR.intersect_index(FPR), Some(FPR.into()));
+ assert_eq!(FPR.intersect_index(GPR), None);
+ assert_eq!(FPR.intersect_index(ABCD), None);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs
new file mode 100644
index 0000000000..2d3a3f6698
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs
@@ -0,0 +1,52 @@
+//! x86 Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/x86/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs"));
+
+#[cfg(test)]
+mod tests {
+ use super::{builder, Flags};
+ use crate::settings::{self, Configurable};
+
+ #[test]
+ fn presets() {
+ let shared = settings::Flags::new(settings::builder());
+
+ // Nehalem has SSE4.1 but not BMI1.
+ let mut b0 = builder();
+ b0.enable("nehalem").unwrap();
+ let f0 = Flags::new(&shared, b0);
+ assert_eq!(f0.has_sse41(), true);
+ assert_eq!(f0.has_bmi1(), false);
+
+ let mut b1 = builder();
+ b1.enable("haswell").unwrap();
+ let f1 = Flags::new(&shared, b1);
+ assert_eq!(f1.has_sse41(), true);
+ assert_eq!(f1.has_bmi1(), true);
+ }
+ #[test]
+ fn display_presets() {
+ // Spot check that the flags Display impl does not cause a panic
+ let shared = settings::Flags::new(settings::builder());
+
+ let b0 = builder();
+ let f0 = Flags::new(&shared, b0);
+ let _ = format!("{}", f0);
+
+ let mut b1 = builder();
+ b1.enable("nehalem").unwrap();
+ let f1 = Flags::new(&shared, b1);
+ let _ = format!("{}", f1);
+
+ let mut b2 = builder();
+ b2.enable("haswell").unwrap();
+ let f2 = Flags::new(&shared, b2);
+ let _ = format!("{}", f2);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs
new file mode 100644
index 0000000000..2d6b29f04d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs
@@ -0,0 +1,535 @@
+//! Module for x86 unwind generation for supported ABIs.
+
+pub mod systemv;
+pub mod winx64;
+
+use crate::ir::{Function, InstructionData, Opcode, ValueLoc};
+use crate::isa::x86::registers::{FPR, RU};
+use crate::isa::{RegUnit, TargetIsa};
+use crate::result::CodegenResult;
+use alloc::vec::Vec;
+use std::collections::HashMap;
+
+use crate::isa::unwind::input::{UnwindCode, UnwindInfo};
+
+pub(crate) fn create_unwind_info(
+ func: &Function,
+ isa: &dyn TargetIsa,
+) -> CodegenResult<Option<UnwindInfo<RegUnit>>> {
+ // Find last block based on max offset.
+ let last_block = func
+ .layout
+ .blocks()
+ .max_by_key(|b| func.offsets[*b])
+ .expect("at least a block");
+ // Find last instruction offset + size, and make it function size.
+ let function_size = func
+ .inst_offsets(last_block, &isa.encoding_info())
+ .fold(0, |_, (offset, _, size)| offset + size);
+
+ let entry_block = func.layout.entry_block().expect("missing entry block");
+ let prologue_end = func.prologue_end.unwrap();
+ let epilogues_start = func
+ .epilogues_start
+ .iter()
+ .map(|(i, b)| (*b, *i))
+ .collect::<HashMap<_, _>>();
+
+ let word_size = isa.pointer_bytes();
+
+ let mut stack_size = None;
+ let mut prologue_size = 0;
+ let mut prologue_unwind_codes = Vec::new();
+ let mut epilogues_unwind_codes = Vec::new();
+ let mut frame_register: Option<RegUnit> = None;
+
+ // Process only entry block and blocks with epilogues.
+ let mut blocks = func
+ .epilogues_start
+ .iter()
+ .map(|(_, b)| *b)
+ .collect::<Vec<_>>();
+ if !blocks.contains(&entry_block) {
+ blocks.push(entry_block);
+ }
+ blocks.sort_by_key(|b| func.offsets[*b]);
+
+ for block in blocks.iter() {
+ let mut in_prologue = block == &entry_block;
+ let mut in_epilogue = false;
+ let mut epilogue_pop_offsets = Vec::new();
+
+ let epilogue_start = epilogues_start.get(block);
+ let is_last_block = block == &last_block;
+
+ for (offset, inst, size) in func.inst_offsets(*block, &isa.encoding_info()) {
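+ // Record unwind offsets at the end of each instruction, i.e. at the offset of the
+ // following instruction.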
+ let offset = offset + size;
+
+ let unwind_codes;
+ if in_prologue {
+ // Check for prologue end (inclusive)
+ if prologue_end == inst {
+ in_prologue = false;
+ }
+ prologue_size += size;
+ unwind_codes = &mut prologue_unwind_codes;
+ } else if !in_epilogue && epilogue_start == Some(&inst) {
+ // Now in an epilogue, emit a remember state instruction if not last block
+ in_epilogue = true;
+
+ epilogues_unwind_codes.push(Vec::new());
+ unwind_codes = epilogues_unwind_codes.last_mut().unwrap();
+
+ if !is_last_block {
+ unwind_codes.push((offset, UnwindCode::RememberState));
+ }
+ } else if in_epilogue {
+ unwind_codes = epilogues_unwind_codes.last_mut().unwrap();
+ } else {
+ // Ignore normal instructions
+ continue;
+ }
+
+ match func.dfg[inst] {
+ InstructionData::Unary { opcode, arg } => {
+ match opcode {
+ Opcode::X86Push => {
+ let reg = func.locations[arg].unwrap_reg();
+ unwind_codes.push((
+ offset,
+ UnwindCode::StackAlloc {
+ size: word_size.into(),
+ },
+ ));
+ unwind_codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg,
+ stack_offset: 0,
+ },
+ ));
+ }
+ Opcode::AdjustSpDown => {
+ let stack_size =
+ stack_size.expect("expected a previous stack size instruction");
+
+ // This is used when calling a stack check function
+ // We need to track the assignment to RAX which has the size of the stack
+ unwind_codes
+ .push((offset, UnwindCode::StackAlloc { size: stack_size }));
+ }
+ _ => {}
+ }
+ }
+ InstructionData::UnaryImm { opcode, imm } => {
+ match opcode {
+ Opcode::Iconst => {
+ let imm: i64 = imm.into();
+ assert!(imm <= core::u32::MAX as i64);
+ assert!(stack_size.is_none());
+
+ // This instruction should only appear in a prologue to pass an
+ // argument of the stack size to a stack check function.
+ // Record the stack size so we know what it is when we encounter the adjustment
+ // instruction (which will adjust via the register assigned to this instruction).
+ stack_size = Some(imm as u32);
+ }
+ Opcode::AdjustSpDownImm => {
+ let imm: i64 = imm.into();
+ assert!(imm <= core::u32::MAX as i64);
+
+ stack_size = Some(imm as u32);
+
+ unwind_codes
+ .push((offset, UnwindCode::StackAlloc { size: imm as u32 }));
+ }
+ Opcode::AdjustSpUpImm => {
+ let imm: i64 = imm.into();
+ assert!(imm <= core::u32::MAX as i64);
+
+ stack_size = Some(imm as u32);
+
+ unwind_codes
+ .push((offset, UnwindCode::StackDealloc { size: imm as u32 }));
+ }
+ _ => {}
+ }
+ }
+ InstructionData::Store {
+ opcode: Opcode::Store,
+ args: [arg1, arg2],
+ offset: stack_offset,
+ ..
+ } => {
+ if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
+ (func.locations[arg1], func.locations[arg2])
+ {
+ // If this is a save of an FPR, record an unwind operation
+ // Note: the stack_offset here is relative to an adjusted SP
+ if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+ let stack_offset: i32 = stack_offset.into();
+ unwind_codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: src,
+ stack_offset: stack_offset as u32,
+ },
+ ));
+ }
+ }
+ }
+ InstructionData::CopySpecial { src, dst, .. } if frame_register.is_none() => {
+ // Check for change in CFA register (RSP is always the starting CFA)
+ if src == (RU::rsp as RegUnit) {
+ unwind_codes.push((offset, UnwindCode::SetFramePointer { reg: dst }));
+ frame_register = Some(dst);
+ }
+ }
+ InstructionData::NullAry { opcode } => match opcode {
+ Opcode::X86Pop => {
+ epilogue_pop_offsets.push(offset);
+ }
+ _ => {}
+ },
+ InstructionData::MultiAry { opcode, .. } if in_epilogue => match opcode {
+ Opcode::Return => {
+ let args = func.dfg.inst_args(inst);
+ for (i, arg) in args.iter().rev().enumerate() {
+ // Only walk back the args for the pop instructions encountered
+ if i >= epilogue_pop_offsets.len() {
+ break;
+ }
+
+ let offset = epilogue_pop_offsets[i];
+
+ let reg = func.locations[*arg].unwrap_reg();
+ unwind_codes.push((offset, UnwindCode::RestoreRegister { reg }));
+ unwind_codes.push((
+ offset,
+ UnwindCode::StackDealloc {
+ size: word_size.into(),
+ },
+ ));
+
+ if Some(reg) == frame_register {
+ unwind_codes.push((offset, UnwindCode::RestoreFramePointer));
+ // Keep frame_register assigned for next epilogue.
+ }
+ }
+ epilogue_pop_offsets.clear();
+
+ // TODO ensure unwind codes sorted by offsets ?
+
+ if !is_last_block {
+ unwind_codes.push((offset, UnwindCode::RestoreState));
+ }
+
+ in_epilogue = false;
+ }
+ _ => {}
+ },
+ _ => {}
+ };
+ }
+ }
+
+ Ok(Some(UnwindInfo {
+ prologue_size,
+ prologue_unwind_codes,
+ epilogues_unwind_codes,
+ function_size,
+ word_size,
+ initial_sp_offset: word_size,
+ }))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{
+ types, AbiParam, ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind,
+ };
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_small_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ prologue_size: 9,
+ prologue_unwind_codes: vec![
+ (2, UnwindCode::StackAlloc { size: 8 }),
+ (
+ 2,
+ UnwindCode::SaveRegister {
+ reg: RU::rbp.into(),
+ stack_offset: 0,
+ }
+ ),
+ (
+ 5,
+ UnwindCode::SetFramePointer {
+ reg: RU::rbp.into(),
+ }
+ ),
+ (9, UnwindCode::StackAlloc { size: 64 })
+ ],
+ epilogues_unwind_codes: vec![vec![
+ (13, UnwindCode::StackDealloc { size: 64 }),
+ (
+ 15,
+ UnwindCode::RestoreRegister {
+ reg: RU::rbp.into()
+ }
+ ),
+ (15, UnwindCode::StackDealloc { size: 8 }),
+ (15, UnwindCode::RestoreFramePointer)
+ ]],
+ function_size: 16,
+ word_size: 8,
+ initial_sp_offset: 8,
+ }
+ );
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_medium_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 10000)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ prologue_size: 27,
+ prologue_unwind_codes: vec![
+ (2, UnwindCode::StackAlloc { size: 8 }),
+ (
+ 2,
+ UnwindCode::SaveRegister {
+ reg: RU::rbp.into(),
+ stack_offset: 0,
+ }
+ ),
+ (
+ 5,
+ UnwindCode::SetFramePointer {
+ reg: RU::rbp.into(),
+ }
+ ),
+ (27, UnwindCode::StackAlloc { size: 10000 })
+ ],
+ epilogues_unwind_codes: vec![vec![
+ (34, UnwindCode::StackDealloc { size: 10000 }),
+ (
+ 36,
+ UnwindCode::RestoreRegister {
+ reg: RU::rbp.into()
+ }
+ ),
+ (36, UnwindCode::StackDealloc { size: 8 }),
+ (36, UnwindCode::RestoreFramePointer)
+ ]],
+ function_size: 37,
+ word_size: 8,
+ initial_sp_offset: 8,
+ }
+ );
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_large_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 1000000)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ prologue_size: 27,
+ prologue_unwind_codes: vec![
+ (2, UnwindCode::StackAlloc { size: 8 }),
+ (
+ 2,
+ UnwindCode::SaveRegister {
+ reg: RU::rbp.into(),
+ stack_offset: 0,
+ }
+ ),
+ (
+ 5,
+ UnwindCode::SetFramePointer {
+ reg: RU::rbp.into(),
+ }
+ ),
+ (27, UnwindCode::StackAlloc { size: 1000000 })
+ ],
+ epilogues_unwind_codes: vec![vec![
+ (34, UnwindCode::StackDealloc { size: 1000000 }),
+ (
+ 36,
+ UnwindCode::RestoreRegister {
+ reg: RU::rbp.into()
+ }
+ ),
+ (36, UnwindCode::StackDealloc { size: 8 }),
+ (36, UnwindCode::RestoreFramePointer)
+ ]],
+ function_size: 37,
+ word_size: 8,
+ initial_sp_offset: 8,
+ }
+ );
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_multi_return_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ prologue_size: 5,
+ prologue_unwind_codes: vec![
+ (2, UnwindCode::StackAlloc { size: 8 }),
+ (
+ 2,
+ UnwindCode::SaveRegister {
+ reg: RU::rbp.into(),
+ stack_offset: 0,
+ }
+ ),
+ (
+ 5,
+ UnwindCode::SetFramePointer {
+ reg: RU::rbp.into()
+ }
+ )
+ ],
+ epilogues_unwind_codes: vec![
+ vec![
+ (12, UnwindCode::RememberState),
+ (
+ 12,
+ UnwindCode::RestoreRegister {
+ reg: RU::rbp.into()
+ }
+ ),
+ (12, UnwindCode::StackDealloc { size: 8 }),
+ (12, UnwindCode::RestoreFramePointer),
+ (13, UnwindCode::RestoreState)
+ ],
+ vec![
+ (
+ 15,
+ UnwindCode::RestoreRegister {
+ reg: RU::rbp.into()
+ }
+ ),
+ (15, UnwindCode::StackDealloc { size: 8 }),
+ (15, UnwindCode::RestoreFramePointer)
+ ]
+ ],
+ function_size: 16,
+ word_size: 8,
+ initial_sp_offset: 8,
+ }
+ );
+ }
+
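+    // Builds a function that conditionally branches to one of two returning blocks,
+    // giving the compiled code two epilogues and therefore two epilogue unwind-code
+    // sequences in the expected `UnwindInfo` above.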
+ fn create_multi_return_function(call_conv: CallConv) -> Function {
+ let mut sig = Signature::new(call_conv);
+ sig.params.push(AbiParam::new(types::I32));
+ let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+ let block0 = func.dfg.make_block();
+ let v0 = func.dfg.append_block_param(block0, types::I32);
+ let block1 = func.dfg.make_block();
+ let block2 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().brnz(v0, block2, &[]);
+ pos.ins().jump(block1, &[]);
+
+ pos.insert_block(block1);
+ pos.ins().return_(&[]);
+
+ pos.insert_block(block2);
+ pos.ins().return_(&[]);
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs
new file mode 100644
index 0000000000..f6333f5afb
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs
@@ -0,0 +1,234 @@
+//! Unwind information for System V ABI (x86-64).
+
+use crate::ir::Function;
+use crate::isa::{
+ unwind::systemv::{RegisterMappingError, UnwindInfo},
+ CallConv, RegUnit, TargetIsa,
+};
+use crate::result::CodegenResult;
+use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64};
+
+/// Creates a new x86-64 common information entry (CIE).
+pub fn create_cie() -> CommonInformationEntry {
+ use gimli::write::CallFrameInstruction;
+
+ let mut entry = CommonInformationEntry::new(
+ Encoding {
+ address_size: 8,
+ format: Format::Dwarf32,
+ version: 1,
+ },
+ 1, // Code alignment factor
+ -8, // Data alignment factor
+ X86_64::RA,
+ );
+
+ // Every frame will start with the call frame address (CFA) at RSP+8
+ // It is +8 to account for the push of the return address by the call instruction
+ entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8));
+
+ // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP)
+ entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8));
+
+ entry
+}
+
+/// Map Cranelift registers to their corresponding Gimli registers.
+pub fn map_reg(isa: &dyn TargetIsa, reg: RegUnit) -> Result<Register, RegisterMappingError> {
+ if isa.name() != "x86" || isa.pointer_bits() != 64 {
+ return Err(RegisterMappingError::UnsupportedArchitecture);
+ }
+
+ // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow
+ const X86_GP_REG_MAP: [gimli::Register; 16] = [
+ X86_64::RAX,
+ X86_64::RCX,
+ X86_64::RDX,
+ X86_64::RBX,
+ X86_64::RSP,
+ X86_64::RBP,
+ X86_64::RSI,
+ X86_64::RDI,
+ X86_64::R8,
+ X86_64::R9,
+ X86_64::R10,
+ X86_64::R11,
+ X86_64::R12,
+ X86_64::R13,
+ X86_64::R14,
+ X86_64::R15,
+ ];
+ const X86_XMM_REG_MAP: [gimli::Register; 16] = [
+ X86_64::XMM0,
+ X86_64::XMM1,
+ X86_64::XMM2,
+ X86_64::XMM3,
+ X86_64::XMM4,
+ X86_64::XMM5,
+ X86_64::XMM6,
+ X86_64::XMM7,
+ X86_64::XMM8,
+ X86_64::XMM9,
+ X86_64::XMM10,
+ X86_64::XMM11,
+ X86_64::XMM12,
+ X86_64::XMM13,
+ X86_64::XMM14,
+ X86_64::XMM15,
+ ];
+
+ let reg_info = isa.register_info();
+ let bank = reg_info
+ .bank_containing_regunit(reg)
+ .ok_or_else(|| RegisterMappingError::MissingBank)?;
+ match bank.name {
+ "IntRegs" => {
+            // The DWARF register numbering for x86-64 does not follow the hardware
+            // encoding order of the GP registers (e.g. DWARF 1 is RDX and 2 is RCX),
+            // so we map through a lookup table.
+ Ok(X86_GP_REG_MAP[(reg - bank.first_unit) as usize])
+ }
+ "FloatRegs" => Ok(X86_XMM_REG_MAP[(reg - bank.first_unit) as usize]),
+ _ => Err(RegisterMappingError::UnsupportedRegisterBank(bank.name)),
+ }
+}
+
+pub(crate) fn create_unwind_info(
+ func: &Function,
+ isa: &dyn TargetIsa,
+) -> CodegenResult<Option<UnwindInfo>> {
+ // Only System V-like calling conventions are supported
+ match func.signature.call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => {}
+ _ => return Ok(None),
+ }
+
+ if func.prologue_end.is_none() || isa.name() != "x86" || isa.pointer_bits() != 64 {
+ return Ok(None);
+ }
+
+ let unwind = match super::create_unwind_info(func, isa)? {
+ Some(u) => u,
+ None => {
+ return Ok(None);
+ }
+ };
+
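+    // Adapter from Cranelift `RegUnit`s to DWARF register numbers for
+    // `UnwindInfo::build`: `map` goes through `map_reg` above and `sp` reports
+    // the DWARF number of RSP.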
+ struct RegisterMapper<'a, 'b>(&'a (dyn TargetIsa + 'b));
+ impl<'a, 'b> crate::isa::unwind::systemv::RegisterMapper<RegUnit> for RegisterMapper<'a, 'b> {
+ fn map(&self, reg: RegUnit) -> Result<u16, RegisterMappingError> {
+ Ok(map_reg(self.0, reg)?.0)
+ }
+ fn sp(&self) -> u16 {
+ X86_64::RSP.0
+ }
+ }
+ let map = RegisterMapper(isa);
+
+ Ok(Some(UnwindInfo::build(unwind, &map)?))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{
+ types, AbiParam, ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind,
+ };
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use gimli::write::Address;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_simple_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::SystemV,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match isa
+ .create_unwind_info(&context.func)
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(1234))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 16, lsda: None, instructions: [(2, CfaOffset(16)), (2, Offset(Register(6), -16)), (5, CfaRegister(Register(6))), (15, SameValue(Register(6))), (15, Cfa(Register(7), 8))] }");
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_multi_return_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match isa
+ .create_unwind_info(&context.func)
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(4321))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 16, lsda: None, instructions: [(2, CfaOffset(16)), (2, Offset(Register(6), -16)), (5, CfaRegister(Register(6))), (12, RememberState), (12, SameValue(Register(6))), (12, Cfa(Register(7), 8)), (13, RestoreState), (15, SameValue(Register(6))), (15, Cfa(Register(7), 8))] }");
+ }
+
+ fn create_multi_return_function(call_conv: CallConv) -> Function {
+ let mut sig = Signature::new(call_conv);
+ sig.params.push(AbiParam::new(types::I32));
+ let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+ let block0 = func.dfg.make_block();
+ let v0 = func.dfg.append_block_param(block0, types::I32);
+ let block1 = func.dfg.make_block();
+ let block2 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().brnz(v0, block2, &[]);
+ pos.ins().jump(block1, &[]);
+
+ pos.insert_block(block1);
+ pos.ins().return_(&[]);
+
+ pos.insert_block(block2);
+ pos.ins().return_(&[]);
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs
new file mode 100644
index 0000000000..ed046f9a87
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs
@@ -0,0 +1,268 @@
+//! Unwind information for Windows x64 ABI.
+
+use crate::ir::Function;
+use crate::isa::x86::registers::{FPR, GPR};
+use crate::isa::{unwind::winx64::UnwindInfo, CallConv, RegUnit, TargetIsa};
+use crate::result::CodegenResult;
+
+pub(crate) fn create_unwind_info(
+ func: &Function,
+ isa: &dyn TargetIsa,
+) -> CodegenResult<Option<UnwindInfo>> {
+ // Only Windows fastcall is supported for unwind information
+ if func.signature.call_conv != CallConv::WindowsFastcall || func.prologue_end.is_none() {
+ return Ok(None);
+ }
+
+ let unwind = match super::create_unwind_info(func, isa)? {
+ Some(u) => u,
+ None => {
+ return Ok(None);
+ }
+ };
+
+ Ok(Some(UnwindInfo::build::<RegisterMapper>(unwind)?))
+}
+
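+// Maps Cranelift register units onto the register numbering used by Windows x64
+// unwind codes: general-purpose registers by their index within the GPR bank and
+// XMM registers by their raw register unit number.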
+struct RegisterMapper;
+
+impl crate::isa::unwind::winx64::RegisterMapper for RegisterMapper {
+ fn map(reg: RegUnit) -> crate::isa::unwind::winx64::MappedRegister {
+ use crate::isa::unwind::winx64::MappedRegister;
+ if GPR.contains(reg) {
+ MappedRegister::Int(GPR.index_of(reg) as u8)
+ } else if FPR.contains(reg) {
+ MappedRegister::Xmm(reg as u8)
+ } else {
+ panic!()
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind};
+ use crate::isa::unwind::winx64::UnwindCode;
+ use crate::isa::x86::registers::RU;
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ fn test_wrong_calling_convention() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(CallConv::SystemV, None));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ assert_eq!(
+ create_unwind_info(&context.func, &*isa).expect("can create unwind info"),
+ None
+ );
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_small_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ flags: 0,
+ prologue_size: 9,
+ frame_register: None,
+ frame_register_offset: 0,
+ unwind_codes: vec![
+ UnwindCode::PushRegister {
+ offset: 2,
+ reg: GPR.index_of(RU::rbp.into()) as u8
+ },
+ UnwindCode::StackAlloc {
+ offset: 9,
+ size: 64
+ }
+ ]
+ }
+ );
+
+ assert_eq!(unwind.emit_size(), 8);
+
+ let mut buf = [0u8; 8];
+ unwind.emit(&mut buf);
+
+ assert_eq!(
+ buf,
+ [
+ 0x01, // Version and flags (version 1, no flags)
+ 0x09, // Prologue size
+ 0x02, // Unwind code count (1 for stack alloc, 1 for push reg)
+ 0x00, // Frame register + offset (no frame register)
+ 0x09, // Prolog offset
+                0x72, // Operation 2 (small stack alloc), info = 0x7 (size = (0x7 * 8) + 8 = 64 bytes)
+ 0x02, // Prolog offset
+                0x50, // Operation 0 (push nonvolatile register), reg = 5 (RBP)
+ ]
+ );
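+
+        // Illustrative decoding sketch of the UWOP_ALLOC_SMALL byte asserted above:
+        // the low nibble is the operation code and the high nibble is the "info"
+        // field, with the allocation size being info * 8 + 8 bytes.
+        let alloc_small = 0x72u8;
+        assert_eq!(alloc_small & 0xF, 2); // operation 2: small stack alloc
+        assert_eq!(u32::from(alloc_small >> 4) * 8 + 8, 64); // 7 * 8 + 8 = 64 bytes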
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_medium_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 10000)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ flags: 0,
+ prologue_size: 27,
+ frame_register: None,
+ frame_register_offset: 0,
+ unwind_codes: vec![
+ UnwindCode::PushRegister {
+ offset: 2,
+ reg: GPR.index_of(RU::rbp.into()) as u8
+ },
+ UnwindCode::StackAlloc {
+ offset: 27,
+ size: 10000
+ }
+ ]
+ }
+ );
+
+ assert_eq!(unwind.emit_size(), 12);
+
+ let mut buf = [0u8; 12];
+ unwind.emit(&mut buf);
+
+ assert_eq!(
+ buf,
+ [
+ 0x01, // Version and flags (version 1, no flags)
+ 0x1B, // Prologue size
+ 0x03, // Unwind code count (2 for stack alloc, 1 for push reg)
+ 0x00, // Frame register + offset (no frame register)
+ 0x1B, // Prolog offset
+                0x01, // Operation 1 (large stack alloc), size is a scaled 16-bit slot count (info = 0)
+ 0xE2, // Low size byte
+ 0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
+ 0x02, // Prolog offset
+ 0x50, // Operation 0 (push nonvolatile register), reg = 5 (RBP)
+ 0x00, // Padding
+ 0x00, // Padding
+ ]
+ );
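+
+        // Illustrative decoding sketch of the scaled 16-bit UWOP_ALLOC_LARGE form
+        // asserted above (info = 0): the two size bytes are little-endian and count
+        // 8-byte slots.
+        let slots = u16::from_le_bytes([0xE2, 0x04]);
+        assert_eq!(u32::from(slots) * 8, 10000); // 0x04E2 slots * 8 = 10000 bytes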
+ }
+
+ #[test]
+ #[cfg_attr(feature = "x64", should_panic)] // TODO #2079
+ fn test_large_alloc() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::WindowsFastcall,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 1000000)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let unwind = create_unwind_info(&context.func, &*isa)
+ .expect("can create unwind info")
+ .expect("expected unwind info");
+
+ assert_eq!(
+ unwind,
+ UnwindInfo {
+ flags: 0,
+ prologue_size: 27,
+ frame_register: None,
+ frame_register_offset: 0,
+ unwind_codes: vec![
+ UnwindCode::PushRegister {
+ offset: 2,
+ reg: GPR.index_of(RU::rbp.into()) as u8
+ },
+ UnwindCode::StackAlloc {
+ offset: 27,
+ size: 1000000
+ }
+ ]
+ }
+ );
+
+ assert_eq!(unwind.emit_size(), 12);
+
+ let mut buf = [0u8; 12];
+ unwind.emit(&mut buf);
+
+ assert_eq!(
+ buf,
+ [
+ 0x01, // Version and flags (version 1, no flags)
+ 0x1B, // Prologue size
+ 0x04, // Unwind code count (3 for stack alloc, 1 for push reg)
+ 0x00, // Frame register + offset (no frame register)
+ 0x1B, // Prolog offset
+                0x11, // Operation 1 (large stack alloc), size is an unscaled 32-bit byte count (info = 1)
+ 0x40, // Byte 1 of size
+ 0x42, // Byte 2 of size
+ 0x0F, // Byte 3 of size
+ 0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
+ 0x02, // Prolog offset
+ 0x50, // Operation 0 (push nonvolatile register), reg = 5 (RBP)
+ ]
+ );
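+
+        // Illustrative decoding sketch of the unscaled 32-bit UWOP_ALLOC_LARGE form
+        // asserted above (info = 1): the four size bytes are a little-endian byte
+        // count.
+        let size = u32::from_le_bytes([0x40, 0x42, 0x0F, 0x00]);
+        assert_eq!(size, 1_000_000); // 0x000F_4240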
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+}