path: root/third_party/rust/cranelift-codegen/src/isa/x64
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit    2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen/src/isa/x64
parent    Initial commit. (diff)
Adding upstream version 86.0.1. (refs: upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen/src/isa/x64')
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/abi.rs                 |  794
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs           | 1215
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs           | 2819
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs     | 3593
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs            | 2733
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs           |  289
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs         |  125
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs |  204
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/lower.rs               | 3771
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/mod.rs                 |  149
-rw-r--r--  third_party/rust/cranelift-codegen/src/isa/x64/settings.rs            |    9
11 files changed, 15701 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs
new file mode 100644
index 0000000000..f4c7624f36
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs
@@ -0,0 +1,794 @@
+//! Implementation of the standard x64 ABI.
+
+use crate::ir::types::*;
+use crate::ir::{self, types, MemFlags, TrapCode, Type};
+use crate::isa;
+use crate::isa::{x64::inst::*, CallConv};
+use crate::machinst::abi_impl::*;
+use crate::machinst::*;
+use crate::settings;
+use crate::{CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use args::*;
+use regalloc::{RealReg, Reg, RegClass, Set, Writable};
+use smallvec::{smallvec, SmallVec};
+use std::convert::TryFrom;
+
+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
+/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0;
+/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention.
+static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8;
+
+/// Try to fill a Baldrdash register, returning it if it was found.
+fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option<ABIArg> {
+ if call_conv.extends_baldrdash() {
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext => {
+ // This is SpiderMonkey's `WasmTlsReg`.
+ Some(ABIArg::Reg(
+ regs::r14().to_real_reg(),
+ types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::SignatureId => {
+ // This is SpiderMonkey's `WasmTableCallSigReg`.
+ Some(ABIArg::Reg(
+ regs::r10().to_real_reg(),
+ types::I64,
+ param.extension,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CalleeTLS => {
+ // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLEE_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ &ir::ArgumentPurpose::CallerTLS => {
+ // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020.
+ assert!(call_conv == isa::CallConv::Baldrdash2020);
+ Some(ABIArg::Stack(
+ BALDRDASH_CALLER_TLS_OFFSET,
+ ir::types::I64,
+ ir::ArgumentExtension::None,
+ param.purpose,
+ ))
+ }
+ _ => None,
+ }
+ } else {
+ None
+ }
+}
+
+/// Support for the x64 ABI from the callee side (within a function body).
+pub(crate) type X64ABICallee = ABICalleeImpl<X64ABIMachineSpec>;
+
+/// Support for the x64 ABI from the caller side (at a callsite).
+pub(crate) type X64ABICaller = ABICallerImpl<X64ABIMachineSpec>;
+
+/// Implementation of ABI primitives for x64.
+pub(crate) struct X64ABIMachineSpec;
+
+impl ABIMachineSpec for X64ABIMachineSpec {
+ type I = Inst;
+
+ fn word_bits() -> u32 {
+ 64
+ }
+
+ /// Return required stack alignment in bytes.
+ fn stack_align(_call_conv: isa::CallConv) -> u32 {
+ 16
+ }
+
+ fn compute_arg_locs(
+ call_conv: isa::CallConv,
+ params: &[ir::AbiParam],
+ args_or_rets: ArgsOrRets,
+ add_ret_area_ptr: bool,
+ ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+ let is_baldrdash = call_conv.extends_baldrdash();
+ let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
+
+ let mut next_gpr = 0;
+ let mut next_vreg = 0;
+ let mut next_stack: u64 = 0;
+ let mut ret = vec![];
+
+ if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls {
+ // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and
+ // caller TLS-register values, respectively.
+ next_stack = 16;
+ }
+
+ for i in 0..params.len() {
+ // Process returns backward, according to the SpiderMonkey ABI (which we
+ // adopt internally if `is_baldrdash` is set).
+ let param = match (args_or_rets, is_baldrdash) {
+ (ArgsOrRets::Args, _) => &params[i],
+ (ArgsOrRets::Rets, false) => &params[i],
+ (ArgsOrRets::Rets, true) => &params[params.len() - 1 - i],
+ };
+
+ // Validate "purpose".
+ match &param.purpose {
+ &ir::ArgumentPurpose::VMContext
+ | &ir::ArgumentPurpose::Normal
+ | &ir::ArgumentPurpose::StackLimit
+ | &ir::ArgumentPurpose::SignatureId
+ | &ir::ArgumentPurpose::CalleeTLS
+ | &ir::ArgumentPurpose::CallerTLS => {}
+ _ => panic!(
+ "Unsupported argument purpose {:?} in signature: {:?}",
+ param.purpose, params
+ ),
+ }
+
+ let intreg = in_int_reg(param.value_type);
+ let vecreg = in_vec_reg(param.value_type);
+ debug_assert!(intreg || vecreg);
+ debug_assert!(!(intreg && vecreg));
+
+ let (next_reg, candidate) = if intreg {
+ let candidate = match args_or_rets {
+ ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr),
+ ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i),
+ };
+ debug_assert!(candidate
+ .map(|r| r.get_class() == RegClass::I64)
+ .unwrap_or(true));
+ (&mut next_gpr, candidate)
+ } else {
+ let candidate = match args_or_rets {
+ ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg),
+ ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i),
+ };
+ debug_assert!(candidate
+ .map(|r| r.get_class() == RegClass::V128)
+ .unwrap_or(true));
+ (&mut next_vreg, candidate)
+ };
+
+ if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
+ assert!(intreg);
+ ret.push(param);
+ } else if let Some(reg) = candidate {
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ *next_reg += 1;
+ } else {
+ // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
+ // stack alignment happens separately after all args.)
+ let size = (param.value_type.bits() / 8) as u64;
+ let size = std::cmp::max(size, 8);
+ // Align `next_stack` up to a multiple of `size` (which is a power of two).
+ debug_assert!(size.is_power_of_two());
+ next_stack = (next_stack + size - 1) & !(size - 1);
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ param.value_type,
+ param.extension,
+ param.purpose,
+ ));
+ next_stack += size;
+ }
+ }
+
+ if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
+ ret.reverse();
+ }
+
+ let extra_arg = if add_ret_area_ptr {
+ debug_assert!(args_or_rets == ArgsOrRets::Args);
+ if let Some(reg) = get_intreg_for_arg_systemv(&call_conv, next_gpr) {
+ ret.push(ABIArg::Reg(
+ reg.to_real_reg(),
+ types::I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ } else {
+ ret.push(ABIArg::Stack(
+ next_stack as i64,
+ types::I64,
+ ir::ArgumentExtension::None,
+ ir::ArgumentPurpose::Normal,
+ ));
+ next_stack += 8;
+ }
+ Some(ret.len() - 1)
+ } else {
+ None
+ };
+
+ next_stack = (next_stack + 15) & !15;
+
+ // To avoid overflow issues, limit the arg/return size to something reasonable.
+ if next_stack > STACK_ARG_RET_SIZE_LIMIT {
+ return Err(CodegenError::ImplLimitExceeded);
+ }
+
+ Ok((ret, next_stack as i64, extra_arg))
+ }
+
+ fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 {
+ if call_conv.extends_baldrdash() {
+ let num_words = flags.baldrdash_prologue_words() as i64;
+ debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words");
+ num_words * 8
+ } else {
+ 16 // frame pointer + return address.
+ }
+ }
+
+ fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I {
+ let ext_kind = match ty {
+ types::B1
+ | types::B8
+ | types::I8
+ | types::B16
+ | types::I16
+ | types::B32
+ | types::I32 => ExtKind::SignExtend,
+ types::B64 | types::I64 | types::R64 | types::F32 | types::F64 => ExtKind::None,
+ _ if ty.bytes() == 16 => ExtKind::None,
+ _ => panic!("load_stack({})", ty),
+ };
+ Inst::load(ty, mem, into_reg, ext_kind)
+ }
+
+ fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I {
+ Inst::store(ty, from_reg, mem)
+ }
+
+ fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I {
+ Inst::gen_move(to_reg, from_reg, ty)
+ }
+
+ /// Generate an integer-extend operation.
+ fn gen_extend(
+ to_reg: Writable<Reg>,
+ from_reg: Reg,
+ is_signed: bool,
+ from_bits: u8,
+ to_bits: u8,
+ ) -> Self::I {
+ let ext_mode = ExtMode::new(from_bits as u16, to_bits as u16)
+ .expect(&format!("invalid extension: {} -> {}", from_bits, to_bits));
+ if is_signed {
+ Inst::movsx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg)
+ } else {
+ Inst::movzx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg)
+ }
+ }
+
+ fn gen_ret() -> Self::I {
+ Inst::ret()
+ }
+
+ fn gen_epilogue_placeholder() -> Self::I {
+ Inst::epilogue_placeholder()
+ }
+
+ fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]> {
+ let mut ret = SmallVec::new();
+ if from_reg != into_reg.to_reg() {
+ ret.push(Inst::gen_move(into_reg, from_reg, I64));
+ }
+ ret.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(imm),
+ into_reg,
+ ));
+ ret
+ }
+
+ fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]> {
+ smallvec![
+ Inst::cmp_rmi_r(/* bytes = */ 8, RegMemImm::reg(regs::rsp()), limit_reg),
+ Inst::TrapIf {
+ // NBE == "> unsigned"; args above are reversed; this tests limit_reg > rsp.
+ cc: CC::NBE,
+ trap_code: TrapCode::StackOverflow,
+ },
+ ]
+ }
+
+ fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Self::I {
+ let mem: SyntheticAmode = mem.into();
+ Inst::lea(mem, into_reg)
+ }
+
+ fn get_stacklimit_reg() -> Reg {
+ debug_assert!(
+ !is_callee_save_systemv(regs::r10().to_real_reg())
+ && !is_callee_save_baldrdash(regs::r10().to_real_reg())
+ );
+
+ // As per comment on trait definition, we must return a caller-save
+ // register here.
+ regs::r10()
+ }
+
+ fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I {
+ // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed.
+ assert_eq!(ty, I64);
+ let simm32 = offset as u32;
+ let mem = Amode::imm_reg(simm32, base);
+ Inst::load(ty, mem, into_reg, ExtKind::None)
+ }
+
+ fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I {
+ let simm32 = offset as u32;
+ let mem = Amode::imm_reg(simm32, base);
+ Inst::store(ty, from_reg, mem)
+ }
+
+ fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]> {
+ let (alu_op, amount) = if amount >= 0 {
+ (AluRmiROpcode::Add, amount)
+ } else {
+ (AluRmiROpcode::Sub, -amount)
+ };
+
+ let amount = amount as u32;
+
+ smallvec![Inst::alu_rmi_r(
+ true,
+ alu_op,
+ RegMemImm::imm(amount),
+ Writable::from_reg(regs::rsp()),
+ )]
+ }
+
+ fn gen_nominal_sp_adj(offset: i32) -> Self::I {
+ Inst::VirtualSPOffsetAdj {
+ offset: offset as i64,
+ }
+ }
+
+ fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]> {
+ let r_rsp = regs::rsp();
+ let r_rbp = regs::rbp();
+ let w_rbp = Writable::from_reg(r_rbp);
+ let mut insts = SmallVec::new();
+ // RSP before the call will be 0 % 16. So here, it is 8 % 16.
+ insts.push(Inst::push64(RegMemImm::reg(r_rbp)));
+ // RSP is now 0 % 16
+ insts.push(Inst::mov_r_r(true, r_rsp, w_rbp));
+ insts
+ }
+
+ fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]> {
+ let mut insts = SmallVec::new();
+ insts.push(Inst::mov_r_r(
+ true,
+ regs::rbp(),
+ Writable::from_reg(regs::rsp()),
+ ));
+ insts.push(Inst::pop64(Writable::from_reg(regs::rbp())));
+ insts
+ }
+
+ fn gen_clobber_save(
+ call_conv: isa::CallConv,
+ _: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> (u64, SmallVec<[Self::I; 16]>) {
+ let mut insts = SmallVec::new();
+ // Find all clobbered registers that are callee-save. These are only I64
+ // registers (all XMM registers are caller-save) so we can compute the
+ // total size of the needed stack space easily.
+ let clobbered = get_callee_saves(&call_conv, clobbers);
+ let clobbered_size = 8 * clobbered.len() as u32;
+ let stack_size = clobbered_size + fixed_frame_storage_size;
+ // Align to 16 bytes.
+ let stack_size = (stack_size + 15) & !15;
+ // Adjust the stack pointer downward with one `sub rsp, IMM`
+ // instruction.
+ if stack_size > 0 {
+ insts.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Sub,
+ RegMemImm::imm(stack_size),
+ Writable::from_reg(regs::rsp()),
+ ));
+ }
+ // Store each clobbered register in order at offsets from RSP.
+ let mut cur_offset = 0;
+ for reg in &clobbered {
+ let r_reg = reg.to_reg();
+ match r_reg.get_class() {
+ RegClass::I64 => {
+ insts.push(Inst::mov_r_m(
+ /* bytes = */ 8,
+ r_reg.to_reg(),
+ Amode::imm_reg(cur_offset, regs::rsp()),
+ ));
+ cur_offset += 8;
+ }
+ // No XMM regs are callee-save, so we do not need to implement
+ // this.
+ _ => unimplemented!(),
+ }
+ }
+
+ (clobbered_size as u64, insts)
+ }
+
+ fn gen_clobber_restore(
+ call_conv: isa::CallConv,
+ flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ _fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> SmallVec<[Self::I; 16]> {
+ let mut insts = SmallVec::new();
+
+ let clobbered = get_callee_saves(&call_conv, clobbers);
+ let stack_size = 8 * clobbered.len() as u32;
+ let stack_size = (stack_size + 15) & !15;
+
+ // Restore regs by loading from offsets of RSP.
+ let mut cur_offset = 0;
+ for reg in &clobbered {
+ let rreg = reg.to_reg();
+ match rreg.get_class() {
+ RegClass::I64 => {
+ insts.push(Inst::mov64_m_r(
+ Amode::imm_reg(cur_offset, regs::rsp()),
+ Writable::from_reg(rreg.to_reg()),
+ ));
+ cur_offset += 8;
+ }
+ _ => unimplemented!(),
+ }
+ }
+ // Adjust RSP back upward.
+ if stack_size > 0 {
+ insts.push(Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(stack_size),
+ Writable::from_reg(regs::rsp()),
+ ));
+ }
+
+ // If this is Baldrdash-2020, restore the callee (i.e., our) TLS
+ // register. We may have allocated it for something else and clobbered
+ // it, but the ABI expects us to leave the TLS register unchanged.
+ if call_conv == isa::CallConv::Baldrdash2020 {
+ let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags);
+ insts.push(Inst::mov64_m_r(
+ Amode::imm_reg(off as u32, regs::rbp()),
+ Writable::from_reg(regs::r14()),
+ ));
+ }
+
+ insts
+ }
+
+ /// Generate a call instruction/sequence.
+ fn gen_call(
+ dest: &CallDest,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: ir::Opcode,
+ tmp: Writable<Reg>,
+ _callee_conv: isa::CallConv,
+ _caller_conv: isa::CallConv,
+ ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]> {
+ let mut insts = SmallVec::new();
+ match dest {
+ &CallDest::ExtName(ref name, RelocDistance::Near) => {
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_known(name.clone(), uses, defs, opcode),
+ ));
+ }
+ &CallDest::ExtName(ref name, RelocDistance::Far) => {
+ insts.push((
+ InstIsSafepoint::No,
+ Inst::LoadExtName {
+ dst: tmp,
+ name: Box::new(name.clone()),
+ offset: 0,
+ },
+ ));
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_unknown(RegMem::reg(tmp.to_reg()), uses, defs, opcode),
+ ));
+ }
+ &CallDest::Reg(reg) => {
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::call_unknown(RegMem::reg(reg), uses, defs, opcode),
+ ));
+ }
+ }
+ insts
+ }
+
+ fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 {
+ // We allocate in terms of 8-byte slots.
+ match (rc, ty) {
+ (RegClass::I64, _) => 1,
+ (RegClass::V128, types::F32) | (RegClass::V128, types::F64) => 1,
+ (RegClass::V128, _) => 2,
+ _ => panic!("Unexpected register class!"),
+ }
+ }
+
+ fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64 {
+ s.virtual_sp_offset
+ }
+
+ fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64 {
+ s.nominal_sp_to_fp
+ }
+
+ fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> {
+ let mut caller_saved = vec![
+ // Systemv calling convention:
+ // - GPR: all except RBX, RBP, R12 to R15 (which are callee-saved).
+ Writable::from_reg(regs::rsi()),
+ Writable::from_reg(regs::rdi()),
+ Writable::from_reg(regs::rax()),
+ Writable::from_reg(regs::rcx()),
+ Writable::from_reg(regs::rdx()),
+ Writable::from_reg(regs::r8()),
+ Writable::from_reg(regs::r9()),
+ Writable::from_reg(regs::r10()),
+ Writable::from_reg(regs::r11()),
+ // - XMM: all the registers!
+ Writable::from_reg(regs::xmm0()),
+ Writable::from_reg(regs::xmm1()),
+ Writable::from_reg(regs::xmm2()),
+ Writable::from_reg(regs::xmm3()),
+ Writable::from_reg(regs::xmm4()),
+ Writable::from_reg(regs::xmm5()),
+ Writable::from_reg(regs::xmm6()),
+ Writable::from_reg(regs::xmm7()),
+ Writable::from_reg(regs::xmm8()),
+ Writable::from_reg(regs::xmm9()),
+ Writable::from_reg(regs::xmm10()),
+ Writable::from_reg(regs::xmm11()),
+ Writable::from_reg(regs::xmm12()),
+ Writable::from_reg(regs::xmm13()),
+ Writable::from_reg(regs::xmm14()),
+ Writable::from_reg(regs::xmm15()),
+ ];
+
+ if call_conv_of_callee.extends_baldrdash() {
+ caller_saved.push(Writable::from_reg(regs::r12()));
+ caller_saved.push(Writable::from_reg(regs::r13()));
+ // Not r14; implicitly preserved in the entry.
+ caller_saved.push(Writable::from_reg(regs::r15()));
+ caller_saved.push(Writable::from_reg(regs::rbx()));
+ }
+
+ caller_saved
+ }
+}
+
+impl From<StackAMode> for SyntheticAmode {
+ fn from(amode: StackAMode) -> Self {
+ // We enforce a 128 MB stack-frame size limit above, so these
+ // `expect()`s should never fail.
+ match amode {
+ StackAMode::FPOffset(off, _ty) => {
+ let off = i32::try_from(off)
+ .expect("Offset in FPOffset is greater than 2GB; should hit impl limit first");
+ let simm32 = off as u32;
+ SyntheticAmode::Real(Amode::ImmReg {
+ simm32,
+ base: regs::rbp(),
+ flags: MemFlags::trusted(),
+ })
+ }
+ StackAMode::NominalSPOffset(off, _ty) => {
+ let off = i32::try_from(off).expect(
+ "Offset in NominalSPOffset is greater than 2GB; should hit impl limit first",
+ );
+ let simm32 = off as u32;
+ SyntheticAmode::nominal_sp_offset(simm32)
+ }
+ StackAMode::SPOffset(off, _ty) => {
+ let off = i32::try_from(off)
+ .expect("Offset in SPOffset is greater than 2GB; should hit impl limit first");
+ let simm32 = off as u32;
+ SyntheticAmode::Real(Amode::ImmReg {
+ simm32,
+ base: regs::rsp(),
+ flags: MemFlags::trusted(),
+ })
+ }
+ }
+ }
+}
+
+fn in_int_reg(ty: types::Type) -> bool {
+ match ty {
+ types::I8
+ | types::I16
+ | types::I32
+ | types::I64
+ | types::B1
+ | types::B8
+ | types::B16
+ | types::B32
+ | types::B64
+ | types::R64 => true,
+ types::R32 => panic!("unexpected 32-bit refs on x64!"),
+ _ => false,
+ }
+}
+
+fn in_vec_reg(ty: types::Type) -> bool {
+ match ty {
+ types::F32 | types::F64 => true,
+ _ if ty.is_vector() => true,
+ _ => false,
+ }
+}
+
+fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast
+ | CallConv::Cold
+ | CallConv::SystemV
+ | CallConv::BaldrdashSystemV
+ | CallConv::Baldrdash2020 => {}
+ _ => panic!("int args only supported for SysV calling convention"),
+ };
+ match idx {
+ 0 => Some(regs::rdi()),
+ 1 => Some(regs::rsi()),
+ 2 => Some(regs::rdx()),
+ 3 => Some(regs::rcx()),
+ 4 => Some(regs::r8()),
+ 5 => Some(regs::r9()),
+ _ => None,
+ }
+}
+
+fn get_fltreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast
+ | CallConv::Cold
+ | CallConv::SystemV
+ | CallConv::BaldrdashSystemV
+ | CallConv::Baldrdash2020 => {}
+ _ => panic!("float args only supported for SysV calling convention"),
+ };
+ match idx {
+ 0 => Some(regs::xmm0()),
+ 1 => Some(regs::xmm1()),
+ 2 => Some(regs::xmm2()),
+ 3 => Some(regs::xmm3()),
+ 4 => Some(regs::xmm4()),
+ 5 => Some(regs::xmm5()),
+ 6 => Some(regs::xmm6()),
+ 7 => Some(regs::xmm7()),
+ _ => None,
+ }
+}
+
+fn get_intreg_for_retval_systemv(
+ call_conv: &CallConv,
+ intreg_idx: usize,
+ retval_idx: usize,
+) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx {
+ 0 => Some(regs::rax()),
+ 1 => Some(regs::rdx()),
+ _ => None,
+ },
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+ if intreg_idx == 0 && retval_idx == 0 {
+ Some(regs::rax())
+ } else {
+ None
+ }
+ }
+ CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+ }
+}
+
+fn get_fltreg_for_retval_systemv(
+ call_conv: &CallConv,
+ fltreg_idx: usize,
+ retval_idx: usize,
+) -> Option<Reg> {
+ match call_conv {
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx {
+ 0 => Some(regs::xmm0()),
+ 1 => Some(regs::xmm1()),
+ _ => None,
+ },
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+ if fltreg_idx == 0 && retval_idx == 0 {
+ Some(regs::xmm0())
+ } else {
+ None
+ }
+ }
+ CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+ }
+}
+
+fn is_callee_save_systemv(r: RealReg) -> bool {
+ use regs::*;
+ match r.get_class() {
+ RegClass::I64 => match r.get_hw_encoding() as u8 {
+ ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true,
+ _ => false,
+ },
+ RegClass::V128 => false,
+ _ => unimplemented!(),
+ }
+}
+
+fn is_callee_save_baldrdash(r: RealReg) -> bool {
+ use regs::*;
+ match r.get_class() {
+ RegClass::I64 => {
+ if r.get_hw_encoding() as u8 == ENC_R14 {
+ // r14 is the WasmTlsReg and is preserved implicitly.
+ false
+ } else {
+ // Defer to native for the other ones.
+ is_callee_save_systemv(r)
+ }
+ }
+ RegClass::V128 => false,
+ _ => unimplemented!(),
+ }
+}
+
+fn get_callee_saves(call_conv: &CallConv, regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
+ let mut regs: Vec<Writable<RealReg>> = match call_conv {
+ CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => regs
+ .iter()
+ .cloned()
+ .filter(|r| is_callee_save_baldrdash(r.to_reg()))
+ .collect(),
+ CallConv::BaldrdashWindows => {
+ todo!("baldrdash windows");
+ }
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs
+ .iter()
+ .cloned()
+ .filter(|r| is_callee_save_systemv(r.to_reg()))
+ .collect(),
+ CallConv::WindowsFastcall => todo!("windows fastcall"),
+ CallConv::Probestack => todo!("probestack?"),
+ };
+ // Sort registers for deterministic code output. We can do an unstable sort because the
+ // registers will be unique (there are no dups).
+ regs.sort_unstable_by_key(|r| r.to_reg().get_index());
+ regs
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs
new file mode 100644
index 0000000000..6a8f65feb3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs
@@ -0,0 +1,1215 @@
+//! Instruction operand sub-components (aka "parts"): definitions and printing.
+
+use super::regs::{self, show_ireg_sized};
+use super::EmitState;
+use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::MemFlags;
+use crate::machinst::*;
+use regalloc::{
+ PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
+ RegUsageMapper, Writable,
+};
+use std::fmt;
+use std::string::String;
+
+/// A possible addressing mode (amode) that can be used in instructions.
+/// These denote a 64-bit value only.
+#[derive(Clone, Debug)]
+pub enum Amode {
+ /// sign-extend-32-to-64(Immediate) + Register.
+ ImmReg {
+ simm32: u32,
+ base: Reg,
+ flags: MemFlags,
+ },
+
+ /// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift)
+ ImmRegRegShift {
+ simm32: u32,
+ base: Reg,
+ index: Reg,
+ shift: u8, /* 0 .. 3 only */
+ flags: MemFlags,
+ },
+
+ /// sign-extend-32-to-64(Immediate) + RIP (instruction pointer).
+ /// To wit: not supported in 32-bit mode.
+ RipRelative { target: MachLabel },
+}
+
+impl Amode {
+ pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self {
+ debug_assert!(base.get_class() == RegClass::I64);
+ Self::ImmReg {
+ simm32,
+ base,
+ flags: MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self {
+ debug_assert!(base.get_class() == RegClass::I64);
+ debug_assert!(index.get_class() == RegClass::I64);
+ debug_assert!(shift <= 3);
+ Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ flags: MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn rip_relative(target: MachLabel) -> Self {
+ Self::RipRelative { target }
+ }
+
+ pub(crate) fn with_flags(&self, flags: MemFlags) -> Self {
+ match self {
+ &Self::ImmReg { simm32, base, .. } => Self::ImmReg {
+ simm32,
+ base,
+ flags,
+ },
+ &Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ ..
+ } => Self::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ flags,
+ },
+ _ => panic!("Amode {:?} cannot take memflags", self),
+ }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ Amode::ImmReg { base, .. } => {
+ collector.add_use(*base);
+ }
+ Amode::ImmRegRegShift { base, index, .. } => {
+ collector.add_use(*base);
+ collector.add_use(*index);
+ }
+ Amode::RipRelative { .. } => {
+ // RIP isn't involved in regalloc.
+ }
+ }
+ }
+
+ pub(crate) fn get_flags(&self) -> MemFlags {
+ match self {
+ Amode::ImmReg { flags, .. } => *flags,
+ Amode::ImmRegRegShift { flags, .. } => *flags,
+ Amode::RipRelative { .. } => MemFlags::trusted(),
+ }
+ }
+
+ pub(crate) fn can_trap(&self) -> bool {
+ !self.get_flags().notrap()
+ }
+}
+
+impl PrettyPrint for Amode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ Amode::ImmReg { simm32, base, .. } => {
+ format!("{}({})", *simm32 as i32, base.show_rru(mb_rru))
+ }
+ Amode::ImmRegRegShift {
+ simm32,
+ base,
+ index,
+ shift,
+ ..
+ } => format!(
+ "{}({},{},{})",
+ *simm32 as i32,
+ base.show_rru(mb_rru),
+ index.show_rru(mb_rru),
+ 1 << shift
+ ),
+ Amode::RipRelative { ref target } => format!("label{}(%rip)", target.get()),
+ }
+ }
+}
+
+/// A Memory Address. These denote a 64-bit value only.
+/// Used for usual addressing modes as well as addressing modes used during compilation, when the
+/// moving SP offset is not known.
+#[derive(Clone)]
+pub enum SyntheticAmode {
+ /// A real amode.
+ Real(Amode),
+
+ /// A (virtual) offset to the "nominal SP" value, which will be recomputed as we push and pop
+ /// within the function.
+ NominalSPOffset { simm32: u32 },
+}
+
+impl SyntheticAmode {
+ pub(crate) fn nominal_sp_offset(simm32: u32) -> Self {
+ SyntheticAmode::NominalSPOffset { simm32 }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ SyntheticAmode::Real(addr) => addr.get_regs_as_uses(collector),
+ SyntheticAmode::NominalSPOffset { .. } => {
+ // Nothing to do; the base is SP and isn't involved in regalloc.
+ }
+ }
+ }
+
+ pub(crate) fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ SyntheticAmode::Real(addr) => addr.map_uses(map),
+ SyntheticAmode::NominalSPOffset { .. } => {
+ // Nothing to do.
+ }
+ }
+ }
+
+ pub(crate) fn finalize(&self, state: &mut EmitState) -> Amode {
+ match self {
+ SyntheticAmode::Real(addr) => addr.clone(),
+ SyntheticAmode::NominalSPOffset { simm32 } => {
+ let off = *simm32 as i64 + state.virtual_sp_offset;
+ // TODO will require a sequence of add etc.
+ assert!(
+ off <= u32::max_value() as i64,
+ "amode finalize: add sequence NYI"
+ );
+ Amode::imm_reg(off as u32, regs::rsp())
+ }
+ }
+ }
+}
+
+impl Into<SyntheticAmode> for Amode {
+ fn into(self) -> SyntheticAmode {
+ SyntheticAmode::Real(self)
+ }
+}
+
+impl PrettyPrint for SyntheticAmode {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ match self {
+ SyntheticAmode::Real(addr) => addr.show_rru(mb_rru),
+ SyntheticAmode::NominalSPOffset { simm32 } => {
+ format!("rsp({} + virtual offset)", *simm32 as i32)
+ }
+ }
+ }
+}
+
+/// An operand which is either an integer Register, a value in Memory or an Immediate. This can
+/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only
+/// the lower 8 or 16 bits of `simm32` are relevant. In the 64-bit case, the value denoted by
+/// `simm32` is its sign-extension out to 64 bits.
+#[derive(Clone)]
+pub enum RegMemImm {
+ Reg { reg: Reg },
+ Mem { addr: SyntheticAmode },
+ Imm { simm32: u32 },
+}
+
+impl RegMemImm {
+ pub(crate) fn reg(reg: Reg) -> Self {
+ debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128);
+ Self::Reg { reg }
+ }
+ pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
+ Self::Mem { addr: addr.into() }
+ }
+ pub(crate) fn imm(simm32: u32) -> Self {
+ Self::Imm { simm32 }
+ }
+
+ /// Asserts that in register mode, the reg class is the one that's expected.
+ pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) {
+ if let Self::Reg { reg } = self {
+ debug_assert_eq!(reg.get_class(), expected_reg_class);
+ }
+ }
+
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ Self::Reg { reg } => collector.add_use(*reg),
+ Self::Mem { addr } => addr.get_regs_as_uses(collector),
+ Self::Imm { .. } => {}
+ }
+ }
+
+ pub(crate) fn to_reg(&self) -> Option<Reg> {
+ match self {
+ Self::Reg { reg } => Some(*reg),
+ _ => None,
+ }
+ }
+}
+
+impl PrettyPrint for RegMemImm {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.show_rru_sized(mb_rru, 8)
+ }
+}
+
+impl PrettyPrintSized for RegMemImm {
+ fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ match self {
+ Self::Reg { reg } => show_ireg_sized(*reg, mb_rru, size),
+ Self::Mem { addr } => addr.show_rru(mb_rru),
+ Self::Imm { simm32 } => format!("${}", *simm32 as i32),
+ }
+ }
+}
+
+/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16,
+/// 32, 64, or 128 bit value.
+#[derive(Clone)]
+pub enum RegMem {
+ Reg { reg: Reg },
+ Mem { addr: SyntheticAmode },
+}
+
+impl RegMem {
+ pub(crate) fn reg(reg: Reg) -> Self {
+ debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128);
+ Self::Reg { reg }
+ }
+ pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
+ Self::Mem { addr: addr.into() }
+ }
+ /// Asserts that in register mode, the reg class is the one that's expected.
+ pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) {
+ if let Self::Reg { reg } = self {
+ debug_assert_eq!(reg.get_class(), expected_reg_class);
+ }
+ }
+ /// Add the regs mentioned by `self` to `collector`.
+ pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
+ match self {
+ RegMem::Reg { reg } => collector.add_use(*reg),
+ RegMem::Mem { addr, .. } => addr.get_regs_as_uses(collector),
+ }
+ }
+ pub(crate) fn to_reg(&self) -> Option<Reg> {
+ match self {
+ RegMem::Reg { reg } => Some(*reg),
+ _ => None,
+ }
+ }
+}
+
+impl From<Writable<Reg>> for RegMem {
+ fn from(r: Writable<Reg>) -> Self {
+ RegMem::reg(r.to_reg())
+ }
+}
+
+impl PrettyPrint for RegMem {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ self.show_rru_sized(mb_rru, 8)
+ }
+}
+
+impl PrettyPrintSized for RegMem {
+ fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ match self {
+ RegMem::Reg { reg } => show_ireg_sized(*reg, mb_rru, size),
+ RegMem::Mem { addr, .. } => addr.show_rru(mb_rru),
+ }
+ }
+}
+
+/// Some basic ALU operations. TODO: maybe add Adc, Sbb.
+#[derive(Copy, Clone, PartialEq)]
+pub enum AluRmiROpcode {
+ Add,
+ Sub,
+ And,
+ Or,
+ Xor,
+ /// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
+ Mul,
+}
+
+impl fmt::Debug for AluRmiROpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ AluRmiROpcode::Add => "add",
+ AluRmiROpcode::Sub => "sub",
+ AluRmiROpcode::And => "and",
+ AluRmiROpcode::Or => "or",
+ AluRmiROpcode::Xor => "xor",
+ AluRmiROpcode::Mul => "imul",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for AluRmiROpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+#[derive(Clone, PartialEq)]
+pub enum UnaryRmROpcode {
+ /// Bit-scan reverse.
+ Bsr,
+ /// Bit-scan forward.
+ Bsf,
+}
+
+impl fmt::Debug for UnaryRmROpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ match self {
+ UnaryRmROpcode::Bsr => write!(fmt, "bsr"),
+ UnaryRmROpcode::Bsf => write!(fmt, "bsf"),
+ }
+ }
+}
+
+impl fmt::Display for UnaryRmROpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+pub(crate) enum InstructionSet {
+ SSE,
+ SSE2,
+ SSSE3,
+ SSE41,
+ SSE42,
+}
+
+/// Some SSE operations requiring 2 operands r/m and r.
+#[derive(Clone, Copy, PartialEq)]
+pub enum SseOpcode {
+ Addps,
+ Addpd,
+ Addss,
+ Addsd,
+ Andps,
+ Andpd,
+ Andnps,
+ Andnpd,
+ Comiss,
+ Comisd,
+ Cmpps,
+ Cmppd,
+ Cmpss,
+ Cmpsd,
+ Cvtdq2ps,
+ Cvtsd2ss,
+ Cvtsd2si,
+ Cvtsi2ss,
+ Cvtsi2sd,
+ Cvtss2si,
+ Cvtss2sd,
+ Cvttps2dq,
+ Cvttss2si,
+ Cvttsd2si,
+ Divps,
+ Divpd,
+ Divss,
+ Divsd,
+ Insertps,
+ Maxps,
+ Maxpd,
+ Maxss,
+ Maxsd,
+ Minps,
+ Minpd,
+ Minss,
+ Minsd,
+ Movaps,
+ Movapd,
+ Movd,
+ Movdqa,
+ Movdqu,
+ Movlhps,
+ Movmskps,
+ Movmskpd,
+ Movq,
+ Movss,
+ Movsd,
+ Movups,
+ Movupd,
+ Mulps,
+ Mulpd,
+ Mulss,
+ Mulsd,
+ Orps,
+ Orpd,
+ Pabsb,
+ Pabsw,
+ Pabsd,
+ Packsswb,
+ Paddb,
+ Paddd,
+ Paddq,
+ Paddw,
+ Paddsb,
+ Paddsw,
+ Paddusb,
+ Paddusw,
+ Pand,
+ Pandn,
+ Pavgb,
+ Pavgw,
+ Pcmpeqb,
+ Pcmpeqw,
+ Pcmpeqd,
+ Pcmpeqq,
+ Pcmpgtb,
+ Pcmpgtw,
+ Pcmpgtd,
+ Pcmpgtq,
+ Pextrb,
+ Pextrw,
+ Pextrd,
+ Pinsrb,
+ Pinsrw,
+ Pinsrd,
+ Pmaxsb,
+ Pmaxsw,
+ Pmaxsd,
+ Pmaxub,
+ Pmaxuw,
+ Pmaxud,
+ Pminsb,
+ Pminsw,
+ Pminsd,
+ Pminub,
+ Pminuw,
+ Pminud,
+ Pmovmskb,
+ Pmulld,
+ Pmullw,
+ Pmuludq,
+ Por,
+ Pshufb,
+ Pshufd,
+ Psllw,
+ Pslld,
+ Psllq,
+ Psraw,
+ Psrad,
+ Psrlw,
+ Psrld,
+ Psrlq,
+ Psubb,
+ Psubd,
+ Psubq,
+ Psubw,
+ Psubsb,
+ Psubsw,
+ Psubusb,
+ Psubusw,
+ Ptest,
+ Pxor,
+ Rcpss,
+ Roundss,
+ Roundsd,
+ Rsqrtss,
+ Sqrtps,
+ Sqrtpd,
+ Sqrtss,
+ Sqrtsd,
+ Subps,
+ Subpd,
+ Subss,
+ Subsd,
+ Ucomiss,
+ Ucomisd,
+ Xorps,
+ Xorpd,
+}
+
+impl SseOpcode {
+ /// Which `InstructionSet` is the first supporting this opcode?
+ pub(crate) fn available_from(&self) -> InstructionSet {
+ use InstructionSet::*;
+ match self {
+ SseOpcode::Addps
+ | SseOpcode::Addss
+ | SseOpcode::Andps
+ | SseOpcode::Andnps
+ | SseOpcode::Comiss
+ | SseOpcode::Cmpps
+ | SseOpcode::Cmpss
+ | SseOpcode::Cvtsi2ss
+ | SseOpcode::Cvtss2si
+ | SseOpcode::Cvttss2si
+ | SseOpcode::Divps
+ | SseOpcode::Divss
+ | SseOpcode::Maxps
+ | SseOpcode::Maxss
+ | SseOpcode::Minps
+ | SseOpcode::Minss
+ | SseOpcode::Movaps
+ | SseOpcode::Movlhps
+ | SseOpcode::Movmskps
+ | SseOpcode::Movss
+ | SseOpcode::Movups
+ | SseOpcode::Mulps
+ | SseOpcode::Mulss
+ | SseOpcode::Orps
+ | SseOpcode::Rcpss
+ | SseOpcode::Rsqrtss
+ | SseOpcode::Sqrtps
+ | SseOpcode::Sqrtss
+ | SseOpcode::Subps
+ | SseOpcode::Subss
+ | SseOpcode::Ucomiss
+ | SseOpcode::Xorps => SSE,
+
+ SseOpcode::Addpd
+ | SseOpcode::Addsd
+ | SseOpcode::Andpd
+ | SseOpcode::Andnpd
+ | SseOpcode::Cmppd
+ | SseOpcode::Cmpsd
+ | SseOpcode::Comisd
+ | SseOpcode::Cvtdq2ps
+ | SseOpcode::Cvtsd2ss
+ | SseOpcode::Cvtsd2si
+ | SseOpcode::Cvtsi2sd
+ | SseOpcode::Cvtss2sd
+ | SseOpcode::Cvttps2dq
+ | SseOpcode::Cvttsd2si
+ | SseOpcode::Divpd
+ | SseOpcode::Divsd
+ | SseOpcode::Maxpd
+ | SseOpcode::Maxsd
+ | SseOpcode::Minpd
+ | SseOpcode::Minsd
+ | SseOpcode::Movapd
+ | SseOpcode::Movd
+ | SseOpcode::Movmskpd
+ | SseOpcode::Movq
+ | SseOpcode::Movsd
+ | SseOpcode::Movupd
+ | SseOpcode::Movdqa
+ | SseOpcode::Movdqu
+ | SseOpcode::Mulpd
+ | SseOpcode::Mulsd
+ | SseOpcode::Orpd
+ | SseOpcode::Packsswb
+ | SseOpcode::Paddb
+ | SseOpcode::Paddd
+ | SseOpcode::Paddq
+ | SseOpcode::Paddw
+ | SseOpcode::Paddsb
+ | SseOpcode::Paddsw
+ | SseOpcode::Paddusb
+ | SseOpcode::Paddusw
+ | SseOpcode::Pand
+ | SseOpcode::Pandn
+ | SseOpcode::Pavgb
+ | SseOpcode::Pavgw
+ | SseOpcode::Pcmpeqb
+ | SseOpcode::Pcmpeqw
+ | SseOpcode::Pcmpeqd
+ | SseOpcode::Pcmpgtb
+ | SseOpcode::Pcmpgtw
+ | SseOpcode::Pcmpgtd
+ | SseOpcode::Pextrw
+ | SseOpcode::Pinsrw
+ | SseOpcode::Pmaxsw
+ | SseOpcode::Pmaxub
+ | SseOpcode::Pminsw
+ | SseOpcode::Pminub
+ | SseOpcode::Pmovmskb
+ | SseOpcode::Pmullw
+ | SseOpcode::Pmuludq
+ | SseOpcode::Por
+ | SseOpcode::Pshufd
+ | SseOpcode::Psllw
+ | SseOpcode::Pslld
+ | SseOpcode::Psllq
+ | SseOpcode::Psraw
+ | SseOpcode::Psrad
+ | SseOpcode::Psrlw
+ | SseOpcode::Psrld
+ | SseOpcode::Psrlq
+ | SseOpcode::Psubb
+ | SseOpcode::Psubd
+ | SseOpcode::Psubq
+ | SseOpcode::Psubw
+ | SseOpcode::Psubsb
+ | SseOpcode::Psubsw
+ | SseOpcode::Psubusb
+ | SseOpcode::Psubusw
+ | SseOpcode::Pxor
+ | SseOpcode::Sqrtpd
+ | SseOpcode::Sqrtsd
+ | SseOpcode::Subpd
+ | SseOpcode::Subsd
+ | SseOpcode::Ucomisd
+ | SseOpcode::Xorpd => SSE2,
+
+ SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
+
+ SseOpcode::Insertps
+ | SseOpcode::Pcmpeqq
+ | SseOpcode::Pextrb
+ | SseOpcode::Pextrd
+ | SseOpcode::Pinsrb
+ | SseOpcode::Pinsrd
+ | SseOpcode::Pmaxsb
+ | SseOpcode::Pmaxsd
+ | SseOpcode::Pmaxuw
+ | SseOpcode::Pmaxud
+ | SseOpcode::Pminsb
+ | SseOpcode::Pminsd
+ | SseOpcode::Pminuw
+ | SseOpcode::Pminud
+ | SseOpcode::Pmulld
+ | SseOpcode::Ptest
+ | SseOpcode::Roundss
+ | SseOpcode::Roundsd => SSE41,
+
+ SseOpcode::Pcmpgtq => SSE42,
+ }
+ }
+
+ /// Returns the src operand size for an instruction.
+ pub(crate) fn src_size(&self) -> u8 {
+ match self {
+ SseOpcode::Movd => 4,
+ _ => 8,
+ }
+ }
+}
+
+impl fmt::Debug for SseOpcode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ SseOpcode::Addps => "addps",
+ SseOpcode::Addpd => "addpd",
+ SseOpcode::Addss => "addss",
+ SseOpcode::Addsd => "addsd",
+ SseOpcode::Andpd => "andpd",
+ SseOpcode::Andps => "andps",
+ SseOpcode::Andnps => "andnps",
+ SseOpcode::Andnpd => "andnpd",
+ SseOpcode::Cmpps => "cmpps",
+ SseOpcode::Cmppd => "cmppd",
+ SseOpcode::Cmpss => "cmpss",
+ SseOpcode::Cmpsd => "cmpsd",
+ SseOpcode::Comiss => "comiss",
+ SseOpcode::Comisd => "comisd",
+ SseOpcode::Cvtdq2ps => "cvtdq2ps",
+ SseOpcode::Cvtsd2ss => "cvtsd2ss",
+ SseOpcode::Cvtsd2si => "cvtsd2si",
+ SseOpcode::Cvtsi2ss => "cvtsi2ss",
+ SseOpcode::Cvtsi2sd => "cvtsi2sd",
+ SseOpcode::Cvtss2si => "cvtss2si",
+ SseOpcode::Cvtss2sd => "cvtss2sd",
+ SseOpcode::Cvttps2dq => "cvttps2dq",
+ SseOpcode::Cvttss2si => "cvttss2si",
+ SseOpcode::Cvttsd2si => "cvttsd2si",
+ SseOpcode::Divps => "divps",
+ SseOpcode::Divpd => "divpd",
+ SseOpcode::Divss => "divss",
+ SseOpcode::Divsd => "divsd",
+ SseOpcode::Insertps => "insertps",
+ SseOpcode::Maxps => "maxps",
+ SseOpcode::Maxpd => "maxpd",
+ SseOpcode::Maxss => "maxss",
+ SseOpcode::Maxsd => "maxsd",
+ SseOpcode::Minps => "minps",
+ SseOpcode::Minpd => "minpd",
+ SseOpcode::Minss => "minss",
+ SseOpcode::Minsd => "minsd",
+ SseOpcode::Movaps => "movaps",
+ SseOpcode::Movapd => "movapd",
+ SseOpcode::Movd => "movd",
+ SseOpcode::Movdqa => "movdqa",
+ SseOpcode::Movdqu => "movdqu",
+ SseOpcode::Movlhps => "movlhps",
+ SseOpcode::Movmskps => "movmskps",
+ SseOpcode::Movmskpd => "movmskpd",
+ SseOpcode::Movq => "movq",
+ SseOpcode::Movss => "movss",
+ SseOpcode::Movsd => "movsd",
+ SseOpcode::Movups => "movups",
+ SseOpcode::Movupd => "movupd",
+ SseOpcode::Mulps => "mulps",
+ SseOpcode::Mulpd => "mulpd",
+ SseOpcode::Mulss => "mulss",
+ SseOpcode::Mulsd => "mulsd",
+ SseOpcode::Orpd => "orpd",
+ SseOpcode::Orps => "orps",
+ SseOpcode::Pabsb => "pabsb",
+ SseOpcode::Pabsw => "pabsw",
+ SseOpcode::Pabsd => "pabsd",
+ SseOpcode::Packsswb => "packsswb",
+ SseOpcode::Paddb => "paddb",
+ SseOpcode::Paddd => "paddd",
+ SseOpcode::Paddq => "paddq",
+ SseOpcode::Paddw => "paddw",
+ SseOpcode::Paddsb => "paddsb",
+ SseOpcode::Paddsw => "paddsw",
+ SseOpcode::Paddusb => "paddusb",
+ SseOpcode::Paddusw => "paddusw",
+ SseOpcode::Pand => "pand",
+ SseOpcode::Pandn => "pandn",
+ SseOpcode::Pavgb => "pavgb",
+ SseOpcode::Pavgw => "pavgw",
+ SseOpcode::Pcmpeqb => "pcmpeqb",
+ SseOpcode::Pcmpeqw => "pcmpeqw",
+ SseOpcode::Pcmpeqd => "pcmpeqd",
+ SseOpcode::Pcmpeqq => "pcmpeqq",
+ SseOpcode::Pcmpgtb => "pcmpgtb",
+ SseOpcode::Pcmpgtw => "pcmpgtw",
+ SseOpcode::Pcmpgtd => "pcmpgtd",
+ SseOpcode::Pcmpgtq => "pcmpgtq",
+ SseOpcode::Pextrb => "pextrb",
+ SseOpcode::Pextrw => "pextrw",
+ SseOpcode::Pextrd => "pextrd",
+ SseOpcode::Pinsrb => "pinsrb",
+ SseOpcode::Pinsrw => "pinsrw",
+ SseOpcode::Pinsrd => "pinsrd",
+ SseOpcode::Pmaxsb => "pmaxsb",
+ SseOpcode::Pmaxsw => "pmaxsw",
+ SseOpcode::Pmaxsd => "pmaxsd",
+ SseOpcode::Pmaxub => "pmaxub",
+ SseOpcode::Pmaxuw => "pmaxuw",
+ SseOpcode::Pmaxud => "pmaxud",
+ SseOpcode::Pminsb => "pminsb",
+ SseOpcode::Pminsw => "pminsw",
+ SseOpcode::Pminsd => "pminsd",
+ SseOpcode::Pminub => "pminub",
+ SseOpcode::Pminuw => "pminuw",
+ SseOpcode::Pminud => "pminud",
+ SseOpcode::Pmovmskb => "pmovmskb",
+ SseOpcode::Pmulld => "pmulld",
+ SseOpcode::Pmullw => "pmullw",
+ SseOpcode::Pmuludq => "pmuludq",
+ SseOpcode::Por => "por",
+ SseOpcode::Pshufb => "pshufb",
+ SseOpcode::Pshufd => "pshufd",
+ SseOpcode::Psllw => "psllw",
+ SseOpcode::Pslld => "pslld",
+ SseOpcode::Psllq => "psllq",
+ SseOpcode::Psraw => "psraw",
+ SseOpcode::Psrad => "psrad",
+ SseOpcode::Psrlw => "psrlw",
+ SseOpcode::Psrld => "psrld",
+ SseOpcode::Psrlq => "psrlq",
+ SseOpcode::Psubb => "psubb",
+ SseOpcode::Psubd => "psubd",
+ SseOpcode::Psubq => "psubq",
+ SseOpcode::Psubw => "psubw",
+ SseOpcode::Psubsb => "psubsb",
+ SseOpcode::Psubsw => "psubsw",
+ SseOpcode::Psubusb => "psubusb",
+ SseOpcode::Psubusw => "psubusw",
+ SseOpcode::Ptest => "ptest",
+ SseOpcode::Pxor => "pxor",
+ SseOpcode::Rcpss => "rcpss",
+ SseOpcode::Roundss => "roundss",
+ SseOpcode::Roundsd => "roundsd",
+ SseOpcode::Rsqrtss => "rsqrtss",
+ SseOpcode::Sqrtps => "sqrtps",
+ SseOpcode::Sqrtpd => "sqrtpd",
+ SseOpcode::Sqrtss => "sqrtss",
+ SseOpcode::Sqrtsd => "sqrtsd",
+ SseOpcode::Subps => "subps",
+ SseOpcode::Subpd => "subpd",
+ SseOpcode::Subss => "subss",
+ SseOpcode::Subsd => "subsd",
+ SseOpcode::Ucomiss => "ucomiss",
+ SseOpcode::Ucomisd => "ucomisd",
+ SseOpcode::Xorps => "xorps",
+ SseOpcode::Xorpd => "xorpd",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for SseOpcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// This defines the ways a value can be extended: either signed- or zero-extension, or none for
+/// types that are not extended. Contrast with [ExtMode], which defines the widths from and to which
+/// values can be extended.
+#[derive(Clone, PartialEq)]
+pub enum ExtKind {
+ None,
+ SignExtend,
+ ZeroExtend,
+}
+
+/// These indicate ways of extending (widening) a value, using the Intel
+/// naming: B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64
+#[derive(Clone, PartialEq)]
+pub enum ExtMode {
+ /// Byte -> Longword.
+ BL,
+ /// Byte -> Quadword.
+ BQ,
+ /// Word -> Longword.
+ WL,
+ /// Word -> Quadword.
+ WQ,
+ /// Longword -> Quadword.
+ LQ,
+}
+
+impl ExtMode {
+ /// Calculate the `ExtMode` from passed bit lengths of the from/to types.
+ pub(crate) fn new(from_bits: u16, to_bits: u16) -> Option<ExtMode> {
+ match (from_bits, to_bits) {
+ (1, 8) | (1, 16) | (1, 32) | (8, 16) | (8, 32) => Some(ExtMode::BL),
+ (1, 64) | (8, 64) => Some(ExtMode::BQ),
+ (16, 32) => Some(ExtMode::WL),
+ (16, 64) => Some(ExtMode::WQ),
+ (32, 64) => Some(ExtMode::LQ),
+ _ => None,
+ }
+ }
+
+ /// Return the source register size in bytes.
+ pub(crate) fn src_size(&self) -> u8 {
+ match self {
+ ExtMode::BL | ExtMode::BQ => 1,
+ ExtMode::WL | ExtMode::WQ => 2,
+ ExtMode::LQ => 4,
+ }
+ }
+
+ /// Return the destination register size in bytes.
+ pub(crate) fn dst_size(&self) -> u8 {
+ match self {
+ ExtMode::BL | ExtMode::WL => 4,
+ ExtMode::BQ | ExtMode::WQ | ExtMode::LQ => 8,
+ }
+ }
+}
+
+impl fmt::Debug for ExtMode {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ ExtMode::BL => "bl",
+ ExtMode::BQ => "bq",
+ ExtMode::WL => "wl",
+ ExtMode::WQ => "wq",
+ ExtMode::LQ => "lq",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for ExtMode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// These indicate the form of a scalar shift or rotate: shift left, logical (unsigned) shift
+/// right, arithmetic (signed) shift right, rotate left, or rotate right.
+#[derive(Clone)]
+pub enum ShiftKind {
+ ShiftLeft,
+ /// Inserts zeros in the most significant bits.
+ ShiftRightLogical,
+ /// Replicates the sign bit in the most significant bits.
+ ShiftRightArithmetic,
+ RotateLeft,
+ RotateRight,
+}
+
+impl fmt::Debug for ShiftKind {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ ShiftKind::ShiftLeft => "shl",
+ ShiftKind::ShiftRightLogical => "shr",
+ ShiftKind::ShiftRightArithmetic => "sar",
+ ShiftKind::RotateLeft => "rol",
+ ShiftKind::RotateRight => "ror",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for ShiftKind {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// The kind of division or remainder instruction this is.
+#[derive(Clone)]
+pub enum DivOrRemKind {
+ SignedDiv,
+ UnsignedDiv,
+ SignedRem,
+ UnsignedRem,
+}
+
+impl DivOrRemKind {
+ pub(crate) fn is_signed(&self) -> bool {
+ match self {
+ DivOrRemKind::SignedDiv | DivOrRemKind::SignedRem => true,
+ _ => false,
+ }
+ }
+
+ pub(crate) fn is_div(&self) -> bool {
+ match self {
+ DivOrRemKind::SignedDiv | DivOrRemKind::UnsignedDiv => true,
+ _ => false,
+ }
+ }
+}
+
+/// These indicate condition code tests. Not all are represented since not all are useful in
+/// compiler-generated code.
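+/// The discriminant values are the hardware condition-code encodings (the 4-bit "tttn" field),
+/// so `get_enc` can return them directly.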
+#[derive(Copy, Clone)]
+#[repr(u8)]
+pub enum CC {
+ /// overflow
+ O = 0,
+ /// no overflow
+ NO = 1,
+
+ /// < unsigned
+ B = 2,
+ /// >= unsigned
+ NB = 3,
+
+ /// zero
+ Z = 4,
+ /// not-zero
+ NZ = 5,
+
+ /// <= unsigned
+ BE = 6,
+ /// > unsigned
+ NBE = 7,
+
+ /// negative
+ S = 8,
+ /// not-negative
+ NS = 9,
+
+ /// < signed
+ L = 12,
+ /// >= signed
+ NL = 13,
+
+ /// <= signed
+ LE = 14,
+ /// > signed
+ NLE = 15,
+
+ /// parity
+ P = 10,
+
+ /// not parity
+ NP = 11,
+}
+
+impl CC {
+ pub(crate) fn from_intcc(intcc: IntCC) -> Self {
+ match intcc {
+ IntCC::Equal => CC::Z,
+ IntCC::NotEqual => CC::NZ,
+ IntCC::SignedGreaterThanOrEqual => CC::NL,
+ IntCC::SignedGreaterThan => CC::NLE,
+ IntCC::SignedLessThanOrEqual => CC::LE,
+ IntCC::SignedLessThan => CC::L,
+ IntCC::UnsignedGreaterThanOrEqual => CC::NB,
+ IntCC::UnsignedGreaterThan => CC::NBE,
+ IntCC::UnsignedLessThanOrEqual => CC::BE,
+ IntCC::UnsignedLessThan => CC::B,
+ IntCC::Overflow => CC::O,
+ IntCC::NotOverflow => CC::NO,
+ }
+ }
+
+ pub(crate) fn invert(&self) -> Self {
+ match self {
+ CC::O => CC::NO,
+ CC::NO => CC::O,
+
+ CC::B => CC::NB,
+ CC::NB => CC::B,
+
+ CC::Z => CC::NZ,
+ CC::NZ => CC::Z,
+
+ CC::BE => CC::NBE,
+ CC::NBE => CC::BE,
+
+ CC::S => CC::NS,
+ CC::NS => CC::S,
+
+ CC::L => CC::NL,
+ CC::NL => CC::L,
+
+ CC::LE => CC::NLE,
+ CC::NLE => CC::LE,
+
+ CC::P => CC::NP,
+ CC::NP => CC::P,
+ }
+ }
+
+ pub(crate) fn from_floatcc(floatcc: FloatCC) -> Self {
+ match floatcc {
+ FloatCC::Ordered => CC::NP,
+ FloatCC::Unordered => CC::P,
+ // Alias for NE
+ FloatCC::OrderedNotEqual => CC::NZ,
+ // Alias for E
+ FloatCC::UnorderedOrEqual => CC::Z,
+ // Alias for A
+ FloatCC::GreaterThan => CC::NBE,
+ // Alias for AE
+ FloatCC::GreaterThanOrEqual => CC::NB,
+ FloatCC::UnorderedOrLessThan => CC::B,
+ FloatCC::UnorderedOrLessThanOrEqual => CC::BE,
+ FloatCC::Equal
+ | FloatCC::NotEqual
+ | FloatCC::LessThan
+ | FloatCC::LessThanOrEqual
+ | FloatCC::UnorderedOrGreaterThan
+ | FloatCC::UnorderedOrGreaterThanOrEqual => panic!(
+ "{:?} can't be lowered to a CC code; treat as special case.",
+ floatcc
+ ),
+ }
+ }
+
+ pub(crate) fn get_enc(self) -> u8 {
+ self as u8
+ }
+}
+
+impl fmt::Debug for CC {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let name = match self {
+ CC::O => "o",
+ CC::NO => "no",
+ CC::B => "b",
+ CC::NB => "nb",
+ CC::Z => "z",
+ CC::NZ => "nz",
+ CC::BE => "be",
+ CC::NBE => "nbe",
+ CC::S => "s",
+ CC::NS => "ns",
+ CC::L => "l",
+ CC::NL => "nl",
+ CC::LE => "le",
+ CC::NLE => "nle",
+ CC::P => "p",
+ CC::NP => "np",
+ };
+ write!(fmt, "{}", name)
+ }
+}
+
+impl fmt::Display for CC {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fmt::Debug::fmt(self, f)
+ }
+}
+
+/// Encodes the ways that floats can be compared. This is used as an immediate in comparisons such
+/// as `cmpps`; it is distinguished from other float comparisons (e.g. `ucomiss`) in that those
+/// report their result through EFLAGS, whereas [FcmpImm] is passed as an immediate operand.
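+/// The discriminants are the predicate values expected in the imm8 operand of `cmpps`, `cmppd`,
+/// `cmpss` and `cmpsd`.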
+pub(crate) enum FcmpImm {
+ Equal = 0x00,
+ LessThan = 0x01,
+ LessThanOrEqual = 0x02,
+ Unordered = 0x03,
+ NotEqual = 0x04,
+ UnorderedOrGreaterThanOrEqual = 0x05,
+ UnorderedOrGreaterThan = 0x06,
+ Ordered = 0x07,
+}
+
+impl FcmpImm {
+ pub(crate) fn encode(self) -> u8 {
+ self as u8
+ }
+}
+
+impl From<FloatCC> for FcmpImm {
+ fn from(cond: FloatCC) -> Self {
+ match cond {
+ FloatCC::Equal => FcmpImm::Equal,
+ FloatCC::LessThan => FcmpImm::LessThan,
+ FloatCC::LessThanOrEqual => FcmpImm::LessThanOrEqual,
+ FloatCC::Unordered => FcmpImm::Unordered,
+ FloatCC::NotEqual => FcmpImm::NotEqual,
+ FloatCC::UnorderedOrGreaterThanOrEqual => FcmpImm::UnorderedOrGreaterThanOrEqual,
+ FloatCC::UnorderedOrGreaterThan => FcmpImm::UnorderedOrGreaterThan,
+ FloatCC::Ordered => FcmpImm::Ordered,
+ _ => panic!("unable to create comparison predicate for {}", cond),
+ }
+ }
+}
+
+/// An operand's size in bits.
+#[derive(Clone, Copy, PartialEq)]
+pub enum OperandSize {
+ Size32,
+ Size64,
+}
+
+impl OperandSize {
+ pub(crate) fn from_bytes(num_bytes: u32) -> Self {
+ match num_bytes {
+ 1 | 2 | 4 => OperandSize::Size32,
+ 8 => OperandSize::Size64,
+ _ => unreachable!(),
+ }
+ }
+
+ pub(crate) fn to_bytes(&self) -> u8 {
+ match self {
+ Self::Size32 => 4,
+ Self::Size64 => 8,
+ }
+ }
+
+ pub(crate) fn to_bits(&self) -> u8 {
+ match self {
+ Self::Size32 => 32,
+ Self::Size64 => 64,
+ }
+ }
+}
+
+/// An x64 memory fence kind.
+#[derive(Clone)]
+#[allow(dead_code)]
+pub enum FenceKind {
+ /// `mfence` instruction ("Memory Fence")
+ MFence,
+ /// `lfence` instruction ("Load Fence")
+ LFence,
+ /// `sfence` instruction ("Store Fence")
+ SFence,
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs
new file mode 100644
index 0000000000..dd4125a2da
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs
@@ -0,0 +1,2819 @@
+use crate::binemit::{Addend, Reloc};
+use crate::ir::immediates::{Ieee32, Ieee64};
+use crate::ir::TrapCode;
+use crate::isa::x64::inst::args::*;
+use crate::isa::x64::inst::*;
+use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
+use core::convert::TryInto;
+use log::debug;
+use regalloc::{Reg, RegClass, Writable};
+
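+// These two helpers check whether the low 8 bits of `x`, once sign-extended, reproduce the full
+// 32-/64-bit value -- i.e. whether `x` can be encoded as a sign-extended imm8. The arithmetic
+// `(xs << N) >> N` shifts sign-extend the low byte in place: 0xFFFF_FFF8 (-8) passes, while
+// 0x80 (+128) does not.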
+fn low8_will_sign_extend_to_64(x: u32) -> bool {
+ let xs = (x as i32) as i64;
+ xs == ((xs << 56) >> 56)
+}
+
+fn low8_will_sign_extend_to_32(x: u32) -> bool {
+ let xs = x as i32;
+ xs == ((xs << 24) >> 24)
+}
+
+//=============================================================================
+// Instructions and subcomponents: emission
+
+// For all of the routines that take both a memory-or-reg operand (sometimes
+// called "E" in the Intel documentation) and a reg-only operand ("G" in
+// Intelese), the order is always G first, then E.
+//
+// "enc" in the following means "hardware register encoding number".
+
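+// A ModR/M byte is laid out as: bits 7..6 = mod, bits 5..3 = reg (the "G" operand, or an opcode
+// extension), bits 2..0 = r/m (the "E" operand). For example, encode_modrm(0b11, 0 /* rax */,
+// 1 /* rcx */) yields 0xC1.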
+#[inline(always)]
+fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 {
+ debug_assert!(m0d < 4);
+ debug_assert!(enc_reg_g < 8);
+ debug_assert!(rm_e < 8);
+ ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7)
+}
+
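+// A SIB byte is laid out as: bits 7..6 = scale (the `shift` here), bits 5..3 = index register,
+// bits 2..0 = base register.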
+#[inline(always)]
+fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 {
+ debug_assert!(shift < 4);
+ debug_assert!(enc_index < 8);
+ debug_assert!(enc_base < 8);
+ ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7)
+}
+
+/// Get the encoding number of a GPR.
+#[inline(always)]
+fn int_reg_enc(reg: Reg) -> u8 {
+ debug_assert!(reg.is_real());
+ debug_assert_eq!(reg.get_class(), RegClass::I64);
+ reg.get_hw_encoding()
+}
+
+/// Get the encoding number of any register.
+#[inline(always)]
+fn reg_enc(reg: Reg) -> u8 {
+ debug_assert!(reg.is_real());
+ reg.get_hw_encoding()
+}
+
+/// A small bit field to record a REX prefix specification:
+/// - bit 0 set to 1 indicates REX.W must be 0 (cleared).
+/// - bit 1 set to 1 indicates the REX prefix must always be emitted.
+#[repr(transparent)]
+#[derive(Clone, Copy)]
+struct RexFlags(u8);
+
+impl RexFlags {
+ /// By default, set the W field, and don't always emit.
+ #[inline(always)]
+ fn set_w() -> Self {
+ Self(0)
+ }
+ /// Creates a new RexFlags value for which the REX.W bit will be cleared.
+ #[inline(always)]
+ fn clear_w() -> Self {
+ Self(1)
+ }
+
+ #[inline(always)]
+ fn always_emit(&mut self) -> &mut Self {
+ self.0 = self.0 | 2;
+ self
+ }
+
+ #[inline(always)]
+ fn must_clear_w(&self) -> bool {
+ (self.0 & 1) != 0
+ }
+ #[inline(always)]
+ fn must_always_emit(&self) -> bool {
+ (self.0 & 2) != 0
+ }
+
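+ // A REX prefix byte has the form 0b0100_WRXB: W selects a 64-bit operand size, while R, X and
+ // B supply the fourth (high) bit of the ModR/M reg field, the SIB index, and the ModR/M r/m
+ // (or SIB base) field respectively. 0x40 is the "empty" prefix, which may be omitted unless
+ // the instruction requires it anyway (e.g. to address SPL/BPL/SIL/DIL as byte registers).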
+ #[inline(always)]
+ fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) {
+ let w = if self.must_clear_w() { 0 } else { 1 };
+ let r = (enc_g >> 3) & 1;
+ let x = 0;
+ let b = (enc_e >> 3) & 1;
+ let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+ if rex != 0x40 || self.must_always_emit() {
+ sink.put1(rex);
+ }
+ }
+
+ #[inline(always)]
+ fn emit_three_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_index: u8, enc_base: u8) {
+ let w = if self.must_clear_w() { 0 } else { 1 };
+ let r = (enc_g >> 3) & 1;
+ let x = (enc_index >> 3) & 1;
+ let b = (enc_base >> 3) & 1;
+ let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+ if rex != 0x40 || self.must_always_emit() {
+ sink.put1(rex);
+ }
+ }
+}
+
+/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
+/// covers only the small set of possibilities that we actually need.
+enum LegacyPrefixes {
+ /// No prefix bytes
+ None,
+ /// Operand Size Override -- here, denoting "16-bit operation"
+ _66,
+ /// The Lock prefix
+ _F0,
+ /// Operand size override and Lock
+ _66F0,
+ /// REPNE, but no specific meaning here -- is just an opcode extension
+ _F2,
+ /// REP/REPE, but no specific meaning here -- is just an opcode extension
+ _F3,
+}
+
+impl LegacyPrefixes {
+ #[inline(always)]
+ fn emit(&self, sink: &mut MachBuffer<Inst>) {
+ match self {
+ LegacyPrefixes::_66 => sink.put1(0x66),
+ LegacyPrefixes::_F0 => sink.put1(0xF0),
+ LegacyPrefixes::_66F0 => {
+ // I don't think the order matters, but in any case, this is the same order that
+ // the GNU assembler uses.
+ sink.put1(0x66);
+ sink.put1(0xF0);
+ }
+ LegacyPrefixes::_F2 => sink.put1(0xF2),
+ LegacyPrefixes::_F3 => sink.put1(0xF3),
+ LegacyPrefixes::None => (),
+ }
+ }
+}
+
+/// This is the core 'emit' function for instructions that reference memory.
+///
+/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`,
+/// create and emit:
+/// - first the legacy prefixes, if any
+/// - then the REX prefix, if needed
+/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`),
+/// - then the MOD/RM byte,
+/// - then optionally, a SIB byte,
+/// - and finally optionally an immediate that will be derived from the `mem_e` operand.
+///
+/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is
+/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed
+/// instructions will require their own emitter functions.
+///
+/// This will also work for 32-bit x86 instructions, assuming no REX prefix is provided.
+///
+/// The opcode bytes are written in big-endian order (most-significant byte first) for the
+/// convenience of callers. For example, if the opcode bytes to be emitted are, in this order,
+/// F3 0F 27, then the caller should pass `opcodes` == 0xF3_0F_27 and `num_opcodes` == 3.
+///
+/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`.
+/// `rex` can specify special handling for the REX prefix. By default, the REX prefix will
+/// indicate a 64-bit operation and will be deleted if it is redundant (0x40). Note that for a
+/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to
+/// indicate a 64-bit operation.
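+///
+/// As a concrete illustration, a 64-bit load such as `mov 16(%rdi), %rax` -- emitted via
+/// `Inst::Mov64MR` below with opcode 0x8B -- produces the bytes 48 8B 47 10: REX.W, the opcode,
+/// ModRM 0x47 (mod=01, reg=rax, r/m=rdi) and the 8-bit displacement.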
+fn emit_std_enc_mem(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ mut num_opcodes: usize,
+ enc_g: u8,
+ mem_e: &Amode,
+ rex: RexFlags,
+) {
+ // General comment for this function: the registers in `mem_e` must be
+ // 64-bit integer registers, because they are part of an address
+ // expression. But `enc_g` can be derived from a register of any class.
+
+ let srcloc = state.cur_srcloc();
+ if srcloc != SourceLoc::default() && mem_e.can_trap() {
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ prefixes.emit(sink);
+
+ match mem_e {
+ Amode::ImmReg { simm32, base, .. } => {
+ // First, the REX byte.
+ let enc_e = int_reg_enc(*base);
+ rex.emit_two_op(sink, enc_g, enc_e);
+
+ // Now the opcode(s). These include any other prefixes the caller
+ // hands to us.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // Now the mod/rm and associated immediates. This is
+ // significantly complicated due to the multiple special cases.
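+            // As a reminder of why the special cases exist: an r/m value of 0b100 (rsp/r12)
+            // means "a SIB byte follows", so those registers need the SIB byte 0x24 (scale=0,
+            // index=none, base=0b100); and mod=00 with r/m=0b101 (rbp/r13) means
+            // disp32-only/RIP-relative, so those registers must use the mod=01 form with a zero
+            // disp8 even when the offset is zero.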
+ if *simm32 == 0
+ && enc_e != regs::ENC_RSP
+ && enc_e != regs::ENC_RBP
+ && enc_e != regs::ENC_R12
+ && enc_e != regs::ENC_R13
+ {
+ // FIXME JRS 2020Feb11: those four tests can surely be
+ // replaced by a single mask-and-compare check. We should do
+ // that because this routine is likely to be hot.
+ sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7));
+ } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) {
+ sink.put1(encode_modrm(0, enc_g & 7, 4));
+ sink.put1(0x24);
+ } else if low8_will_sign_extend_to_32(*simm32)
+ && enc_e != regs::ENC_RSP
+ && enc_e != regs::ENC_R12
+ {
+ sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
+ sink.put1((simm32 & 0xFF) as u8);
+ } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
+ sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
+ sink.put4(*simm32);
+ } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
+ && low8_will_sign_extend_to_32(*simm32)
+ {
+ // REX.B distinguishes RSP from R12
+ sink.put1(encode_modrm(1, enc_g & 7, 4));
+ sink.put1(0x24);
+ sink.put1((simm32 & 0xFF) as u8);
+ } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
+                // TODO: still awaiting a test case that exercises the RSP variant of this path.
+ // REX.B distinguishes RSP from R12
+ sink.put1(encode_modrm(2, enc_g & 7, 4));
+ sink.put1(0x24);
+ sink.put4(*simm32);
+ } else {
+ unreachable!("ImmReg");
+ }
+ }
+
+ Amode::ImmRegRegShift {
+ simm32,
+ base: reg_base,
+ index: reg_index,
+ shift,
+ ..
+ } => {
+ let enc_base = int_reg_enc(*reg_base);
+ let enc_index = int_reg_enc(*reg_index);
+
+ // The rex byte.
+ rex.emit_three_op(sink, enc_g, enc_index, enc_base);
+
+ // All other prefixes and opcodes.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // modrm, SIB, immediates.
+ if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP {
+ sink.put1(encode_modrm(1, enc_g & 7, 4));
+ sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+ sink.put1(*simm32 as u8);
+ } else if enc_index != regs::ENC_RSP {
+ sink.put1(encode_modrm(2, enc_g & 7, 4));
+ sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+ sink.put4(*simm32);
+ } else {
+ panic!("ImmRegRegShift");
+ }
+ }
+
+ Amode::RipRelative { ref target } => {
+ // First, the REX byte, with REX.B = 0.
+ rex.emit_two_op(sink, enc_g, 0);
+
+ // Now the opcode(s). These include any other prefixes the caller
+ // hands to us.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // RIP-relative is mod=00, rm=101.
+ sink.put1(encode_modrm(0, enc_g & 7, 0b101));
+
+ let offset = sink.cur_offset();
+ sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
+ sink.put4(0);
+ }
+ }
+}
+
+/// This is the core 'emit' function for instructions that do not reference memory.
+///
+/// This is conceptually the same as `emit_std_enc_mem` above, except it is for the case where the
+/// E operand is a register rather than memory. Hence it is much simpler.
+fn emit_std_enc_enc(
+ sink: &mut MachBuffer<Inst>,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ mut num_opcodes: usize,
+ enc_g: u8,
+ enc_e: u8,
+ rex: RexFlags,
+) {
+ // EncG and EncE can be derived from registers of any class, and they
+ // don't even have to be from the same class. For example, for an
+ // integer-to-FP conversion insn, one might be RegClass::I64 and the other
+ // RegClass::V128.
+
+ // The legacy prefixes.
+ prefixes.emit(sink);
+
+ // The rex byte.
+ rex.emit_two_op(sink, enc_g, enc_e);
+
+ // All other prefixes and opcodes.
+ while num_opcodes > 0 {
+ num_opcodes -= 1;
+ sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+ }
+
+ // Now the mod/rm byte. The instruction we're generating doesn't access
+ // memory, so there is no SIB byte or immediate -- we're done.
+ sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7));
+}
+
+// These are merely wrappers for the above two functions that facilitate passing
+// actual `Reg`s rather than their encodings.
+
+fn emit_std_reg_mem(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ num_opcodes: usize,
+ reg_g: Reg,
+ mem_e: &Amode,
+ rex: RexFlags,
+) {
+ let enc_g = reg_enc(reg_g);
+ emit_std_enc_mem(
+ sink,
+ state,
+ prefixes,
+ opcodes,
+ num_opcodes,
+ enc_g,
+ mem_e,
+ rex,
+ );
+}
+
+fn emit_std_reg_reg(
+ sink: &mut MachBuffer<Inst>,
+ prefixes: LegacyPrefixes,
+ opcodes: u32,
+ num_opcodes: usize,
+ reg_g: Reg,
+ reg_e: Reg,
+ rex: RexFlags,
+) {
+ let enc_g = reg_enc(reg_g);
+ let enc_e = reg_enc(reg_e);
+ emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex);
+}
+
+/// Write a suitable number of bytes of an immediate to the sink. Note that a `size` of 8 still
+/// emits only four bytes, relying on the instruction to sign-extend the imm32.
+fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
+ match size {
+ 8 | 4 => sink.put4(simm32),
+ 2 => sink.put2(simm32 as u16),
+ 1 => sink.put1(simm32 as u8),
+ _ => unreachable!(),
+ }
+}
+
+/// A small helper to generate a signed conversion instruction.
+fn emit_signed_cvt(
+ sink: &mut MachBuffer<Inst>,
+ info: &EmitInfo,
+ state: &mut EmitState,
+ src: Reg,
+ dst: Writable<Reg>,
+ to_f64: bool,
+) {
+    // A plain signed conversion (cvtsi2ss/cvtsi2sd) from a 64-bit GPR; callers rely on this
+    // doing the right thing on its own for non-negative inputs.
+ let op = if to_f64 {
+ SseOpcode::Cvtsi2sd
+ } else {
+ SseOpcode::Cvtsi2ss
+ };
+ let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
+ inst.emit(sink, info, state);
+}
+
+/// Emits a one-way conditional jump (Jcc rel32) if `cc` is set (true).
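+///
+/// The jump is encoded as the two-byte opcode 0F 8x followed by a 32-bit displacement, so the
+/// label fixup is recorded at `cond_start + 2`, where the displacement field begins.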
+fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+ sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ sink.put4(0x0);
+}
+
+/// Emits a relocation, attaching the current source location as well.
+fn emit_reloc(
+ sink: &mut MachBuffer<Inst>,
+ state: &EmitState,
+ kind: Reloc,
+ name: &ExternalName,
+ addend: Addend,
+) {
+ let srcloc = state.cur_srcloc();
+ sink.add_reloc(srcloc, kind, name, addend);
+}
+
+/// The top-level emit function.
+///
+/// Important! Do not add improved (shortened) encoding cases to existing
+/// instructions without also adding tests for those improved encodings. That
+/// is a dangerous game that leads to hard-to-track-down errors in the emitted
+/// code.
+///
+/// For all instructions, make sure to have test coverage for all of the
+/// following situations. Do this by creating the cross product resulting from
+/// applying the following rules to each operand:
+///
+/// (1) for any insn that mentions a register: one test using a register from
+/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
+/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
+/// This helps detect incorrect REX prefix construction.
+///
+/// (2) for any insn that mentions a byte register: one test for each of the
+/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
+/// [r8b .. r11b] and [r12b .. r15b]. This checks that
+/// apparently-redundant REX prefixes are retained when required.
+///
+/// (3) for any insn that contains an immediate field, check the following
+/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
+/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
+/// instructions that require a 32-bit immediate have a short-form encoding
+/// when the imm is in simm8 range.
+///
+/// Rules (1), (2) and (3) don't apply for registers within address expressions
+/// (`Addr`s). Those are already pretty well tested, and the registers in them
+/// don't have any effect on the containing instruction (apart from possibly
+/// requiring REX prefix bits).
+///
+/// When choosing registers for a test, avoid using registers with the same
+/// offset within a given group. For example, don't use rax and r8, since they
+/// both have the lowest 3 bits as 000, and so the test won't detect errors
+/// where those 3-bit register sub-fields are confused by the emitter. Instead
+/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
+/// and bpl since they have the same offset in their group; use instead (eg) cl
+/// and sil.
+///
+/// For all instructions, also add a test that uses only low-half registers
+/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
+/// prefixes are correctly omitted. This low-half restriction must apply to
+/// _all_ registers in the insn, even those in address expressions.
+///
+/// Following these rules creates large numbers of test cases, but it's the
+/// only way to make the emitter reliable.
+///
+/// Known possible improvements:
+///
+/// * there's a shorter encoding for shl/shr/sar when the shift amount is exactly 1. (Do we
+/// care?)
+pub(crate) fn emit(
+ inst: &Inst,
+ sink: &mut MachBuffer<Inst>,
+ info: &EmitInfo,
+ state: &mut EmitState,
+) {
+ if let Some(iset_requirement) = inst.isa_requirement() {
+ match iset_requirement {
+ // Cranelift assumes SSE2 at least.
+ InstructionSet::SSE | InstructionSet::SSE2 => {}
+ InstructionSet::SSSE3 => assert!(info.isa_flags.has_ssse3()),
+ InstructionSet::SSE41 => assert!(info.isa_flags.has_sse41()),
+ InstructionSet::SSE42 => assert!(info.isa_flags.has_sse42()),
+ }
+ }
+
+ match inst {
+ Inst::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst: reg_g,
+ } => {
+ let rex = if *is_64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+
+ if *op == AluRmiROpcode::Mul {
+ // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
+ // we have to special-case it.
+ match src {
+ RegMemImm::Reg { reg: reg_e } => {
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ 0x0FAF,
+ 2,
+ reg_g.to_reg(),
+ *reg_e,
+ rex,
+ );
+ }
+
+ RegMemImm::Mem { addr } => {
+ let amode = addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x0FAF,
+ 2,
+ reg_g.to_reg(),
+ &amode,
+ rex,
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+ let opcode = if use_imm8 { 0x6B } else { 0x69 };
+ // Yes, really, reg_g twice.
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 1,
+ reg_g.to_reg(),
+ reg_g.to_reg(),
+ rex,
+ );
+ emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32);
+ }
+ }
+ } else {
+ let (opcode_r, opcode_m, subopcode_i) = match op {
+ AluRmiROpcode::Add => (0x01, 0x03, 0),
+ AluRmiROpcode::Sub => (0x29, 0x2B, 5),
+ AluRmiROpcode::And => (0x21, 0x23, 4),
+ AluRmiROpcode::Or => (0x09, 0x0B, 1),
+ AluRmiROpcode::Xor => (0x31, 0x33, 6),
+ AluRmiROpcode::Mul => panic!("unreachable"),
+ };
+
+ match src {
+ RegMemImm::Reg { reg: reg_e } => {
+ // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
+ // duality). Do this too, so as to be able to compare generated machine
+ // code easily.
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcode_r,
+ 1,
+ *reg_e,
+ reg_g.to_reg(),
+ rex,
+ );
+ // NB: if this is ever extended to handle byte size ops, be sure to retain
+ // redundant REX prefixes.
+ }
+
+ RegMemImm::Mem { addr } => {
+ // Here we revert to the "normal" G-E ordering.
+ let amode = addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcode_m,
+ 1,
+ reg_g.to_reg(),
+ &amode,
+ rex,
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+ let opcode = if use_imm8 { 0x83 } else { 0x81 };
+ // And also here we use the "normal" G-E ordering.
+ let enc_g = int_reg_enc(reg_g.to_reg());
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 1,
+ subopcode_i,
+ enc_g,
+ rex,
+ );
+ emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32);
+ }
+ }
+ }
+ }
+
+ Inst::UnaryRmR { size, op, src, dst } => {
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!(),
+ };
+
+ let (opcode, num_opcodes) = match op {
+ UnaryRmROpcode::Bsr => (0x0fbd, 2),
+ UnaryRmROpcode::Bsf => (0x0fbc, 2),
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => emit_std_reg_reg(
+ sink,
+ prefix,
+ opcode,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ ),
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ num_opcodes,
+ dst.to_reg(),
+ &amode,
+ rex_flags,
+ );
+ }
+ }
+ }
+
+ Inst::Not { size, src } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let subopcode = 2;
+ let src = int_reg_enc(src.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+
+ Inst::Neg { size, src } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let subopcode = 3;
+ let src = int_reg_enc(src.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ } => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ let loc = state.cur_srcloc();
+ sink.add_trap(loc, TrapCode::IntegerDivisionByZero);
+
+ let subopcode = if *signed { 7 } else { 6 };
+ match divisor {
+ RegMem::Reg { reg } => {
+ let src = int_reg_enc(*reg);
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags)
+ }
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_enc_mem(sink, state, prefix, opcode, 1, subopcode, &amode, rex_flags);
+ }
+ }
+ }
+
+ Inst::MulHi { size, signed, rhs } => {
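+            // One-operand mul/imul (F7 /4 and /5): multiplies rax by the operand and leaves the
+            // low half of the product in rax and the high half in rdx (hence "MulHi").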
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!(),
+ };
+
+ let subopcode = if *signed { 5 } else { 4 };
+ match rhs {
+ RegMem::Reg { reg } => {
+ let src = int_reg_enc(*reg);
+ emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
+ }
+ RegMem::Mem { addr: src } => {
+ let amode = src.finalize(state);
+ emit_std_enc_mem(sink, state, prefix, 0xF7, 1, subopcode, &amode, rex_flags);
+ }
+ }
+ }
+
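+        // These encode cbw (66 98), cwd (66 99), cdq (99) and cqo (48 99) respectively: cbw
+        // sign-extends al into ax, while cwd/cdq/cqo broadcast the sign of ax/eax/rax into
+        // dx/edx/rdx, typically ahead of a signed divide.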
+ Inst::SignExtendData { size } => match size {
+ 1 => {
+ sink.put1(0x66);
+ sink.put1(0x98);
+ }
+ 2 => {
+ sink.put1(0x66);
+ sink.put1(0x99);
+ }
+ 4 => sink.put1(0x99),
+ 8 => {
+ sink.put1(0x48);
+ sink.put1(0x99);
+ }
+ _ => unreachable!(),
+ },
+
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ tmp,
+ } => {
+ // Generates the following code sequence:
+ //
+ // ;; check divide by zero:
+ // cmp 0 %divisor
+ // jnz $after_trap
+ // ud2
+ // $after_trap:
+ //
+ // ;; for signed modulo/div:
+ // cmp -1 %divisor
+ // jnz $do_op
+ // ;; for signed modulo, result is 0
+ // mov #0, %rdx
+ // j $done
+ // ;; for signed div, check for integer overflow against INT_MIN of the right size
+ // cmp INT_MIN, %rax
+ // jnz $do_op
+ // ud2
+ //
+ // $do_op:
+ // ;; if signed
+ // cdq ;; sign-extend from rax into rdx
+ // ;; else
+ // mov #0, %rdx
+ // idiv %divisor
+ //
+ // $done:
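+            // Note that for the 64-bit signed case the INT_MIN comparison below needs the
+            // constant 0x8000_0000_0000_0000 materialized in `tmp` first, since it does not fit
+            // in an imm32.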
+ debug_assert!(info.flags().avoid_div_traps());
+
+ // Check if the divisor is zero, first.
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0), divisor.to_reg());
+ inst.emit(sink, info, state);
+
+ let inst = Inst::trap_if(CC::Z, TrapCode::IntegerDivisionByZero);
+ inst.emit(sink, info, state);
+
+ let (do_op, done_label) = if kind.is_signed() {
+ // Now check if the divisor is -1.
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0xffffffff), divisor.to_reg());
+ inst.emit(sink, info, state);
+
+ let do_op = sink.get_label();
+
+ // If not equal, jump to do-op.
+ one_way_jmp(sink, CC::NZ, do_op);
+
+ // Here, divisor == -1.
+ if !kind.is_div() {
+ // x % -1 = 0; put the result into the destination, $rdx.
+ let done_label = sink.get_label();
+
+ let inst = Inst::imm(
+ OperandSize::from_bytes(*size as u32),
+ 0,
+ Writable::from_reg(regs::rdx()),
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done_label);
+ inst.emit(sink, info, state);
+
+ (Some(do_op), Some(done_label))
+ } else {
+ // Check for integer overflow.
+ if *size == 8 {
+ let tmp = tmp.expect("temporary for i64 sdiv");
+
+ let inst = Inst::imm(OperandSize::Size64, 0x8000000000000000, tmp);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(8, RegMemImm::reg(tmp.to_reg()), regs::rax());
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0x80000000), regs::rax());
+ inst.emit(sink, info, state);
+ }
+
+ // If not equal, jump over the trap.
+ let inst = Inst::trap_if(CC::Z, TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+
+ (Some(do_op), None)
+ }
+ } else {
+ (None, None)
+ };
+
+ if let Some(do_op) = do_op {
+ sink.bind_label(do_op);
+ }
+
+ assert!(
+ *size > 1,
+ "CheckedDivOrRemSeq for i8 is not yet implemented"
+ );
+
+ // Fill in the high parts:
+ if kind.is_signed() {
+                // For signed opcodes, broadcast the sign bit of rax into rdx (cdq/cqo and friends).
+ let inst = Inst::sign_extend_data(*size);
+ inst.emit(sink, info, state);
+ } else {
+ // zero for unsigned opcodes.
+ let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx()));
+ inst.emit(sink, info, state);
+ }
+
+ let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor.to_reg()));
+ inst.emit(sink, info, state);
+
+ // Lowering takes care of moving the result back into the right register, see comment
+ // there.
+
+ if let Some(done) = done_label {
+ sink.bind_label(done);
+ }
+ }
+
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ } => {
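+            // Depending on `dst_is_64` and the value, this uses one of three encodings: the
+            // sign-extending REX.W C7 /0 imm32 form (7 bytes), the full movabs REX.W B8+rd imm64
+            // form (10 bytes), or the plain 32-bit B8+rd imm32 form (5 or 6 bytes).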
+ let enc_dst = int_reg_enc(dst.to_reg());
+ if *dst_is_64 {
+ if low32_will_sign_extend_to_64(*simm64) {
+ // Sign-extended move imm32.
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xC7,
+ 1,
+ /* subopcode */ 0,
+ enc_dst,
+ RexFlags::set_w(),
+ );
+ sink.put4(*simm64 as u32);
+ } else {
+ sink.put1(0x48 | ((enc_dst >> 3) & 1));
+ sink.put1(0xB8 | (enc_dst & 7));
+ sink.put8(*simm64);
+ }
+ } else {
+ if ((enc_dst >> 3) & 1) == 1 {
+ sink.put1(0x41);
+ }
+ sink.put1(0xB8 | (enc_dst & 7));
+ sink.put4(*simm64 as u32);
+ }
+ }
+
+ Inst::MovRR { is_64, src, dst } => {
+ let rex = if *is_64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+ emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex);
+ }
+
+ Inst::MovzxRmR { ext_mode, src, dst } => {
+ let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
+ ExtMode::BL => {
+ // MOVZBL is (REX.W==0) 0F B6 /r
+ (0x0FB6, 2, RexFlags::clear_w())
+ }
+ ExtMode::BQ => {
+ // MOVZBQ is (REX.W==1) 0F B6 /r
+ // I'm not sure why the Intel manual offers different
+ // encodings for MOVZBQ than for MOVZBL. AIUI they should
+ // achieve the same, since MOVZBL is just going to zero out
+ // the upper half of the destination anyway.
+ (0x0FB6, 2, RexFlags::set_w())
+ }
+ ExtMode::WL => {
+ // MOVZWL is (REX.W==0) 0F B7 /r
+ (0x0FB7, 2, RexFlags::clear_w())
+ }
+ ExtMode::WQ => {
+ // MOVZWQ is (REX.W==1) 0F B7 /r
+ (0x0FB7, 2, RexFlags::set_w())
+ }
+ ExtMode::LQ => {
+ // This is just a standard 32 bit load, and we rely on the
+ // default zero-extension rule to perform the extension.
+ // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
+ // don't do here, since it's the same encoding size.
+ // MOV r/m32, r32 is (REX.W==0) 8B /r
+ (0x8B, 1, RexFlags::clear_w())
+ }
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => {
+ match ext_mode {
+ ExtMode::BL | ExtMode::BQ => {
+ // A redundant REX prefix must be emitted for certain register inputs.
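+                        // Without a REX prefix, encodings 4 through 7 in a byte context name
+                        // ah/ch/dh/bh; with any REX present they name spl/bpl/sil/dil, which is
+                        // what is wanted here.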
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ }
+ _ => {}
+ }
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ )
+ }
+
+ RegMem::Mem { addr: src } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ src,
+ rex_flags,
+ )
+ }
+ }
+ }
+
+ Inst::Mov64MR { src, dst } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x8B,
+ 1,
+ dst.to_reg(),
+ src,
+ RexFlags::set_w(),
+ )
+ }
+
+ Inst::LoadEffectiveAddress { addr, dst } => {
+ let amode = addr.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x8D,
+ 1,
+ dst.to_reg(),
+ &amode,
+ RexFlags::set_w(),
+ );
+ }
+
+ Inst::MovsxRmR { ext_mode, src, dst } => {
+ let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
+ ExtMode::BL => {
+ // MOVSBL is (REX.W==0) 0F BE /r
+ (0x0FBE, 2, RexFlags::clear_w())
+ }
+ ExtMode::BQ => {
+ // MOVSBQ is (REX.W==1) 0F BE /r
+ (0x0FBE, 2, RexFlags::set_w())
+ }
+ ExtMode::WL => {
+ // MOVSWL is (REX.W==0) 0F BF /r
+ (0x0FBF, 2, RexFlags::clear_w())
+ }
+ ExtMode::WQ => {
+ // MOVSWQ is (REX.W==1) 0F BF /r
+ (0x0FBF, 2, RexFlags::set_w())
+ }
+ ExtMode::LQ => {
+ // MOVSLQ is (REX.W==1) 63 /r
+ (0x63, 1, RexFlags::set_w())
+ }
+ };
+
+ match src {
+ RegMem::Reg { reg: src } => {
+ match ext_mode {
+ ExtMode::BL | ExtMode::BQ => {
+ // A redundant REX prefix must be emitted for certain register inputs.
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ }
+ _ => {}
+ }
+ emit_std_reg_reg(
+ sink,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ *src,
+ rex_flags,
+ )
+ }
+
+ RegMem::Mem { addr: src } => {
+ let src = &src.finalize(state);
+
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ opcodes,
+ num_opcodes,
+ dst.to_reg(),
+ src,
+ rex_flags,
+ )
+ }
+ }
+ }
+
+ Inst::MovRM { size, src, dst } => {
+ let dst = &dst.finalize(state);
+
+ match size {
+ 1 => {
+ // This is one of the few places where the presence of a
+ // redundant REX prefix changes the meaning of the
+ // instruction.
+ let mut rex = RexFlags::clear_w();
+
+ let enc_src = int_reg_enc(*src);
+ if enc_src >= 4 && enc_src <= 7 {
+ rex.always_emit();
+ };
+
+ // MOV r8, r/m8 is (REX.W==0) 88 /r
+ emit_std_reg_mem(sink, state, LegacyPrefixes::None, 0x88, 1, *src, dst, rex)
+ }
+
+ 2 => {
+ // MOV r16, r/m16 is 66 (REX.W==0) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::_66,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ )
+ }
+
+ 4 => {
+ // MOV r32, r/m32 is (REX.W==0) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ )
+ }
+
+ 8 => {
+ // MOV r64, r/m64 is (REX.W==1) 89 /r
+ emit_std_reg_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0x89,
+ 1,
+ *src,
+ dst,
+ RexFlags::set_w(),
+ )
+ }
+
+ _ => panic!("x64::Inst::Mov_R_M::emit: unreachable"),
+ }
+ }
+
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ } => {
+ let enc_dst = int_reg_enc(dst.to_reg());
+ let subopcode = match kind {
+ ShiftKind::RotateLeft => 0,
+ ShiftKind::RotateRight => 1,
+ ShiftKind::ShiftLeft => 4,
+ ShiftKind::ShiftRightLogical => 5,
+ ShiftKind::ShiftRightArithmetic => 7,
+ };
+
+ match num_bits {
+ None => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode
+ // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode
+ // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
+ // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
+ }
+
+ Some(num_bits) => {
+ let (opcode, prefix, rex_flags) = match size {
+ 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()),
+ 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("{}", size),
+ };
+
+ // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode
+ // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode
+ // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
+ // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
+ // When the shift amount is 1, there's an even shorter encoding, but we don't
+ // bother with that nicety here.
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
+ sink.put1(*num_bits);
+ }
+ }
+ }
+
+ Inst::XmmRmiReg { opcode, src, dst } => {
+ let rex = RexFlags::clear_w();
+ let prefix = LegacyPrefixes::_66;
+ if let RegMemImm::Imm { simm32 } = src {
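+                // In the immediate forms, the xmm destination goes in ModRM's r/m field and the
+                // "reg" field carries the /digit opcode extension below; the shift amount follows
+                // as an imm8.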
+ let (opcode_bytes, reg_digit) = match opcode {
+ SseOpcode::Psllw => (0x0F71, 6),
+ SseOpcode::Pslld => (0x0F72, 6),
+ SseOpcode::Psllq => (0x0F73, 6),
+ SseOpcode::Psraw => (0x0F71, 4),
+ SseOpcode::Psrad => (0x0F72, 4),
+ SseOpcode::Psrlw => (0x0F71, 2),
+ SseOpcode::Psrld => (0x0F72, 2),
+ SseOpcode::Psrlq => (0x0F73, 2),
+ _ => panic!("invalid opcode: {}", opcode),
+ };
+ let dst_enc = reg_enc(dst.to_reg());
+ emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex);
+ let imm = (*simm32)
+ .try_into()
+ .expect("the immediate must be convertible to a u8");
+ sink.put1(imm);
+ } else {
+ let opcode_bytes = match opcode {
+ SseOpcode::Psllw => 0x0FF1,
+ SseOpcode::Pslld => 0x0FF2,
+ SseOpcode::Psllq => 0x0FF3,
+ SseOpcode::Psraw => 0x0FE1,
+ SseOpcode::Psrad => 0x0FE2,
+ SseOpcode::Psrlw => 0x0FD1,
+ SseOpcode::Psrld => 0x0FD2,
+ SseOpcode::Psrlq => 0x0FD3,
+ _ => panic!("invalid opcode: {}", opcode),
+ };
+
+ match src {
+ RegMemImm::Reg { reg } => {
+ emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst.to_reg(), *reg, rex);
+ }
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode_bytes,
+ 2,
+ dst.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ RegMemImm::Imm { .. } => unreachable!(),
+ }
+ };
+ }
+
+ Inst::CmpRmiR {
+ size,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let mut prefix = LegacyPrefixes::None;
+ if *size == 2 {
+ prefix = LegacyPrefixes::_66;
+ }
+
+ let mut rex = match size {
+ 8 => RexFlags::set_w(),
+ 4 | 2 => RexFlags::clear_w(),
+ 1 => {
+ let mut rex = RexFlags::clear_w();
+ // Here, a redundant REX prefix changes the meaning of the instruction.
+ let enc_g = int_reg_enc(*reg_g);
+ if enc_g >= 4 && enc_g <= 7 {
+ rex.always_emit();
+ }
+ rex
+ }
+ _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"),
+ };
+
+ match src_e {
+ RegMemImm::Reg { reg: reg_e } => {
+ if *size == 1 {
+ // Check whether the E register forces the use of a redundant REX.
+ let enc_e = int_reg_enc(*reg_e);
+ if enc_e >= 4 && enc_e <= 7 {
+ rex.always_emit();
+ }
+ }
+
+ // Use the swapped operands encoding, to stay consistent with the output of
+ // gcc/llvm.
+ let opcode = if *size == 1 { 0x38 } else { 0x39 };
+ emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex);
+ }
+
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ // Whereas here we revert to the "normal" G-E ordering.
+ let opcode = if *size == 1 { 0x3A } else { 0x3B };
+ emit_std_reg_mem(sink, state, prefix, opcode, 1, *reg_g, addr, rex);
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ // FIXME JRS 2020Feb11: there are shorter encodings for
+ // cmp $imm, rax/eax/ax/al.
+ let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+
+ // And also here we use the "normal" G-E ordering.
+ let opcode = if *size == 1 {
+ 0x80
+ } else if use_imm8 {
+ 0x83
+ } else {
+ 0x81
+ };
+
+ let enc_g = int_reg_enc(*reg_g);
+ emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex);
+ emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32);
+ }
+ }
+ }
+
+ Inst::Setcc { cc, dst } => {
+ let opcode = 0x0f90 + cc.get_enc() as u32;
+ let mut rex_flags = RexFlags::clear_w();
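+            // SETcc writes a byte register; always requesting a REX prefix (even one with no
+            // payload bits set) makes encodings 4-7 refer to spl/bpl/sil/dil rather than
+            // ah/ch/dh/bh. For the remaining registers the extra prefix is redundant but harmless.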
+ rex_flags.always_emit();
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ opcode,
+ 2,
+ 0,
+ reg_enc(dst.to_reg()),
+ rex_flags,
+ );
+ }
+
+ Inst::Cmove {
+ size,
+ cc,
+ src,
+ dst: reg_g,
+ } => {
+ let (prefix, rex_flags) = match size {
+ 2 => (LegacyPrefixes::_66, RexFlags::clear_w()),
+ 4 => (LegacyPrefixes::None, RexFlags::clear_w()),
+ 8 => (LegacyPrefixes::None, RexFlags::set_w()),
+ _ => unreachable!("invalid size spec for cmove"),
+ };
+ let opcode = 0x0F40 + cc.get_enc() as u32;
+ match src {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex_flags);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ 2,
+ reg_g.to_reg(),
+ addr,
+ rex_flags,
+ );
+ }
+ }
+ }
+
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ } => {
+ // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
+ // this doesn't clobber flags. Make sure to not do so here.
+ let next = sink.get_label();
+
+ // Jump if cc is *not* set.
+ one_way_jmp(sink, cc.invert(), next);
+
+ let op = if *is_64 {
+ SseOpcode::Movsd
+ } else {
+ SseOpcode::Movss
+ };
+ let inst = Inst::xmm_unary_rm_r(op, src.clone(), *dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(next);
+ }
+
+ Inst::Push64 { src } => {
+ match src {
+ RegMemImm::Reg { reg } => {
+ let enc_reg = int_reg_enc(*reg);
+ let rex = 0x40 | ((enc_reg >> 3) & 1);
+ if rex != 0x40 {
+ sink.put1(rex);
+ }
+ sink.put1(0x50 | (enc_reg & 7));
+ }
+
+ RegMemImm::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 6, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMemImm::Imm { simm32 } => {
+ if low8_will_sign_extend_to_64(*simm32) {
+ sink.put1(0x6A);
+ sink.put1(*simm32 as u8);
+ } else {
+ sink.put1(0x68);
+ sink.put4(*simm32);
+ }
+ }
+ }
+ }
+
+ Inst::Pop64 { dst } => {
+ let enc_dst = int_reg_enc(dst.to_reg());
+ if enc_dst >= 8 {
+ // 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant here.
+ sink.put1(0x41);
+ }
+ sink.put1(0x58 + (enc_dst & 7));
+ }
+
+ Inst::CallKnown { dest, opcode, .. } => {
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
+ }
+ sink.put1(0xE8);
+ // The addend adjusts for the difference between the end of the instruction and the
+ // beginning of the immediate field.
+ emit_reloc(sink, state, Reloc::X86CallPCRel4, &dest, -4);
+ sink.put4(0);
+ if opcode.is_call() {
+ let loc = state.cur_srcloc();
+ sink.add_call_site(loc, *opcode);
+ }
+ }
+
+ Inst::CallUnknown { dest, opcode, .. } => {
+ let start_offset = sink.cur_offset();
+ match dest {
+ RegMem::Reg { reg } => {
+ let reg_enc = int_reg_enc(*reg);
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 2, /*subopcode*/
+ reg_enc,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 2, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+ }
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s);
+ }
+ if opcode.is_call() {
+ let loc = state.cur_srcloc();
+ sink.add_call_site(loc, *opcode);
+ }
+ }
+
+ Inst::Ret {} => sink.put1(0xC3),
+
+ Inst::JmpKnown { dst } => {
+ let br_start = sink.cur_offset();
+ let br_disp_off = br_start + 1;
+ let br_end = br_start + 5;
+
+ sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32);
+ sink.add_uncond_branch(br_start, br_end, *dst);
+
+ sink.put1(0xE9);
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpIf { cc, taken } => {
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+
+ sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
+ // Since this is not a terminator, don't enroll in the branch inversion mechanism.
+
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ } => {
+ // If taken.
+ let cond_start = sink.cur_offset();
+ let cond_disp_off = cond_start + 2;
+ let cond_end = cond_start + 6;
+
+ sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
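+            // `inverted` is the machine code of the opposite-sense Jcc, handed to the buffer so
+            // that its branch-simplification logic can substitute it if it later decides to
+            // invert this conditional (e.g. when the taken target becomes the fallthrough block).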
+ let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
+ sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]);
+
+ sink.put1(0x0F);
+ sink.put1(0x80 + cc.get_enc());
+ // Placeholder for the label value.
+ sink.put4(0x0);
+
+ // If not taken.
+ let uncond_start = sink.cur_offset();
+ let uncond_disp_off = uncond_start + 1;
+ let uncond_end = uncond_start + 5;
+
+ sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
+ sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
+
+ sink.put1(0xE9);
+ // Placeholder for the label value.
+ sink.put4(0x0);
+ }
+
+ Inst::JmpUnknown { target } => {
+ match target {
+ RegMem::Reg { reg } => {
+ let reg_enc = int_reg_enc(*reg);
+ emit_std_enc_enc(
+ sink,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 4, /*subopcode*/
+ reg_enc,
+ RexFlags::clear_w(),
+ );
+ }
+
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_enc_mem(
+ sink,
+ state,
+ LegacyPrefixes::None,
+ 0xFF,
+ 1,
+ 4, /*subopcode*/
+ addr,
+ RexFlags::clear_w(),
+ );
+ }
+ }
+ }
+
+ Inst::JmpTableSeq {
+ idx,
+ tmp1,
+ tmp2,
+ ref targets,
+ default_target,
+ ..
+ } => {
+ // This sequence is *one* instruction in the vcode, and is expanded only here at
+ // emission time, because we cannot allow the regalloc to insert spills/reloads in
+ // the middle; we depend on hardcoded PC-rel addressing below.
+ //
+ // We don't have to worry about emitting islands, because the only label-use type has a
+ // maximum range of 2 GB. If we later consider using shorter-range label references,
+ // this will need to be revisited.
+
+            // Save the index in a tmp: the live range of `idx` only goes to the start of this
+            // sequence, so `tmp1` or `tmp2` may overwrite it.
+
+ // We generate the following sequence:
+ // ;; generated by lowering: cmp #jmp_table_size, %idx
+ // jnb $default_target
+ // movl %idx, %tmp2
+ // lea start_of_jump_table_offset(%rip), %tmp1
+ // movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
+ // addq %tmp2, %tmp1
+ // j *%tmp1
+ // $start_of_jump_table:
+ // -- jump table entries
+ one_way_jmp(sink, CC::NB, *default_target); // idx unsigned >= jmp table size
+
+ // Copy the index (and make sure to clear the high 32-bits lane of tmp2).
+ let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(*idx), *tmp2);
+ inst.emit(sink, info, state);
+
+ // Load base address of jump table.
+ let start_of_jumptable = sink.get_label();
+ let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), *tmp1);
+ inst.emit(sink, info, state);
+
+ // Load value out of the jump table. It's a relative offset to the target block, so it
+ // might be negative; use a sign-extension.
+ let inst = Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg_reg_shift(0, tmp1.to_reg(), tmp2.to_reg(), 2)),
+ *tmp2,
+ );
+ inst.emit(sink, info, state);
+
+ // Add base of jump table to jump-table-sourced block offset.
+ let inst = Inst::alu_rmi_r(
+ true, /* is_64 */
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ *tmp1,
+ );
+ inst.emit(sink, info, state);
+
+ // Branch to computed address.
+ let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg()));
+ inst.emit(sink, info, state);
+
+ // Emit jump table (table of 32-bit offsets).
+ sink.bind_label(start_of_jumptable);
+ let jt_off = sink.cur_offset();
+ for &target in targets.iter() {
+ let word_off = sink.cur_offset();
+ // off_into_table is an addend here embedded in the label to be later patched at
+ // the end of codegen. The offset is initially relative to this jump table entry;
+ // with the extra addend, it'll be relative to the jump table's start, after
+ // patching.
+ let off_into_table = word_off - jt_off;
+ sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
+ sink.put4(off_into_table);
+ }
+ }
+
+ Inst::TrapIf { cc, trap_code } => {
+ let else_label = sink.get_label();
+
+ // Jump over if the invert of CC is set (i.e. CC is not set).
+ one_way_jmp(sink, cc.invert(), else_label);
+
+ // Trap!
+ let inst = Inst::trap(*trap_code);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(else_label);
+ }
+
+ Inst::XmmUnaryRmR {
+ op,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let rex = RexFlags::clear_w();
+
+ let (prefix, opcode, num_opcodes) = match op {
+ SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
+ SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
+ SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
+ SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2),
+ SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2),
+ SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
+ SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
+ SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2),
+ SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2),
+ SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3),
+ SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3),
+ SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3),
+ SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2),
+ SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
+ SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
+ SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(
+ sink,
+ prefix,
+ opcode,
+ num_opcodes,
+ reg_g.to_reg(),
+ *reg_e,
+ rex,
+ );
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ num_opcodes,
+ reg_g.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ };
+ }
+
+ Inst::XmmRmR {
+ op,
+ src: src_e,
+ dst: reg_g,
+ } => {
+ let rex = RexFlags::clear_w();
+ let (prefix, opcode, length) = match op {
+ SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2),
+ SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2),
+ SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2),
+ SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2),
+ SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2),
+ SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
+ SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
+ SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
+ SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
+ SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
+ SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
+ SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
+ SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
+ SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
+ SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
+ SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
+ SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
+ SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
+ SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
+ SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
+ SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
+ SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
+ SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
+ SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
+ SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
+ SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
+ SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
+ SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
+ SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
+ SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
+ SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
+ SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
+ SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
+ SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2),
+ SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2),
+ SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
+ SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
+ SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
+ SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
+ SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
+ SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
+ SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
+ SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
+ SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
+ SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
+ SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3),
+ SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2),
+ SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
+ SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
+ SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
+ SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
+ SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
+ SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
+ SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2),
+ SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3),
+ SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3),
+ SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3),
+ SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2),
+ SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3),
+ SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
+ SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
+ SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
+ SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
+ SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
+ SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
+ SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2),
+ SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
+ SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
+ SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
+ SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
+ SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2),
+ SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2),
+ SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2),
+ SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
+ SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
+ SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
+ SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
+ SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
+ SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
+ SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
+ SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
+ SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ length,
+ reg_g.to_reg(),
+ addr,
+ rex,
+ );
+ }
+ }
+ }
+
+ Inst::XmmMinMaxSeq {
+ size,
+ is_min,
+ lhs,
+ rhs_dst,
+ } => {
+ // Generates the following sequence:
+ // cmpss/cmpsd %lhs, %rhs_dst
+ // jnz do_min_max
+ // jp propagate_nan
+ //
+ // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
+ // {and,or}{ss,sd} %lhs, %rhs_dst
+ // j done
+ //
+ // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
+ // ;; NaN value is returned), we add both inputs.
+ // propagate_nan:
+ // add{ss,sd} %lhs, %rhs_dst
+ // j done
+ //
+ // do_min_max:
+ // {min,max}{ss,sd} %lhs, %rhs_dst
+ //
+ // done:
+ let done = sink.get_label();
+ let propagate_nan = sink.get_label();
+ let do_min_max = sink.get_label();
+
+ let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
+ OperandSize::Size32 => (
+ SseOpcode::Addss,
+ SseOpcode::Ucomiss,
+ SseOpcode::Andps,
+ SseOpcode::Orps,
+ if *is_min {
+ SseOpcode::Minss
+ } else {
+ SseOpcode::Maxss
+ },
+ ),
+ OperandSize::Size64 => (
+ SseOpcode::Addsd,
+ SseOpcode::Ucomisd,
+ SseOpcode::Andpd,
+ SseOpcode::Orpd,
+ if *is_min {
+ SseOpcode::Minsd
+ } else {
+ SseOpcode::Maxsd
+ },
+ ),
+ };
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(*lhs), rhs_dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NZ, do_min_max);
+ one_way_jmp(sink, CC::P, propagate_nan);
+
+ // Ordered and equal. The operands are bit-identical unless they are zero
+ // and negative zero. These instructions merge the sign bits in that
+ // case, and are no-ops otherwise.
+ let op = if *is_min { or_op } else { and_op };
+ let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ // x86's min/max are not symmetric; if either operand is a NaN, they return the
+ // read-only operand: perform an addition between the two operands, which has the
+ // desired NaN propagation effects.
+ sink.bind_label(propagate_nan);
+ let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::P, done);
+
+ sink.bind_label(do_min_max);
+ let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(done);
+ }
+
+ Inst::XmmRmRImm {
+ op,
+ src,
+ dst,
+ imm,
+ is64,
+ } => {
+ let (prefix, opcode, len) = match op {
+ SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
+ SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
+ SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
+ SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
+ SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
+ SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
+ SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
+ SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
+ SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3),
+ SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2),
+ SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3),
+ SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+ let rex = if *is64 {
+ RexFlags::set_w()
+ } else {
+ RexFlags::clear_w()
+ };
+ let regs_swapped = match *op {
+ // These opcodes (and not the SSE2 version of PEXTRW) flip the operand
+ // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field.
+ SseOpcode::Pextrb | SseOpcode::Pextrd => true,
+ // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg,
+ // `src` in ModRM's r/m field.
+ _ => false,
+ };
+ match src {
+ RegMem::Reg { reg } => {
+ if regs_swapped {
+ emit_std_reg_reg(sink, prefix, opcode, len, *reg, dst.to_reg(), rex);
+ } else {
+ emit_std_reg_reg(sink, prefix, opcode, len, dst.to_reg(), *reg, rex);
+ }
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ assert!(
+ !regs_swapped,
+ "No existing way to encode a mem argument in the ModRM r/m field."
+ );
+ emit_std_reg_mem(sink, state, prefix, opcode, len, dst.to_reg(), addr, rex);
+ }
+ }
+ sink.put1(*imm);
+ }
+
+ Inst::XmmLoadConst { src, dst, ty } => {
+ let load_offset = Amode::rip_relative(sink.get_label_for_constant(*src));
+ let load = Inst::load(*ty, load_offset, *dst, ExtKind::None);
+ load.emit(sink, info, state);
+ }
+
+ Inst::XmmUninitializedValue { .. } => {
+ // This instruction format only exists to declare a register as a `def`; no code is
+ // emitted.
+ }
+
+ Inst::XmmMovRM { op, src, dst } => {
+ let (prefix, opcode) = match op {
+ SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
+ SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
+ SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F),
+ SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
+ SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
+ SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
+ SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11),
+ SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11),
+ _ => unimplemented!("Opcode {:?} not implemented", op),
+ };
+ let dst = &dst.finalize(state);
+ emit_std_reg_mem(
+ sink,
+ state,
+ prefix,
+ opcode,
+ 2,
+ *src,
+ dst,
+ RexFlags::clear_w(),
+ );
+ }
+
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ } => {
+ let (prefix, opcode, dst_first) = match op {
+ SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
+ SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
+ // Movd and movq use the same opcode; the presence of the REX prefix (set below)
+ // actually determines which is used.
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
+ SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
+ SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
+ SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
+ _ => panic!("unexpected opcode {:?}", op),
+ };
+ let rex = match dst_size {
+ OperandSize::Size32 => RexFlags::clear_w(),
+ OperandSize::Size64 => RexFlags::set_w(),
+ };
+
+ let (src, dst) = if dst_first {
+ (dst.to_reg(), *src)
+ } else {
+ (*src, dst.to_reg())
+ };
+
+ emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex);
+ }
+
+ Inst::GprToXmm {
+ op,
+ src: src_e,
+ dst: reg_g,
+ src_size,
+ } => {
+ let (prefix, opcode) = match op {
+ // Movd and movq use the same opcode; the presence of the REX prefix (set below)
+ // actually determines which is used.
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
+ SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
+ SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
+ _ => panic!("unexpected opcode {:?}", op),
+ };
+ let rex = match *src_size {
+ OperandSize::Size32 => RexFlags::clear_w(),
+ OperandSize::Size64 => RexFlags::set_w(),
+ };
+ match src_e {
+ RegMem::Reg { reg: reg_e } => {
+ emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcode, 2, reg_g.to_reg(), addr, rex);
+ }
+ }
+ }
+
+ Inst::XmmCmpRmR { op, src, dst } => {
+ let rex = RexFlags::clear_w();
+ let (prefix, opcode, len) = match op {
+ SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
+ SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
+ SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
+ _ => unimplemented!("Emit xmm cmp rm r"),
+ };
+
+ match src {
+ RegMem::Reg { reg } => {
+ emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex);
+ }
+ RegMem::Mem { addr } => {
+ let addr = &addr.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcode, len, *dst, addr, rex);
+ }
+ }
+ }
+
+ Inst::CvtUint64ToFloatSeq {
+ to_f64,
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ } => {
+ // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
+ // different sequence.
+ //
+ // Emit the following sequence:
+ //
+ // cmp 0, %src
+ // jl handle_negative
+ //
+ // ;; handle positive, which can't overflow
+ // cvtsi2sd/cvtsi2ss %src, %dst
+ // j done
+ //
+ // ;; handle negative: see below for an explanation of what it's doing.
+ // handle_negative:
+ // mov %src, %tmp_gpr1
+ // shr $1, %tmp_gpr1
+ // mov %src, %tmp_gpr2
+ // and $1, %tmp_gpr2
+ // or %tmp_gpr1, %tmp_gpr2
+ // cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
+ // addsd/addss %dst, %dst
+ //
+ // done:
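+            // The `or` of the LSB acts as a sticky bit ("round to odd"), so the halve-convert-
+            // double path rounds the same way a direct conversion of the full value would. As an
+            // illustration, for src == 0x8000_0000_0000_0000 (2^63), tmp_gpr2 becomes 2^62, which
+            // converts exactly, and the final add doubles it back to 2^63.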
+
+ assert_ne!(src, tmp_gpr1);
+ assert_ne!(src, tmp_gpr2);
+ assert_ne!(tmp_gpr1, tmp_gpr2);
+
+ let handle_negative = sink.get_label();
+ let done = sink.get_label();
+
+            // If x, seen as a signed int64, is not negative, a plain signed conversion will do
+            // the right thing.
+            // TODO: use `test src, src` here instead of comparing against an immediate zero.
+ let inst = Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::L, handle_negative);
+
+ // Handle a positive int64, which is the "easy" case: a signed conversion will do the
+ // right thing.
+ emit_signed_cvt(sink, info, state, src.to_reg(), *dst, *to_f64);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(handle_negative);
+
+ // Divide x by two to get it in range for the signed conversion, keep the LSB, and
+ // scale it back up on the FP side.
+ let inst = Inst::gen_move(*tmp_gpr1, src.to_reg(), types::I64);
+ inst.emit(sink, info, state);
+
+ // tmp_gpr1 := src >> 1
+ let inst = Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(1), *tmp_gpr1);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::gen_move(*tmp_gpr2, src.to_reg(), types::I64);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true, /* 64bits */
+ AluRmiROpcode::And,
+ RegMemImm::imm(1),
+ *tmp_gpr2,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true, /* 64bits */
+ AluRmiROpcode::Or,
+ RegMemImm::reg(tmp_gpr1.to_reg()),
+ *tmp_gpr2,
+ );
+ inst.emit(sink, info, state);
+
+ emit_signed_cvt(sink, info, state, tmp_gpr2.to_reg(), *dst, *to_f64);
+
+ let add_op = if *to_f64 {
+ SseOpcode::Addsd
+ } else {
+ SseOpcode::Addss
+ };
+ let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(done);
+ }
+
+ Inst::CvtFloatToSintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ } => {
+ // Emits the following common sequence:
+ //
+ // cvttss2si/cvttsd2si %src, %dst
+ // cmp %dst, 1
+ // jno done
+ //
+ // Then, for saturating conversions:
+ //
+ // ;; check for NaN
+ // cmpss/cmpsd %src, %src
+ // jnp not_nan
+ // xor %dst, %dst
+ //
+ // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
+ // ;; already in %dst.
+ // xorpd %tmp_xmm, %tmp_xmm
+ // cmpss/cmpsd %src, %tmp_xmm
+ // jnb done
+ // mov/movaps $INT_MAX, %dst
+ //
+ // done:
+ //
+ // Then, for non-saturating conversions:
+ //
+ // ;; check for NaN
+ // cmpss/cmpsd %src, %src
+ // jnp not_nan
+ // ud2 trap BadConversionToInteger
+ //
+ // ;; check if INT_MIN was the correct result, against a magic constant:
+ // not_nan:
+ // movaps/mov $magic, %tmp_gpr
+ // movq/movd %tmp_gpr, %tmp_xmm
+ // cmpss/cmpsd %tmp_xmm, %src
+ // jnb/jnbe $check_positive
+ // ud2 trap IntegerOverflow
+ //
+ // ;; if positive, it was a real overflow
+ // check_positive:
+ // xorpd %tmp_xmm, %tmp_xmm
+            // ucomiss/ucomisd %src, %tmp_xmm
+ // jnb done
+ // ud2 trap IntegerOverflow
+ //
+ // done:
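+            //
+            // For intuition only (not emitted code): cvttss2si/cvttsd2si return the "integer
+            // indefinite" value INT_MIN both on overflow and on NaN, so a slow path is only
+            // needed when %dst == INT_MIN. Roughly, as a Rust-flavored sketch (with a
+            // hypothetical `cvt` standing in for the hardware truncation):
+            //
+            //   let t = cvt(src);              // INT_MIN on NaN or out-of-range input
+            //   if t != i64::MIN { return t; } // fast path: no overflow possible
+            //   // slow path: NaN, a genuine INT_MIN input, or a real overflow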
+
+ let src = src.to_reg();
+
+ let (cast_op, cmp_op, trunc_op) = match src_size {
+ OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si),
+ OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si),
+ };
+
+ let done = sink.get_label();
+ let not_nan = sink.get_label();
+
+ // The truncation.
+ let inst = Inst::xmm_to_gpr(trunc_op, src, *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+            // Compare against 1: `dst - 1` sets OF exactly when dst is INT_MIN, which is the
+            // value the truncation produces on overflow (and on NaN).
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(1), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NO, done); // no overflow => done
+
+ // Check for NaN.
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src);
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
+
+ if *is_saturating {
+ // For NaN, emit 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+
+ sink.bind_label(not_nan);
+
+ // If the input was positive, saturate to INT_MAX.
+
+ // Zero out tmp_xmm.
+ let inst =
+ Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
+ inst.emit(sink, info, state);
+
+ // Jump if >= to done.
+ one_way_jmp(sink, CC::NB, done);
+
+ // Otherwise, put INT_MAX.
+ if *dst_size == OperandSize::Size64 {
+ let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, *dst);
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, *dst);
+ inst.emit(sink, info, state);
+ }
+ } else {
+ let check_positive = sink.get_label();
+
+ let inst = Inst::trap(TrapCode::BadConversionToInteger);
+ inst.emit(sink, info, state);
+
+                // Check whether INT_MIN was the correct result: determine the smallest
+                // floating-point number that would convert to INT_MIN, put it in a temporary
+                // register, and compare it against the src register.
+                // If the src register is less than (or, in some cases, less than or equal to)
+                // that threshold, trap!
+
+ sink.bind_label(not_nan);
+
+ let mut no_overflow_cc = CC::NB; // >=
+ let output_bits = dst_size.to_bits();
+ match *src_size {
+ OperandSize::Size32 => {
+ let cst = Ieee32::pow2(output_bits - 1).neg().bits();
+ let inst = Inst::imm(OperandSize::Size32, cst as u64, *tmp_gpr);
+ inst.emit(sink, info, state);
+ }
+ OperandSize::Size64 => {
+ // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
+ // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
+ let cst = if output_bits < 64 {
+ no_overflow_cc = CC::NBE; // >
+ Ieee64::fcvt_to_sint_negative_overflow(output_bits)
+ } else {
+ Ieee64::pow2(output_bits - 1).neg()
+ };
+ let inst = Inst::imm(OperandSize::Size64, cst.bits(), *tmp_gpr);
+ inst.emit(sink, info, state);
+ }
+ }
+
+ let inst =
+ Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src);
+ inst.emit(sink, info, state);
+
+ // jump over trap if src >= or > threshold
+ one_way_jmp(sink, no_overflow_cc, check_positive);
+
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+
+ // If positive, it was a real overflow.
+
+ sink.bind_label(check_positive);
+
+ // Zero out the tmp_xmm register.
+ let inst =
+ Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src
+
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(done);
+ }
+
+ Inst::CvtFloatToUintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ } => {
+ // The only difference in behavior between saturating and non-saturating is how we
+ // handle errors. Emits the following sequence:
+ //
+ // movaps/mov 2**(int_width - 1), %tmp_gpr
+ // movq/movd %tmp_gpr, %tmp_xmm
+            // ucomiss/ucomisd %tmp_xmm, %src
+ // jnb is_large
+ //
+ // ;; check for NaN inputs
+ // jnp not_nan
+ // -- non-saturating: ud2 trap BadConversionToInteger
+ // -- saturating: xor %dst, %dst; j done
+ //
+ // not_nan:
+ // cvttss2si/cvttsd2si %src, %dst
+ // cmp 0, %dst
+ // jnl done
+ // -- non-saturating: ud2 trap IntegerOverflow
+ // -- saturating: xor %dst, %dst; j done
+ //
+ // is_large:
+ // subss/subsd %tmp_xmm, %src ; <-- we clobber %src here
+            // cvttss2si/cvttsd2si %src, %dst
+ // cmp 0, %dst
+ // jnl next_is_large
+ // -- non-saturating: ud2 trap IntegerOverflow
+ // -- saturating: movaps $UINT_MAX, %dst; j done
+ //
+ // next_is_large:
+            // add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
+ //
+ // done:
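+            //
+            // For intuition only (not emitted code): this is the usual unsigned-truncation
+            // trick of rebiasing large inputs by 2^(N-1). A rough Rust-flavored sketch for
+            // the f64 -> u64 case, ignoring the NaN/overflow checks above:
+            //
+            //   let threshold = 2f64.powi(63);
+            //   if src < threshold { src as i64 as u64 }                   // small inputs
+            //   else { ((src - threshold) as i64 as u64) + (1u64 << 63) }  // large inputs, rebiased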
+
+ assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!");
+
+ let (sub_op, cast_op, cmp_op, trunc_op) = if *src_size == OperandSize::Size64 {
+ (
+ SseOpcode::Subsd,
+ SseOpcode::Movq,
+ SseOpcode::Ucomisd,
+ SseOpcode::Cvttsd2si,
+ )
+ } else {
+ (
+ SseOpcode::Subss,
+ SseOpcode::Movd,
+ SseOpcode::Ucomiss,
+ SseOpcode::Cvttss2si,
+ )
+ };
+
+ let done = sink.get_label();
+
+ let cst = if *src_size == OperandSize::Size64 {
+ Ieee64::pow2(dst_size.to_bits() - 1).bits()
+ } else {
+ Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64
+ };
+
+ let inst = Inst::imm(*src_size, cst, *tmp_gpr);
+ inst.emit(sink, info, state);
+
+ let inst =
+ Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src.to_reg());
+ inst.emit(sink, info, state);
+
+ let handle_large = sink.get_label();
+ one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
+
+ let not_nan = sink.get_label();
+ one_way_jmp(sink, CC::NP, not_nan); // jump over trap if not NaN
+
+ if *is_saturating {
+ // Emit 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ // Trap.
+ let inst = Inst::trap(TrapCode::BadConversionToInteger);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(not_nan);
+
+            // Actual truncation for small inputs: if the result is negative, then we had an
+            // overflow.
+
+ let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
+
+ if *is_saturating {
+ // The input was "small" (< 2**(width -1)), so the only way to get an integer
+ // overflow is because the input was too small: saturate to the min value, i.e. 0.
+ let inst = Inst::alu_rmi_r(
+ *dst_size == OperandSize::Size64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(dst.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ // Trap.
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ // Now handle large inputs.
+
+ sink.bind_label(handle_large);
+
+ let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg());
+ inst.emit(sink, info, state);
+
+ let next_is_large = sink.get_label();
+ one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
+
+ if *is_saturating {
+ // The input was "large" (>= 2**(width -1)), so the only way to get an integer
+ // overflow is because the input was too large: saturate to the max value.
+ let inst = Inst::imm(
+ OperandSize::Size64,
+ if *dst_size == OperandSize::Size64 {
+ u64::max_value()
+ } else {
+ u32::max_value() as u64
+ },
+ *dst,
+ );
+ inst.emit(sink, info, state);
+
+ let inst = Inst::jmp_known(done);
+ inst.emit(sink, info, state);
+ } else {
+ let inst = Inst::trap(TrapCode::IntegerOverflow);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(next_is_large);
+
+ if *dst_size == OperandSize::Size64 {
+ let inst = Inst::imm(OperandSize::Size64, 1 << 63, *tmp_gpr);
+ inst.emit(sink, info, state);
+
+ let inst = Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp_gpr.to_reg()),
+ *dst,
+ );
+ inst.emit(sink, info, state);
+ } else {
+ let inst =
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(1 << 31), *dst);
+ inst.emit(sink, info, state);
+ }
+
+ sink.bind_label(done);
+ }
+
+ Inst::LoadExtName { dst, name, offset } => {
+ // The full address can be encoded in the register, with a relocation.
+ // Generates: movabsq $name, %dst
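+            // Encoding note: this is REX.W + (B8+rd) io, i.e. `mov r64, imm64`; the low three
+            // bits of the destination register select the opcode byte and bit 3 goes into
+            // REX.B, which is why the bytes are assembled by hand here.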
+ let enc_dst = int_reg_enc(dst.to_reg());
+ sink.put1(0x48 | ((enc_dst >> 3) & 1));
+ sink.put1(0xB8 | (enc_dst & 7));
+ emit_reloc(sink, state, Reloc::Abs8, name, *offset);
+ if info.flags().emit_all_ones_funcaddrs() {
+ sink.put8(u64::max_value());
+ } else {
+ sink.put8(0);
+ }
+ }
+
+ Inst::LockCmpxchg { ty, src, dst } => {
+ // lock cmpxchg{b,w,l,q} %src, (dst)
+ // Note that 0xF0 is the Lock prefix.
+ let (prefix, rex, opcodes) = match *ty {
+ types::I8 => {
+ let mut rex_flags = RexFlags::clear_w();
+ let enc_src = int_reg_enc(*src);
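+                    // Without a REX prefix, encodings 4..=7 would name AH/CH/DH/BH rather
+                    // than SPL/BPL/SIL/DIL, so force a REX byte for those registers.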
+ if enc_src >= 4 && enc_src <= 7 {
+ rex_flags.always_emit();
+ };
+ (LegacyPrefixes::_F0, rex_flags, 0x0FB0)
+ }
+ types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1),
+ types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1),
+ types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1),
+ _ => unreachable!(),
+ };
+ let amode = dst.finalize(state);
+ emit_std_reg_mem(sink, state, prefix, opcodes, 2, *src, &amode, rex);
+ }
+
+ Inst::AtomicRmwSeq { ty, op } => {
+ // Emit this:
+ //
+ // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value
+ // again:
+ // movq %rax, %r11 // rax = old value, r11 = old value
+ // `op`q %r10, %r11 // rax = old value, r11 = new value
+ // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value
+ // jnz again // If this is taken, rax will have a "revised" old value
+ //
+ // Operand conventions:
+ // IN: %r9 (addr), %r10 (2nd arg for `op`)
+ // OUT: %rax (old value), %r11 (trashed), %rflags (trashed)
+ //
+ // In the case where the operation is 'xchg', the "`op`q" instruction is instead
+ // movq %r10, %r11
+            // so that we simply write the "2nd arg for `op`" into the destination.
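+            //
+            // For intuition only (not emitted code): this is a standard compare-and-swap retry
+            // loop. A rough Rust-level sketch, assuming `mem: &AtomicU64`, a second argument
+            // `arg: u64`, and `op` as a closure:
+            //
+            //   let mut old = mem.load(Ordering::SeqCst);
+            //   loop {
+            //       let new = op(old, arg);
+            //       match mem.compare_exchange(old, new, Ordering::SeqCst, Ordering::SeqCst) {
+            //           Ok(_) => break,          // store succeeded; `old` is the value to return
+            //           Err(seen) => old = seen, // lock cmpxchg leaves the observed value in %rax
+            //       }
+            //   }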
+ let rax = regs::rax();
+ let r9 = regs::r9();
+ let r10 = regs::r10();
+ let r11 = regs::r11();
+ let rax_w = Writable::from_reg(rax);
+ let r11_w = Writable::from_reg(r11);
+ let amode = Amode::imm_reg(0, r9);
+ let again_label = sink.get_label();
+
+ // mov{zbq,zwq,zlq,q} (%r9), %rax
+ // No need to call `add_trap` here, since the `i1` emit will do that.
+ let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend);
+ i1.emit(sink, info, state);
+
+ // again:
+ sink.bind_label(again_label);
+
+ // movq %rax, %r11
+ let i2 = Inst::mov_r_r(true, rax, r11_w);
+ i2.emit(sink, info, state);
+
+ // opq %r10, %r11
+ let r10_rmi = RegMemImm::reg(r10);
+ let i3 = if *op == inst_common::AtomicRmwOp::Xchg {
+ Inst::mov_r_r(true, r10, r11_w)
+ } else {
+ let alu_op = match op {
+ inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add,
+ inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub,
+ inst_common::AtomicRmwOp::And => AluRmiROpcode::And,
+ inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or,
+ inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor,
+ inst_common::AtomicRmwOp::Xchg => unreachable!(),
+ };
+ Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w)
+ };
+ i3.emit(sink, info, state);
+
+ // lock cmpxchg{b,w,l,q} %r11, (%r9)
+ // No need to call `add_trap` here, since the `i4` emit will do that.
+ let i4 = Inst::LockCmpxchg {
+ ty: *ty,
+ src: r11,
+ dst: amode.into(),
+ };
+ i4.emit(sink, info, state);
+
+ // jnz again
+ one_way_jmp(sink, CC::NZ, again_label);
+ }
+
+ Inst::Fence { kind } => {
+ sink.put1(0x0F);
+ sink.put1(0xAE);
+ match kind {
+ FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0
+ FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8
+ FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8
+ }
+ }
+
+ Inst::Hlt => {
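+            // Note: 0xCC is the one-byte int3 (breakpoint) encoding rather than the 0xF4 `hlt`
+            // opcode, presumably because `hlt` is privileged.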
+ sink.put1(0xcc);
+ }
+
+ Inst::Ud2 { trap_code } => {
+ let cur_srcloc = state.cur_srcloc();
+ sink.add_trap(cur_srcloc, *trap_code);
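+            // The ud2 itself is the two bytes 0F 0B emitted just below, hence the
+            // `UpcomingBytes(2)` extent for the stack map.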
+ if let Some(s) = state.take_stack_map() {
+ sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s);
+ }
+ sink.put1(0x0f);
+ sink.put1(0x0b);
+ }
+
+ Inst::VirtualSPOffsetAdj { offset } => {
+ debug!(
+ "virtual sp offset adjusted by {} -> {}",
+ offset,
+ state.virtual_sp_offset + offset
+ );
+ state.virtual_sp_offset += offset;
+ }
+
+ Inst::Nop { len } => {
+ // These encodings can all be found in Intel's architecture manual, at the NOP
+ // instruction description.
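+            // Each iteration below greedily emits the largest recommended multi-byte NOP form
+            // (at most 9 bytes) until the requested length is covered.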
+ let mut len = *len;
+ while len != 0 {
+ let emitted = u8::min(len, 9);
+ match emitted {
+ 0 => {}
+ 1 => sink.put1(0x90), // NOP
+ 2 => {
+ // 66 NOP
+ sink.put1(0x66);
+ sink.put1(0x90);
+ }
+ 3 => {
+ // NOP [EAX]
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x00);
+ }
+ 4 => {
+                        // NOP 0[EAX], with 0 a 1-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x40);
+ sink.put1(0x00);
+ }
+ 5 => {
+ // NOP [EAX, EAX, 1]
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x44);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 6 => {
+ // 66 NOP [EAX, EAX, 1]
+ sink.put1(0x66);
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x44);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 7 => {
+                        // NOP 0[EAX], with 0 a 4-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x80);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 8 => {
+                        // NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x84);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ 9 => {
+                        // 66 NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
+ sink.put1(0x66);
+ sink.put1(0x0F);
+ sink.put1(0x1F);
+ sink.put1(0x84);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ sink.put1(0x00);
+ }
+ _ => unreachable!(),
+ }
+ len -= emitted;
+ }
+ }
+
+ Inst::EpiloguePlaceholder => {
+ // Generate no code.
+ }
+ }
+
+ state.clear_post_insn();
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs
new file mode 100644
index 0000000000..06092d498a
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs
@@ -0,0 +1,3593 @@
+//! Tests for the emitter
+//!
+//! See comments at the top of `fn x64_emit` for advice on how to create reliable test cases.
+//!
+//! to see stdout: cargo test -- --nocapture
+//!
+//! for this specific case, as of 24 Aug 2020:
+//!
+//! cd to the top of your wasmtime tree, then:
+//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \
+//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \
+//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \
+//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit
+
+use super::*;
+use crate::isa::test_utils;
+use crate::isa::x64;
+use alloc::vec::Vec;
+
+#[test]
+fn test_x64_emit() {
+ let rax = regs::rax();
+ let rbx = regs::rbx();
+ let rcx = regs::rcx();
+ let rdx = regs::rdx();
+ let rsi = regs::rsi();
+ let rdi = regs::rdi();
+ let rsp = regs::rsp();
+ let rbp = regs::rbp();
+ let r8 = regs::r8();
+ let r9 = regs::r9();
+ let r10 = regs::r10();
+ let r11 = regs::r11();
+ let r12 = regs::r12();
+ let r13 = regs::r13();
+ let r14 = regs::r14();
+ let r15 = regs::r15();
+
+ let xmm0 = regs::xmm0();
+ let xmm1 = regs::xmm1();
+ let xmm2 = regs::xmm2();
+ let xmm3 = regs::xmm3();
+ let xmm4 = regs::xmm4();
+ let xmm5 = regs::xmm5();
+ let xmm6 = regs::xmm6();
+ let xmm7 = regs::xmm7();
+ let xmm8 = regs::xmm8();
+ let xmm9 = regs::xmm9();
+ let xmm10 = regs::xmm10();
+ let xmm11 = regs::xmm11();
+ let xmm12 = regs::xmm12();
+ let xmm13 = regs::xmm13();
+ let xmm14 = regs::xmm14();
+ let xmm15 = regs::xmm15();
+
+ // And Writable<> versions of the same:
+ let w_rax = Writable::<Reg>::from_reg(rax);
+ let w_rbx = Writable::<Reg>::from_reg(rbx);
+ let w_rcx = Writable::<Reg>::from_reg(rcx);
+ let w_rdx = Writable::<Reg>::from_reg(rdx);
+ let w_rsi = Writable::<Reg>::from_reg(rsi);
+ let w_rdi = Writable::<Reg>::from_reg(rdi);
+ let _w_rsp = Writable::<Reg>::from_reg(rsp);
+ let _w_rbp = Writable::<Reg>::from_reg(rbp);
+ let w_r8 = Writable::<Reg>::from_reg(r8);
+ let w_r9 = Writable::<Reg>::from_reg(r9);
+ let _w_r10 = Writable::<Reg>::from_reg(r10);
+ let w_r11 = Writable::<Reg>::from_reg(r11);
+ let w_r12 = Writable::<Reg>::from_reg(r12);
+ let w_r13 = Writable::<Reg>::from_reg(r13);
+ let w_r14 = Writable::<Reg>::from_reg(r14);
+ let w_r15 = Writable::<Reg>::from_reg(r15);
+
+ let w_xmm0 = Writable::<Reg>::from_reg(xmm0);
+ let w_xmm1 = Writable::<Reg>::from_reg(xmm1);
+ let w_xmm2 = Writable::<Reg>::from_reg(xmm2);
+ let w_xmm3 = Writable::<Reg>::from_reg(xmm3);
+ let w_xmm4 = Writable::<Reg>::from_reg(xmm4);
+ let w_xmm5 = Writable::<Reg>::from_reg(xmm5);
+ let w_xmm6 = Writable::<Reg>::from_reg(xmm6);
+ let w_xmm7 = Writable::<Reg>::from_reg(xmm7);
+ let w_xmm8 = Writable::<Reg>::from_reg(xmm8);
+ let w_xmm9 = Writable::<Reg>::from_reg(xmm9);
+ let w_xmm10 = Writable::<Reg>::from_reg(xmm10);
+ let w_xmm11 = Writable::<Reg>::from_reg(xmm11);
+ let w_xmm12 = Writable::<Reg>::from_reg(xmm12);
+ let w_xmm13 = Writable::<Reg>::from_reg(xmm13);
+ let w_xmm14 = Writable::<Reg>::from_reg(xmm14);
+ let w_xmm15 = Writable::<Reg>::from_reg(xmm15);
+
+ let mut insns = Vec::<(Inst, &str, &str)>::new();
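+    // Each entry is (instruction, expected encoding as a hex byte string, expected
+    // pretty-printed assembly).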
+
+ // ========================================================
+ // Cases aimed at checking Addr-esses: IR (Imm + Reg)
+ //
+ // These are just a bunch of loads with all supported (by the emitter)
+ // permutations of address formats.
+ //
+ // Addr_IR, offset zero
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rax), w_rdi),
+ "488B38",
+ "movq 0(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rbx), w_rdi),
+ "488B3B",
+ "movq 0(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rcx), w_rdi),
+ "488B39",
+ "movq 0(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rdx), w_rdi),
+ "488B3A",
+ "movq 0(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rbp), w_rdi),
+ "488B7D00",
+ "movq 0(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rsp), w_rdi),
+ "488B3C24",
+ "movq 0(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rsi), w_rdi),
+ "488B3E",
+ "movq 0(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, rdi), w_rdi),
+ "488B3F",
+ "movq 0(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r8), w_rdi),
+ "498B38",
+ "movq 0(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r9), w_rdi),
+ "498B39",
+ "movq 0(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r10), w_rdi),
+ "498B3A",
+ "movq 0(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r11), w_rdi),
+ "498B3B",
+ "movq 0(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r12), w_rdi),
+ "498B3C24",
+ "movq 0(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r13), w_rdi),
+ "498B7D00",
+ "movq 0(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r14), w_rdi),
+ "498B3E",
+ "movq 0(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0, r15), w_rdi),
+ "498B3F",
+ "movq 0(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset max simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rax), w_rdi),
+ "488B787F",
+ "movq 127(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rbx), w_rdi),
+ "488B7B7F",
+ "movq 127(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rcx), w_rdi),
+ "488B797F",
+ "movq 127(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rdx), w_rdi),
+ "488B7A7F",
+ "movq 127(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rbp), w_rdi),
+ "488B7D7F",
+ "movq 127(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rsp), w_rdi),
+ "488B7C247F",
+ "movq 127(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rsi), w_rdi),
+ "488B7E7F",
+ "movq 127(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, rdi), w_rdi),
+ "488B7F7F",
+ "movq 127(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r8), w_rdi),
+ "498B787F",
+ "movq 127(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r9), w_rdi),
+ "498B797F",
+ "movq 127(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r10), w_rdi),
+ "498B7A7F",
+ "movq 127(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r11), w_rdi),
+ "498B7B7F",
+ "movq 127(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r12), w_rdi),
+ "498B7C247F",
+ "movq 127(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r13), w_rdi),
+ "498B7D7F",
+ "movq 127(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r14), w_rdi),
+ "498B7E7F",
+ "movq 127(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(127, r15), w_rdi),
+ "498B7F7F",
+ "movq 127(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset min simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rax), w_rdi),
+ "488B7880",
+ "movq -128(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbx), w_rdi),
+ "488B7B80",
+ "movq -128(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rcx), w_rdi),
+ "488B7980",
+ "movq -128(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdx), w_rdi),
+ "488B7A80",
+ "movq -128(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbp), w_rdi),
+ "488B7D80",
+ "movq -128(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsp), w_rdi),
+ "488B7C2480",
+ "movq -128(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsi), w_rdi),
+ "488B7E80",
+ "movq -128(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdi), w_rdi),
+ "488B7F80",
+ "movq -128(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r8), w_rdi),
+ "498B7880",
+ "movq -128(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r9), w_rdi),
+ "498B7980",
+ "movq -128(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r10), w_rdi),
+ "498B7A80",
+ "movq -128(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r11), w_rdi),
+ "498B7B80",
+ "movq -128(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r12), w_rdi),
+ "498B7C2480",
+ "movq -128(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r13), w_rdi),
+ "498B7D80",
+ "movq -128(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r14), w_rdi),
+ "498B7E80",
+ "movq -128(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r15), w_rdi),
+ "498B7F80",
+ "movq -128(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset smallest positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rax), w_rdi),
+ "488BB880000000",
+ "movq 128(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rbx), w_rdi),
+ "488BBB80000000",
+ "movq 128(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rcx), w_rdi),
+ "488BB980000000",
+ "movq 128(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rdx), w_rdi),
+ "488BBA80000000",
+ "movq 128(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rbp), w_rdi),
+ "488BBD80000000",
+ "movq 128(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rsp), w_rdi),
+ "488BBC2480000000",
+ "movq 128(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rsi), w_rdi),
+ "488BBE80000000",
+ "movq 128(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, rdi), w_rdi),
+ "488BBF80000000",
+ "movq 128(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r8), w_rdi),
+ "498BB880000000",
+ "movq 128(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r9), w_rdi),
+ "498BB980000000",
+ "movq 128(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r10), w_rdi),
+ "498BBA80000000",
+ "movq 128(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r11), w_rdi),
+ "498BBB80000000",
+ "movq 128(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r12), w_rdi),
+ "498BBC2480000000",
+ "movq 128(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r13), w_rdi),
+ "498BBD80000000",
+ "movq 128(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r14), w_rdi),
+ "498BBE80000000",
+ "movq 128(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(128, r15), w_rdi),
+ "498BBF80000000",
+ "movq 128(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset smallest negative simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rax), w_rdi),
+ "488BB87FFFFFFF",
+ "movq -129(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbx), w_rdi),
+ "488BBB7FFFFFFF",
+ "movq -129(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rcx), w_rdi),
+ "488BB97FFFFFFF",
+ "movq -129(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdx), w_rdi),
+ "488BBA7FFFFFFF",
+ "movq -129(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbp), w_rdi),
+ "488BBD7FFFFFFF",
+ "movq -129(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsp), w_rdi),
+ "488BBC247FFFFFFF",
+ "movq -129(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsi), w_rdi),
+ "488BBE7FFFFFFF",
+ "movq -129(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdi), w_rdi),
+ "488BBF7FFFFFFF",
+ "movq -129(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r8), w_rdi),
+ "498BB87FFFFFFF",
+ "movq -129(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r9), w_rdi),
+ "498BB97FFFFFFF",
+ "movq -129(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r10), w_rdi),
+ "498BBA7FFFFFFF",
+ "movq -129(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r11), w_rdi),
+ "498BBB7FFFFFFF",
+ "movq -129(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r12), w_rdi),
+ "498BBC247FFFFFFF",
+ "movq -129(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r13), w_rdi),
+ "498BBD7FFFFFFF",
+ "movq -129(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r14), w_rdi),
+ "498BBE7FFFFFFF",
+ "movq -129(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r15), w_rdi),
+ "498BBF7FFFFFFF",
+ "movq -129(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset large positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rax), w_rdi),
+ "488BB877207317",
+ "movq 393420919(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbx), w_rdi),
+ "488BBB77207317",
+ "movq 393420919(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rcx), w_rdi),
+ "488BB977207317",
+ "movq 393420919(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdx), w_rdi),
+ "488BBA77207317",
+ "movq 393420919(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbp), w_rdi),
+ "488BBD77207317",
+ "movq 393420919(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsp), w_rdi),
+ "488BBC2477207317",
+ "movq 393420919(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsi), w_rdi),
+ "488BBE77207317",
+ "movq 393420919(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdi), w_rdi),
+ "488BBF77207317",
+ "movq 393420919(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r8), w_rdi),
+ "498BB877207317",
+ "movq 393420919(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r9), w_rdi),
+ "498BB977207317",
+ "movq 393420919(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r10), w_rdi),
+ "498BBA77207317",
+ "movq 393420919(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r11), w_rdi),
+ "498BBB77207317",
+ "movq 393420919(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r12), w_rdi),
+ "498BBC2477207317",
+ "movq 393420919(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r13), w_rdi),
+ "498BBD77207317",
+ "movq 393420919(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r14), w_rdi),
+ "498BBE77207317",
+ "movq 393420919(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(0x17732077, r15), w_rdi),
+ "498BBF77207317",
+ "movq 393420919(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Addr_IR, offset large negative simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rax), w_rdi),
+ "488BB8D9A6BECE",
+ "movq -826366247(%rax), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbx), w_rdi),
+ "488BBBD9A6BECE",
+ "movq -826366247(%rbx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rcx), w_rdi),
+ "488BB9D9A6BECE",
+ "movq -826366247(%rcx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdx), w_rdi),
+ "488BBAD9A6BECE",
+ "movq -826366247(%rdx), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbp), w_rdi),
+ "488BBDD9A6BECE",
+ "movq -826366247(%rbp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsp), w_rdi),
+ "488BBC24D9A6BECE",
+ "movq -826366247(%rsp), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsi), w_rdi),
+ "488BBED9A6BECE",
+ "movq -826366247(%rsi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdi), w_rdi),
+ "488BBFD9A6BECE",
+ "movq -826366247(%rdi), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r8), w_rdi),
+ "498BB8D9A6BECE",
+ "movq -826366247(%r8), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r9), w_rdi),
+ "498BB9D9A6BECE",
+ "movq -826366247(%r9), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r10), w_rdi),
+ "498BBAD9A6BECE",
+ "movq -826366247(%r10), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r11), w_rdi),
+ "498BBBD9A6BECE",
+ "movq -826366247(%r11), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r12), w_rdi),
+ "498BBC24D9A6BECE",
+ "movq -826366247(%r12), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r13), w_rdi),
+ "498BBDD9A6BECE",
+ "movq -826366247(%r13), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r14), w_rdi),
+ "498BBED9A6BECE",
+ "movq -826366247(%r14), %rdi",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r15), w_rdi),
+ "498BBFD9A6BECE",
+ "movq -826366247(%r15), %rdi",
+ ));
+
+ // ========================================================
+ // Cases aimed at checking Addr-esses: IRRS (Imm + Reg + (Reg << Shift))
+ // Note these don't check the case where the index reg is RSP, since we
+ // don't encode any of those.
+ //
+ // Addr_IRRS, offset max simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rax, 0), w_r11),
+ "4C8B5C007F",
+ "movq 127(%rax,%rax,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rax, 1), w_r11),
+ "4C8B5C477F",
+ "movq 127(%rdi,%rax,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rax, 2), w_r11),
+ "4D8B5C807F",
+ "movq 127(%r8,%rax,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rax, 3), w_r11),
+ "4D8B5CC77F",
+ "movq 127(%r15,%rax,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rdi, 3), w_r11),
+ "4C8B5CF87F",
+ "movq 127(%rax,%rdi,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rdi, 2), w_r11),
+ "4C8B5CBF7F",
+ "movq 127(%rdi,%rdi,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rdi, 1), w_r11),
+ "4D8B5C787F",
+ "movq 127(%r8,%rdi,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rdi, 0), w_r11),
+ "4D8B5C3F7F",
+ "movq 127(%r15,%rdi,1), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset min simm8
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r8, 2), w_r11),
+ "4E8B5C8080",
+ "movq -128(%rax,%r8,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r8, 3), w_r11),
+ "4E8B5CC780",
+ "movq -128(%rdi,%r8,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r8, 0), w_r11),
+ "4F8B5C0080",
+ "movq -128(%r8,%r8,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r8, 1), w_r11),
+ "4F8B5C4780",
+ "movq -128(%r15,%r8,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r15, 1), w_r11),
+ "4E8B5C7880",
+ "movq -128(%rax,%r15,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r15, 0), w_r11),
+ "4E8B5C3F80",
+ "movq -128(%rdi,%r15,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r15, 3), w_r11),
+ "4F8B5CF880",
+ "movq -128(%r8,%r15,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r15, 2), w_r11),
+ "4F8B5CBF80",
+ "movq -128(%r15,%r15,4), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset large positive simm32
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rax, 0), w_r11),
+ "4C8B9C00BE25664F",
+ "movq 1332094398(%rax,%rax,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rax, 1), w_r11),
+ "4C8B9C47BE25664F",
+ "movq 1332094398(%rdi,%rax,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rax, 2), w_r11),
+ "4D8B9C80BE25664F",
+ "movq 1332094398(%r8,%rax,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rax, 3), w_r11),
+ "4D8B9CC7BE25664F",
+ "movq 1332094398(%r15,%rax,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rdi, 3), w_r11),
+ "4C8B9CF8BE25664F",
+ "movq 1332094398(%rax,%rdi,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rdi, 2), w_r11),
+ "4C8B9CBFBE25664F",
+ "movq 1332094398(%rdi,%rdi,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rdi, 1), w_r11),
+ "4D8B9C78BE25664F",
+ "movq 1332094398(%r8,%rdi,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rdi, 0), w_r11),
+ "4D8B9C3FBE25664F",
+ "movq 1332094398(%r15,%rdi,1), %r11",
+ ));
+
+ // ========================================================
+ // Addr_IRRS, offset large negative simm32
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r8, 2),
+ w_r11,
+ ),
+ "4E8B9C8070E9B2D9",
+ "movq -642586256(%rax,%r8,4), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r8, 3),
+ w_r11,
+ ),
+ "4E8B9CC770E9B2D9",
+ "movq -642586256(%rdi,%r8,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r8, 0),
+ w_r11,
+ ),
+ "4F8B9C0070E9B2D9",
+ "movq -642586256(%r8,%r8,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r8, 1),
+ w_r11,
+ ),
+ "4F8B9C4770E9B2D9",
+ "movq -642586256(%r15,%r8,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r15, 1),
+ w_r11,
+ ),
+ "4E8B9C7870E9B2D9",
+ "movq -642586256(%rax,%r15,2), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r15, 0),
+ w_r11,
+ ),
+ "4E8B9C3F70E9B2D9",
+ "movq -642586256(%rdi,%r15,1), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r15, 3),
+ w_r11,
+ ),
+ "4F8B9CF870E9B2D9",
+ "movq -642586256(%r8,%r15,8), %r11",
+ ));
+ insns.push((
+ Inst::mov64_m_r(
+ Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r15, 2),
+ w_r11,
+ ),
+ "4F8B9CBF70E9B2D9",
+ "movq -642586256(%r15,%r15,4), %r11",
+ ));
+
+ // End of test cases for Addr
+ // ========================================================
+
+ // ========================================================
+ // General tests for each insn. Don't forget to follow the
+ // guidelines commented just prior to `fn x64_emit`.
+ //
+ // Alu_RMI_R
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::reg(r15), w_rdx),
+ "4C01FA",
+ "addq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_r8),
+ "4101C8",
+ "addl %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_rsi),
+ "01CE",
+ "addl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rdx,
+ ),
+ "48035763",
+ "addq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_r8,
+ ),
+ "44034763",
+ "addl 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rsi,
+ ),
+ "037763",
+ "addl 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_rdx,
+ ),
+ "4883C281",
+ "addq $-127, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_rdx,
+ ),
+ "4881C27FFFFFFF",
+ "addq $-129, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rdx),
+ "4881C2EAF48F04",
+ "addq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_r8,
+ ),
+ "4183C081",
+ "addl $-127, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_r8,
+ ),
+ "4181C07FFFFFFF",
+ "addl $-129, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-76543210i32 as u32),
+ w_r8,
+ ),
+ "4181C0160B70FB",
+ "addl $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-127i32 as u32),
+ w_rsi,
+ ),
+ "83C681",
+ "addl $-127, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Add,
+ RegMemImm::imm(-129i32 as u32),
+ w_rsi,
+ ),
+ "81C67FFFFFFF",
+ "addl $-129, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rsi),
+ "81C6EAF48F04",
+ "addl $76543210, %esi",
+ ));
+ // This is pretty feeble
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Sub, RegMemImm::reg(r15), w_rdx),
+ "4C29FA",
+ "subq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::And, RegMemImm::reg(r15), w_rdx),
+ "4C21FA",
+ "andq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Or, RegMemImm::reg(r15), w_rdx),
+ "4C09FA",
+ "orq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx),
+ "4C31FA",
+ "xorq %r15, %rdx",
+ ));
+ // Test all mul cases, though
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::reg(r15), w_rdx),
+ "490FAFD7",
+ "imulq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_r8),
+ "440FAFC1",
+ "imull %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_rsi),
+ "0FAFF1",
+ "imull %ecx, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rdx,
+ ),
+ "480FAF5763",
+ "imulq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_r8,
+ ),
+ "440FAF4763",
+ "imull 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::mem(Amode::imm_reg(99, rdi)),
+ w_rsi,
+ ),
+ "0FAF7763",
+ "imull 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_rdx,
+ ),
+ "486BD281",
+ "imulq $-127, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ true,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_rdx,
+ ),
+ "4869D27FFFFFFF",
+ "imulq $-129, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rdx),
+ "4869D2EAF48F04",
+ "imulq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_r8,
+ ),
+ "456BC081",
+ "imull $-127, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_r8,
+ ),
+ "4569C07FFFFFFF",
+ "imull $-129, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-76543210i32 as u32),
+ w_r8,
+ ),
+ "4569C0160B70FB",
+ "imull $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-127i32 as u32),
+ w_rsi,
+ ),
+ "6BF681",
+ "imull $-127, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(-129i32 as u32),
+ w_rsi,
+ ),
+ "69F67FFFFFFF",
+ "imull $-129, %esi",
+ ));
+ insns.push((
+ Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rsi),
+ "69F6EAF48F04",
+ "imull $76543210, %esi",
+ ));
+
+ // ========================================================
+ // UnaryRmR
+
+ insns.push((
+ Inst::unary_rm_r(4, UnaryRmROpcode::Bsr, RegMem::reg(rsi), w_rdi),
+ "0FBDFE",
+ "bsrl %esi, %edi",
+ ));
+ insns.push((
+ Inst::unary_rm_r(8, UnaryRmROpcode::Bsr, RegMem::reg(r15), w_rax),
+ "490FBDC7",
+ "bsrq %r15, %rax",
+ ));
+
+ // ========================================================
+ // Not
+ insns.push((
+ Inst::not(4, Writable::from_reg(regs::rsi())),
+ "F7D6",
+ "notl %esi",
+ ));
+ insns.push((
+ Inst::not(8, Writable::from_reg(regs::r15())),
+ "49F7D7",
+ "notq %r15",
+ ));
+ insns.push((
+ Inst::not(4, Writable::from_reg(regs::r14())),
+ "41F7D6",
+ "notl %r14d",
+ ));
+ insns.push((
+ Inst::not(2, Writable::from_reg(regs::rdi())),
+ "66F7D7",
+ "notw %di",
+ ));
+
+ // ========================================================
+ // Neg
+ insns.push((
+ Inst::neg(4, Writable::from_reg(regs::rsi())),
+ "F7DE",
+ "negl %esi",
+ ));
+ insns.push((
+ Inst::neg(8, Writable::from_reg(regs::r15())),
+ "49F7DF",
+ "negq %r15",
+ ));
+ insns.push((
+ Inst::neg(4, Writable::from_reg(regs::r14())),
+ "41F7DE",
+ "negl %r14d",
+ ));
+ insns.push((
+ Inst::neg(2, Writable::from_reg(regs::rdi())),
+ "66F7DF",
+ "negw %di",
+ ));
+
+ // ========================================================
+ // Div
+ insns.push((
+ Inst::div(4, true /*signed*/, RegMem::reg(regs::rsi())),
+ "F7FE",
+ "idiv %esi",
+ ));
+ insns.push((
+ Inst::div(8, true /*signed*/, RegMem::reg(regs::r15())),
+ "49F7FF",
+ "idiv %r15",
+ ));
+ insns.push((
+ Inst::div(4, false /*signed*/, RegMem::reg(regs::r14())),
+ "41F7F6",
+ "div %r14d",
+ ));
+ insns.push((
+ Inst::div(8, false /*signed*/, RegMem::reg(regs::rdi())),
+ "48F7F7",
+ "div %rdi",
+ ));
+
+ // ========================================================
+ // MulHi
+ insns.push((
+ Inst::mul_hi(4, true /*signed*/, RegMem::reg(regs::rsi())),
+ "F7EE",
+ "imul %esi",
+ ));
+ insns.push((
+ Inst::mul_hi(8, true /*signed*/, RegMem::reg(regs::r15())),
+ "49F7EF",
+ "imul %r15",
+ ));
+ insns.push((
+ Inst::mul_hi(4, false /*signed*/, RegMem::reg(regs::r14())),
+ "41F7E6",
+ "mul %r14d",
+ ));
+ insns.push((
+ Inst::mul_hi(8, false /*signed*/, RegMem::reg(regs::rdi())),
+ "48F7E7",
+ "mul %rdi",
+ ));
+
+ // ========================================================
+ // cbw
+ insns.push((Inst::sign_extend_data(1), "6698", "cbw"));
+
+ // ========================================================
+ // cdq family: SignExtendRaxRdx
+ insns.push((Inst::sign_extend_data(2), "6699", "cwd"));
+ insns.push((Inst::sign_extend_data(4), "99", "cdq"));
+ insns.push((Inst::sign_extend_data(8), "4899", "cqo"));
+
+ // ========================================================
+ // Imm_R
+ //
+ insns.push((
+ Inst::imm(OperandSize::Size32, 1234567, w_r14),
+ "41BE87D61200",
+ "movl $1234567, %r14d",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, -126i64 as u64, w_r14),
+ "41BE82FFFFFF",
+ "movl $-126, %r14d",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, 1234567898765, w_r14),
+ "49BE8D26FB711F010000",
+ "movabsq $1234567898765, %r14",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, -126i64 as u64, w_r14),
+ "49C7C682FFFFFF",
+ "movabsq $-126, %r14",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, 1234567, w_rcx),
+ "B987D61200",
+ "movl $1234567, %ecx",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size32, -126i64 as u64, w_rcx),
+ "B982FFFFFF",
+ "movl $-126, %ecx",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, 1234567898765, w_rsi),
+ "48BE8D26FB711F010000",
+ "movabsq $1234567898765, %rsi",
+ ));
+ insns.push((
+ Inst::imm(OperandSize::Size64, -126i64 as u64, w_rbx),
+ "48C7C382FFFFFF",
+ "movabsq $-126, %rbx",
+ ));
+
+ // ========================================================
+ // Mov_R_R
+ insns.push((
+ Inst::mov_r_r(false, rbx, w_rsi),
+ "89DE",
+ "movl %ebx, %esi",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, rbx, w_r9),
+ "4189D9",
+ "movl %ebx, %r9d",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, r11, w_rsi),
+ "4489DE",
+ "movl %r11d, %esi",
+ ));
+ insns.push((
+ Inst::mov_r_r(false, r12, w_r9),
+ "4589E1",
+ "movl %r12d, %r9d",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, rbx, w_rsi),
+ "4889DE",
+ "movq %rbx, %rsi",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, rbx, w_r9),
+ "4989D9",
+ "movq %rbx, %r9",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, r11, w_rsi),
+ "4C89DE",
+ "movq %r11, %rsi",
+ ));
+ insns.push((
+ Inst::mov_r_r(true, r12, w_r9),
+ "4D89E1",
+ "movq %r12, %r9",
+ ));
+
+ // ========================================================
+ // MovZX_RM_R
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi),
+ "400FB6FF",
+ "movzbl %dil, %edi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rax), w_rsi),
+ "0FB6F0",
+ "movzbl %al, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(r15), w_rsi),
+ "410FB6F7",
+ "movzbl %r15b, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FB671F9",
+ "movzbl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FB658F9",
+ "movzbl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FB64AF9",
+ "movzbl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FB653F9",
+ "movzbl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(rax), w_rsi),
+ "480FB6F0",
+ "movzbq %al, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(r10), w_rsi),
+ "490FB6F2",
+ "movzbq %r10b, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FB671F9",
+ "movzbq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FB658F9",
+ "movzbq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FB64AF9",
+ "movzbq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FB653F9",
+ "movzbq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi),
+ "0FB7F1",
+ "movzwl %cx, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(r10), w_rsi),
+ "410FB7F2",
+ "movzwl %r10w, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FB771F9",
+ "movzwl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FB758F9",
+ "movzwl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FB74AF9",
+ "movzwl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FB753F9",
+ "movzwl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi),
+ "480FB7F1",
+ "movzwq %cx, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(r11), w_rsi),
+ "490FB7F3",
+ "movzwq %r11w, %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FB771F9",
+ "movzwq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FB758F9",
+ "movzwq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FB74AF9",
+ "movzwq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FB753F9",
+ "movzwq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi),
+ "8BF1",
+ "movl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "8B71F9",
+ "movl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "418B58F9",
+ "movl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "458B4AF9",
+ "movl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movzx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "418B53F9",
+ "movl -7(%r11), %edx",
+ ));
+
+ // ========================================================
+ // Mov64_M_R
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_rcx),
+ "488B8C18B3000000",
+ "movq 179(%rax,%rbx,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_r8),
+ "4C8B8418B3000000",
+ "movq 179(%rax,%rbx,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_rcx),
+ "4A8B8C08B3000000",
+ "movq 179(%rax,%r9,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_r8),
+ "4E8B8408B3000000",
+ "movq 179(%rax,%r9,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_rcx),
+ "498B8C1AB3000000",
+ "movq 179(%r10,%rbx,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_r8),
+ "4D8B841AB3000000",
+ "movq 179(%r10,%rbx,1), %r8",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_rcx),
+ "4B8B8C0AB3000000",
+ "movq 179(%r10,%r9,1), %rcx",
+ ));
+ insns.push((
+ Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8),
+ "4F8B840AB3000000",
+ "movq 179(%r10,%r9,1), %r8",
+ ));
+
+ // ========================================================
+ // LoadEffectiveAddress
+ insns.push((
+ Inst::lea(Amode::imm_reg(42, r10), w_r8),
+ "4D8D422A",
+ "lea 42(%r10), %r8",
+ ));
+ insns.push((
+ Inst::lea(Amode::imm_reg(42, r10), w_r15),
+ "4D8D7A2A",
+ "lea 42(%r10), %r15",
+ ));
+ insns.push((
+ Inst::lea(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8),
+ "4F8D840AB3000000",
+ "lea 179(%r10,%r9,1), %r8",
+ ));
+ insns.push((
+ Inst::lea(Amode::rip_relative(MachLabel::from_block(0)), w_rdi),
+ "488D3D00000000",
+ "lea label0(%rip), %rdi",
+ ));
+
+ // ========================================================
+ // MovSX_RM_R
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi),
+ "400FBEFF",
+ "movsbl %dil, %edi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rcx), w_rsi),
+ "0FBEF1",
+ "movsbl %cl, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(r14), w_rsi),
+ "410FBEF6",
+ "movsbl %r14b, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FBE71F9",
+ "movsbl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FBE58F9",
+ "movsbl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FBE4AF9",
+ "movsbl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FBE53F9",
+ "movsbl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(rcx), w_rsi),
+ "480FBEF1",
+ "movsbq %cl, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(r15), w_rsi),
+ "490FBEF7",
+ "movsbq %r15b, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FBE71F9",
+ "movsbq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FBE58F9",
+ "movsbq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FBE4AF9",
+ "movsbq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::BQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FBE53F9",
+ "movsbq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi),
+ "0FBFF1",
+ "movswl %cx, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(r14), w_rsi),
+ "410FBFF6",
+ "movswl %r14w, %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "0FBF71F9",
+ "movswl -7(%rcx), %esi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "410FBF58F9",
+ "movswl -7(%r8), %ebx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "450FBF4AF9",
+ "movswl -7(%r10), %r9d",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WL,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "410FBF53F9",
+ "movswl -7(%r11), %edx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi),
+ "480FBFF1",
+ "movswq %cx, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(r13), w_rsi),
+ "490FBFF5",
+ "movswq %r13w, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "480FBF71F9",
+ "movswq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "490FBF58F9",
+ "movswq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D0FBF4AF9",
+ "movswq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::WQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "490FBF53F9",
+ "movswq -7(%r11), %rdx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi),
+ "4863F1",
+ "movslq %ecx, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(r15), w_rsi),
+ "4963F7",
+ "movslq %r15d, %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)),
+ w_rsi,
+ ),
+ "486371F9",
+ "movslq -7(%rcx), %rsi",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)),
+ w_rbx,
+ ),
+ "496358F9",
+ "movslq -7(%r8), %rbx",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)),
+ w_r9,
+ ),
+ "4D634AF9",
+ "movslq -7(%r10), %r9",
+ ));
+ insns.push((
+ Inst::movsx_rm_r(
+ ExtMode::LQ,
+ RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)),
+ w_rdx,
+ ),
+ "496353F9",
+ "movslq -7(%r11), %rdx",
+ ));
+
+ // ========================================================
+ // Mov_R_M. Byte stores are tricky. Check everything carefully.
+ insns.push((
+ Inst::mov_r_m(8, rax, Amode::imm_reg(99, rdi)),
+ "48894763",
+ "movq %rax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rbx, Amode::imm_reg(99, r8)),
+ "49895863",
+ "movq %rbx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rcx, Amode::imm_reg(99, rsi)),
+ "48894E63",
+ "movq %rcx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rdx, Amode::imm_reg(99, r9)),
+ "49895163",
+ "movq %rdx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rsi, Amode::imm_reg(99, rax)),
+ "48897063",
+ "movq %rsi, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rdi, Amode::imm_reg(99, r15)),
+ "49897F63",
+ "movq %rdi, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rsp, Amode::imm_reg(99, rcx)),
+ "48896163",
+ "movq %rsp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, rbp, Amode::imm_reg(99, r14)),
+ "49896E63",
+ "movq %rbp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r8, Amode::imm_reg(99, rdi)),
+ "4C894763",
+ "movq %r8, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r9, Amode::imm_reg(99, r8)),
+ "4D894863",
+ "movq %r9, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r10, Amode::imm_reg(99, rsi)),
+ "4C895663",
+ "movq %r10, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r11, Amode::imm_reg(99, r9)),
+ "4D895963",
+ "movq %r11, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r12, Amode::imm_reg(99, rax)),
+ "4C896063",
+ "movq %r12, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r13, Amode::imm_reg(99, r15)),
+ "4D896F63",
+ "movq %r13, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r14, Amode::imm_reg(99, rcx)),
+ "4C897163",
+ "movq %r14, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(8, r15, Amode::imm_reg(99, r14)),
+ "4D897E63",
+ "movq %r15, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(4, rax, Amode::imm_reg(99, rdi)),
+ "894763",
+ "movl %eax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rbx, Amode::imm_reg(99, r8)),
+ "41895863",
+ "movl %ebx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rcx, Amode::imm_reg(99, rsi)),
+ "894E63",
+ "movl %ecx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rdx, Amode::imm_reg(99, r9)),
+ "41895163",
+ "movl %edx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rsi, Amode::imm_reg(99, rax)),
+ "897063",
+ "movl %esi, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rdi, Amode::imm_reg(99, r15)),
+ "41897F63",
+ "movl %edi, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rsp, Amode::imm_reg(99, rcx)),
+ "896163",
+ "movl %esp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, rbp, Amode::imm_reg(99, r14)),
+ "41896E63",
+ "movl %ebp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r8, Amode::imm_reg(99, rdi)),
+ "44894763",
+ "movl %r8d, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r9, Amode::imm_reg(99, r8)),
+ "45894863",
+ "movl %r9d, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r10, Amode::imm_reg(99, rsi)),
+ "44895663",
+ "movl %r10d, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r11, Amode::imm_reg(99, r9)),
+ "45895963",
+ "movl %r11d, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r12, Amode::imm_reg(99, rax)),
+ "44896063",
+ "movl %r12d, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r13, Amode::imm_reg(99, r15)),
+ "45896F63",
+ "movl %r13d, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r14, Amode::imm_reg(99, rcx)),
+ "44897163",
+ "movl %r14d, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(4, r15, Amode::imm_reg(99, r14)),
+ "45897E63",
+ "movl %r15d, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(2, rax, Amode::imm_reg(99, rdi)),
+ "66894763",
+ "movw %ax, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rbx, Amode::imm_reg(99, r8)),
+ "6641895863",
+ "movw %bx, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rcx, Amode::imm_reg(99, rsi)),
+ "66894E63",
+ "movw %cx, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rdx, Amode::imm_reg(99, r9)),
+ "6641895163",
+ "movw %dx, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rsi, Amode::imm_reg(99, rax)),
+ "66897063",
+ "movw %si, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rdi, Amode::imm_reg(99, r15)),
+ "6641897F63",
+ "movw %di, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rsp, Amode::imm_reg(99, rcx)),
+ "66896163",
+ "movw %sp, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, rbp, Amode::imm_reg(99, r14)),
+ "6641896E63",
+ "movw %bp, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r8, Amode::imm_reg(99, rdi)),
+ "6644894763",
+ "movw %r8w, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r9, Amode::imm_reg(99, r8)),
+ "6645894863",
+ "movw %r9w, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r10, Amode::imm_reg(99, rsi)),
+ "6644895663",
+ "movw %r10w, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r11, Amode::imm_reg(99, r9)),
+ "6645895963",
+ "movw %r11w, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r12, Amode::imm_reg(99, rax)),
+ "6644896063",
+ "movw %r12w, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r13, Amode::imm_reg(99, r15)),
+ "6645896F63",
+ "movw %r13w, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r14, Amode::imm_reg(99, rcx)),
+ "6644897163",
+ "movw %r14w, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(2, r15, Amode::imm_reg(99, r14)),
+ "6645897E63",
+ "movw %r15w, 99(%r14)",
+ ));
+ //
+ insns.push((
+ Inst::mov_r_m(1, rax, Amode::imm_reg(99, rdi)),
+ "884763",
+ "movb %al, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rbx, Amode::imm_reg(99, r8)),
+ "41885863",
+ "movb %bl, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rcx, Amode::imm_reg(99, rsi)),
+ "884E63",
+ "movb %cl, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rdx, Amode::imm_reg(99, r9)),
+ "41885163",
+ "movb %dl, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rsi, Amode::imm_reg(99, rax)),
+ "40887063",
+ "movb %sil, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rdi, Amode::imm_reg(99, r15)),
+ "41887F63",
+ "movb %dil, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rsp, Amode::imm_reg(99, rcx)),
+ "40886163",
+ "movb %spl, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, rbp, Amode::imm_reg(99, r14)),
+ "41886E63",
+ "movb %bpl, 99(%r14)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r8, Amode::imm_reg(99, rdi)),
+ "44884763",
+ "movb %r8b, 99(%rdi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r9, Amode::imm_reg(99, r8)),
+ "45884863",
+ "movb %r9b, 99(%r8)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r10, Amode::imm_reg(99, rsi)),
+ "44885663",
+ "movb %r10b, 99(%rsi)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r11, Amode::imm_reg(99, r9)),
+ "45885963",
+ "movb %r11b, 99(%r9)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r12, Amode::imm_reg(99, rax)),
+ "44886063",
+ "movb %r12b, 99(%rax)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r13, Amode::imm_reg(99, r15)),
+ "45886F63",
+ "movb %r13b, 99(%r15)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r14, Amode::imm_reg(99, rcx)),
+ "44887163",
+ "movb %r14b, 99(%rcx)",
+ ));
+ insns.push((
+ Inst::mov_r_m(1, r15, Amode::imm_reg(99, r14)),
+ "45887E63",
+ "movb %r15b, 99(%r14)",
+ ));
+
+ // ========================================================
+ // Shift_R
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_rdi),
+ "D3E7",
+ "shll %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_r12),
+ "41D3E4",
+ "shll %cl, %r12d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, Some(2), w_r8),
+ "41C1E002",
+ "shll $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftLeft, Some(31), w_r13),
+ "41C1E51F",
+ "shll $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_r13),
+ "49D3E5",
+ "shlq %cl, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_rdi),
+ "48D3E7",
+ "shlq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), w_r8),
+ "49C1E002",
+ "shlq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(3), w_rbx),
+ "48C1E303",
+ "shlq $3, %rbx",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftLeft, Some(63), w_r13),
+ "49C1E53F",
+ "shlq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, None, w_rdi),
+ "D3EF",
+ "shrl %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(2), w_r8),
+ "41C1E802",
+ "shrl $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(31), w_r13),
+ "41C1ED1F",
+ "shrl $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, w_rdi),
+ "48D3EF",
+ "shrq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(2), w_r8),
+ "49C1E802",
+ "shrq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(63), w_r13),
+ "49C1ED3F",
+ "shrq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, None, w_rdi),
+ "D3FF",
+ "sarl %cl, %edi",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(2), w_r8),
+ "41C1F802",
+ "sarl $2, %r8d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(31), w_r13),
+ "41C1FD1F",
+ "sarl $31, %r13d",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, None, w_rdi),
+ "48D3FF",
+ "sarq %cl, %rdi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(2), w_r8),
+ "49C1F802",
+ "sarq $2, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(63), w_r13),
+ "49C1FD3F",
+ "sarq $63, %r13",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::RotateLeft, None, w_r8),
+ "49D3C0",
+ "rolq %cl, %r8",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::RotateLeft, Some(3), w_r9),
+ "41C1C103",
+ "roll $3, %r9d",
+ ));
+ insns.push((
+ Inst::shift_r(4, ShiftKind::RotateRight, None, w_rsi),
+ "D3CE",
+ "rorl %cl, %esi",
+ ));
+ insns.push((
+ Inst::shift_r(8, ShiftKind::RotateRight, Some(5), w_r15),
+ "49C1CF05",
+ "rorq $5, %r15",
+ ));
+ insns.push((
+ Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi),
+ "D2CE",
+ "rorb %cl, %sil",
+ ));
+ insns.push((
+ Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15),
+ "41C0CF05",
+ "rorb $5, %r15b",
+ ));
+ insns.push((
+ Inst::shift_r(2, ShiftKind::RotateRight, None, w_rsi),
+ "66D3CE",
+ "rorw %cl, %si",
+ ));
+ insns.push((
+ Inst::shift_r(2, ShiftKind::RotateRight, Some(5), w_r15),
+ "6641C1CF05",
+ "rorw $5, %r15w",
+ ));
+
+ // ========================================================
+ // CmpRMIR
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(r15), rdx),
+ "4C39FA",
+ "cmpq %r15, %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), r8),
+ "4939C8",
+ "cmpq %rcx, %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), rsi),
+ "4839CE",
+ "cmpq %rcx, %rsi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "483B5763",
+ "cmpq 99(%rdi), %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "4C3B4763",
+ "cmpq 99(%rdi), %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "483B7763",
+ "cmpq 99(%rdi), %rsi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rdx),
+ "4881FAEAF48F04",
+ "cmpq $76543210, %rdx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(-76543210i32 as u32), r8),
+ "4981F8160B70FB",
+ "cmpq $-76543210, %r8",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rsi),
+ "4881FEEAF48F04",
+ "cmpq $76543210, %rsi",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(r15), rdx),
+ "4439FA",
+ "cmpl %r15d, %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), r8),
+ "4139C8",
+ "cmpl %ecx, %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), rsi),
+ "39CE",
+ "cmpl %ecx, %esi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "3B5763",
+ "cmpl 99(%rdi), %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "443B4763",
+ "cmpl 99(%rdi), %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "3B7763",
+ "cmpl 99(%rdi), %esi",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rdx),
+ "81FAEAF48F04",
+ "cmpl $76543210, %edx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(-76543210i32 as u32), r8),
+ "4181F8160B70FB",
+ "cmpl $-76543210, %r8d",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rsi),
+ "81FEEAF48F04",
+ "cmpl $76543210, %esi",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(r15), rdx),
+ "664439FA",
+ "cmpw %r15w, %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), r8),
+ "664139C8",
+ "cmpw %cx, %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), rsi),
+ "6639CE",
+ "cmpw %cx, %si",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "663B5763",
+ "cmpw 99(%rdi), %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "66443B4763",
+ "cmpw 99(%rdi), %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "663B7763",
+ "cmpw 99(%rdi), %si",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(23210), rdx),
+ "6681FAAA5A",
+ "cmpw $23210, %dx",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(-7654i32 as u32), r8),
+ "664181F81AE2",
+ "cmpw $-7654, %r8w",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(2, RegMemImm::imm(7654), rsi),
+ "6681FEE61D",
+ "cmpw $7654, %si",
+ ));
+ //
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r15), rdx),
+ "4438FA",
+ "cmpb %r15b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r8),
+ "4138C8",
+ "cmpb %cl, %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi),
+ "4038CE",
+ "cmpb %cl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx),
+ "3A5763",
+ "cmpb 99(%rdi), %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8),
+ "443A4763",
+ "cmpb 99(%rdi), %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi),
+ "403A7763",
+ "cmpb 99(%rdi), %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(70), rdx),
+ "80FA46",
+ "cmpb $70, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(-76i32 as u32), r8),
+ "4180F8B4",
+ "cmpb $-76, %r8b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::imm(76), rsi),
+ "4080FE4C",
+ "cmpb $76, %sil",
+ ));
+ // Extra byte-cases (paranoia!) for cmp_rmi_r for first operand = R
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rax), rbx),
+ "38C3",
+ "cmpb %al, %bl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbx), rax),
+ "38D8",
+ "cmpb %bl, %al",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rdx),
+ "38CA",
+ "cmpb %cl, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi),
+ "4038CE",
+ "cmpb %cl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r10),
+ "4138CA",
+ "cmpb %cl, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r14),
+ "4138CE",
+ "cmpb %cl, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rdx),
+ "4038EA",
+ "cmpb %bpl, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rsi),
+ "4038EE",
+ "cmpb %bpl, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r10),
+ "4138EA",
+ "cmpb %bpl, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r14),
+ "4138EE",
+ "cmpb %bpl, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rdx),
+ "4438CA",
+ "cmpb %r9b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rsi),
+ "4438CE",
+ "cmpb %r9b, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r10),
+ "4538CA",
+ "cmpb %r9b, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r14),
+ "4538CE",
+ "cmpb %r9b, %r14b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rdx),
+ "4438EA",
+ "cmpb %r13b, %dl",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rsi),
+ "4438EE",
+ "cmpb %r13b, %sil",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r10),
+ "4538EA",
+ "cmpb %r13b, %r10b",
+ ));
+ insns.push((
+ Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r14),
+ "4538EE",
+ "cmpb %r13b, %r14b",
+ ));
+
+ // ========================================================
+ // SetCC
+ insns.push((Inst::setcc(CC::O, w_rsi), "400F90C6", "seto %sil"));
+ insns.push((Inst::setcc(CC::NLE, w_rsi), "400F9FC6", "setnle %sil"));
+ insns.push((Inst::setcc(CC::Z, w_r14), "410F94C6", "setz %r14b"));
+ insns.push((Inst::setcc(CC::LE, w_r14), "410F9EC6", "setle %r14b"));
+ insns.push((Inst::setcc(CC::P, w_r9), "410F9AC1", "setp %r9b"));
+ insns.push((Inst::setcc(CC::NP, w_r8), "410F9BC0", "setnp %r8b"));
+ // ========================================================
+ // Cmove
+ insns.push((
+ Inst::cmove(2, CC::O, RegMem::reg(rdi), w_rsi),
+ "660F40F7",
+ "cmovow %di, %si",
+ ));
+ insns.push((
+ Inst::cmove(
+ 2,
+ CC::NO,
+ RegMem::mem(Amode::imm_reg_reg_shift(37, rdi, rsi, 2)),
+ w_r15,
+ ),
+ "66440F417CB725",
+ "cmovnow 37(%rdi,%rsi,4), %r15w",
+ ));
+ insns.push((
+ Inst::cmove(4, CC::LE, RegMem::reg(rdi), w_rsi),
+ "0F4EF7",
+ "cmovlel %edi, %esi",
+ ));
+ insns.push((
+ Inst::cmove(4, CC::NLE, RegMem::mem(Amode::imm_reg(0, r15)), w_rsi),
+ "410F4F37",
+ "cmovnlel 0(%r15), %esi",
+ ));
+ insns.push((
+ Inst::cmove(8, CC::Z, RegMem::reg(rdi), w_r14),
+ "4C0F44F7",
+ "cmovzq %rdi, %r14",
+ ));
+ insns.push((
+ Inst::cmove(8, CC::NZ, RegMem::mem(Amode::imm_reg(13, rdi)), w_r14),
+ "4C0F45770D",
+ "cmovnzq 13(%rdi), %r14",
+ ));
+
+ // ========================================================
+ // Push64
+ insns.push((Inst::push64(RegMemImm::reg(rdi)), "57", "pushq %rdi"));
+ insns.push((Inst::push64(RegMemImm::reg(r8)), "4150", "pushq %r8"));
+ insns.push((
+ Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FFB4CE41010000",
+ "pushq 321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, r9, rbx, 2))),
+ "41FFB49941010000",
+ "pushq 321(%r9,%rbx,4)",
+ ));
+ insns.push((Inst::push64(RegMemImm::imm(0)), "6A00", "pushq $0"));
+ insns.push((Inst::push64(RegMemImm::imm(127)), "6A7F", "pushq $127"));
+ insns.push((
+ Inst::push64(RegMemImm::imm(128)),
+ "6880000000",
+ "pushq $128",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(0x31415927)),
+ "6827594131",
+ "pushq $826366247",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-128i32 as u32)),
+ "6A80",
+ "pushq $-128",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-129i32 as u32)),
+ "687FFFFFFF",
+ "pushq $-129",
+ ));
+ insns.push((
+ Inst::push64(RegMemImm::imm(-0x75c4e8a1i32 as u32)),
+ "685F173B8A",
+ "pushq $-1975838881",
+ ));
+
+ // ========================================================
+ // Pop64
+ insns.push((Inst::pop64(w_rax), "58", "popq %rax"));
+ insns.push((Inst::pop64(w_rdi), "5F", "popq %rdi"));
+ insns.push((Inst::pop64(w_r8), "4158", "popq %r8"));
+ insns.push((Inst::pop64(w_r15), "415F", "popq %r15"));
+
+ // ========================================================
+ // CallKnown
+ insns.push((
+ Inst::call_known(
+ ExternalName::User {
+ namespace: 0,
+ index: 0,
+ },
+ Vec::new(),
+ Vec::new(),
+ Opcode::Call,
+ ),
+ "E800000000",
+ "call User { namespace: 0, index: 0 }",
+ ));
+
+ // ========================================================
+ // CallUnknown
+ fn call_unknown(rm: RegMem) -> Inst {
+ Inst::call_unknown(rm, Vec::new(), Vec::new(), Opcode::CallIndirect)
+ }
+
+ insns.push((call_unknown(RegMem::reg(rbp)), "FFD5", "call *%rbp"));
+ insns.push((call_unknown(RegMem::reg(r11)), "41FFD3", "call *%r11"));
+ insns.push((
+ call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FF94CE41010000",
+ "call *321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))),
+ "41FF949241010000",
+ "call *321(%r10,%rdx,4)",
+ ));
+
+ // ========================================================
+ // Ret
+ insns.push((Inst::ret(), "C3", "ret"));
+
+ // ========================================================
+ // JmpKnown skipped for now
+
+ // ========================================================
+ // JmpCondSymm isn't a real instruction
+
+ // ========================================================
+ // JmpCond skipped for now
+
+ // ========================================================
+ // JmpCondCompound isn't a real instruction
+
+ // ========================================================
+ // JmpUnknown
+ insns.push((Inst::jmp_unknown(RegMem::reg(rbp)), "FFE5", "jmp *%rbp"));
+ insns.push((
+ Inst::jmp_unknown(RegMem::reg(r11)),
+ "41FFE3",
+ "jmp *%r11",
+ ));
+ insns.push((
+ Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))),
+ "FFA4CE41010000",
+ "jmp *321(%rsi,%rcx,8)",
+ ));
+ insns.push((
+ Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))),
+ "41FFA49241010000",
+ "jmp *321(%r10,%rdx,4)",
+ ));
+
+ // ========================================================
+ // XMM_CMP_RM_R
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm1), xmm2),
+ "0F2ED1",
+ "ucomiss %xmm1, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm0), xmm9),
+ "440F2EC8",
+ "ucomiss %xmm0, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm13), xmm4),
+ "66410F2EE5",
+ "ucomisd %xmm13, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm11), xmm12),
+ "66450F2EE3",
+ "ucomisd %xmm11, %xmm12",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: float binary ops
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0),
+ "F30F58C1",
+ "addss %xmm1, %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13),
+ "F3450F58EB",
+ "addss %xmm11, %xmm13",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(
+ SseOpcode::Addss,
+ RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
+ w_xmm0,
+ ),
+ "F3410F5844927B",
+ "addss 123(%r10,%rdx,4), %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4),
+ "F2410F58E7",
+ "addsd %xmm15, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1),
+ "F30F5CC8",
+ "subss %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1),
+ "F3410F5CCC",
+ "subss %xmm12, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(
+ SseOpcode::Subss,
+ RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
+ w_xmm10,
+ ),
+ "F3450F5C94C241010000",
+ "subss 321(%r10,%rax,8), %xmm10",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14),
+ "F2440F5CF5",
+ "subsd %xmm5, %xmm14",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4),
+ "F30F59E5",
+ "mulss %xmm5, %xmm4",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4),
+ "F20F59E5",
+ "mulsd %xmm5, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7),
+ "F3410F5EF8",
+ "divss %xmm8, %xmm7",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4),
+ "F20F5EE5",
+ "divsd %xmm5, %xmm4",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12),
+ "440F54E3",
+ "andps %xmm3, %xmm12",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11),
+ "440F55DC",
+ "andnps %xmm4, %xmm11",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15),
+ "440F56F9",
+ "orps %xmm1, %xmm15",
+ ));
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4),
+ "0F56E5",
+ "orps %xmm5, %xmm4",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: Integer Packed
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5),
+ "66410FFCE9",
+ "paddb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6),
+ "660FFDF7",
+ "paddw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13),
+ "66450FFEEC",
+ "paddd %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8),
+ "66440FD4C1",
+ "paddq %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5),
+ "66410FECE9",
+ "paddsb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6),
+ "660FEDF7",
+ "paddsw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13),
+ "66450FDCEC",
+ "paddusb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8),
+ "66440FDDC1",
+ "paddusw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5),
+ "66410FE8E9",
+ "psubsb %xmm9, %xmm5",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6),
+ "660FE9F7",
+ "psubsw %xmm7, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13),
+ "66450FD8EC",
+ "psubusb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8),
+ "66440FD9C1",
+ "psubusw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13),
+ "66450FE0EC",
+ "pavgb %xmm12, %xmm13",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8),
+ "66440FE3C1",
+ "pavgw %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9),
+ "66440FF8CD",
+ "psubb %xmm5, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7),
+ "660FF9FE",
+ "psubw %xmm6, %xmm7",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12),
+ "66450FFAE5",
+ "psubd %xmm13, %xmm12",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1),
+ "66410FFBC8",
+ "psubq %xmm8, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6),
+ "66410F3840F7",
+ "pmulld %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1),
+ "66410FD5CE",
+ "pmullw %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
+ "66450FF4C8",
+ "pmuludq %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6),
+ "66410F383CF7",
+ "pmaxsb %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6),
+ "66410FEEF7",
+ "pmaxsw %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6),
+ "66410F383DF7",
+ "pmaxsd %xmm15, %xmm6",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1),
+ "66410FDECE",
+ "pmaxub %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1),
+ "66410F383ECE",
+ "pmaxuw %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1),
+ "66410F383FCE",
+ "pmaxud %xmm14, %xmm1",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9),
+ "66450F3838C8",
+ "pminsb %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9),
+ "66450FEAC8",
+ "pminsw %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9),
+ "66450F3839C8",
+ "pminsd %xmm8, %xmm9",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2),
+ "660FDAD3",
+ "pminub %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2),
+ "660F383AD3",
+ "pminuw %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2),
+ "660F383BD3",
+ "pminud %xmm3, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2),
+ "66410FEFD3",
+ "pxor %xmm11, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
+ "66410F3800D3",
+ "pshufb %xmm11, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2),
+ "66410F63D3",
+ "packsswb %xmm11, %xmm2",
+ ));
+
+ // ========================================================
+ // XMM_RM_R: Integer Conversion
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::reg(xmm1), w_xmm8),
+ "440F5BC1",
+ "cvtdq2ps %xmm1, %xmm8",
+ ));
+
+ insns.push((
+ Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::reg(xmm9), w_xmm8),
+ "F3450F5BC1",
+ "cvttps2dq %xmm9, %xmm8",
+ ));
+
+ // XMM_Mov_R_M: float stores
+ insns.push((
+ Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)),
+ "F3450F11BC2480000000",
+ "movss %xmm15, 128(%r12)",
+ ));
+ insns.push((
+ Inst::xmm_mov_r_m(SseOpcode::Movsd, xmm1, Amode::imm_reg(0, rsi)),
+ "F20F110E",
+ "movsd %xmm1, 0(%rsi)",
+ ));
+
+ // XmmUnary: moves and unary float ops
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movss, RegMem::reg(xmm13), w_xmm2),
+ "F3410F10D5",
+ "movss %xmm13, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm0), w_xmm1),
+ "F20F10C8",
+ "movsd %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(
+ SseOpcode::Movsd,
+ RegMem::mem(Amode::imm_reg(0, rsi)),
+ w_xmm2,
+ ),
+ "F20F1016",
+ "movsd 0(%rsi), %xmm2",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm14), w_xmm3),
+ "F2410F10DE",
+ "movsd %xmm14, %xmm3",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Movaps, RegMem::reg(xmm5), w_xmm14),
+ "440F28F5",
+ "movaps %xmm5, %xmm14",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Sqrtss, RegMem::reg(xmm7), w_xmm8),
+ "F3440F51C7",
+ "sqrtss %xmm7, %xmm8",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Sqrtsd, RegMem::reg(xmm1), w_xmm2),
+ "F20F51D1",
+ "sqrtsd %xmm1, %xmm2",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, RegMem::reg(xmm0), w_xmm1),
+ "F30F5AC8",
+ "cvtss2sd %xmm0, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, RegMem::reg(xmm1), w_xmm0),
+ "F20F5AC1",
+ "cvtsd2ss %xmm1, %xmm0",
+ ));
+
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsb, RegMem::reg(xmm2), w_xmm1),
+ "660F381CCA",
+ "pabsb %xmm2, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsw, RegMem::reg(xmm0), w_xmm0),
+ "660F381DC0",
+ "pabsw %xmm0, %xmm0",
+ ));
+ insns.push((
+ Inst::xmm_unary_rm_r(SseOpcode::Pabsd, RegMem::reg(xmm10), w_xmm11),
+ "66450F381EDA",
+ "pabsd %xmm10, %xmm11",
+ ));
+
+ // Xmm to int conversions, and conversely.
+
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movd, xmm0, w_rsi, OperandSize::Size32),
+ "660F7EC6",
+ "movd %xmm0, %esi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movq, xmm2, w_rdi, OperandSize::Size64),
+ "66480F7ED7",
+ "movq %xmm2, %rdi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rsi, OperandSize::Size32),
+ "F30F2CF0",
+ "cvttss2si %xmm0, %esi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rdi, OperandSize::Size64),
+ "F3480F2CF8",
+ "cvttss2si %xmm0, %rdi",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_rax, OperandSize::Size32),
+ "F20F2CC0",
+ "cvttsd2si %xmm0, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_r15, OperandSize::Size64),
+ "F24C0F2CF8",
+ "cvttsd2si %xmm0, %r15",
+ ));
+
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32),
+ "66410FD7C2",
+ "pmovmskb %xmm10, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32),
+ "0F50C2",
+ "movmskps %xmm2, %eax",
+ ));
+ insns.push((
+ Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32),
+ "660F50C8",
+ "movmskpd %xmm0, %ecx",
+ ));
+
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(rax),
+ OperandSize::Size32,
+ w_xmm15,
+ ),
+ "66440F6EF8",
+ "movd %eax, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::mem(Amode::imm_reg(2, r10)),
+ OperandSize::Size32,
+ w_xmm9,
+ ),
+ "66450F6E4A02",
+ "movd 2(%r10), %xmm9",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(rsi),
+ OperandSize::Size32,
+ w_xmm1,
+ ),
+ "660F6ECE",
+ "movd %esi, %xmm1",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ RegMem::reg(rdi),
+ OperandSize::Size64,
+ w_xmm15,
+ ),
+ "664C0F6EFF",
+ "movq %rdi, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Cvtsi2ss,
+ RegMem::reg(rdi),
+ OperandSize::Size32,
+ w_xmm15,
+ ),
+ "F3440F2AFF",
+ "cvtsi2ss %edi, %xmm15",
+ ));
+ insns.push((
+ Inst::gpr_to_xmm(
+ SseOpcode::Cvtsi2sd,
+ RegMem::reg(rsi),
+ OperandSize::Size64,
+ w_xmm1,
+ ),
+ "F2480F2ACE",
+ "cvtsi2sd %rsi, %xmm1",
+ ));
+
+ // ========================================================
+ // XmmRmi
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Psraw, RegMemImm::reg(xmm10), w_xmm1),
+ "66410FE1CA",
+ "psraw %xmm10, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(31), w_xmm1),
+ "660F72F11F",
+ "pslld $31, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rmi_reg(SseOpcode::Psrlq, RegMemImm::imm(1), w_xmm3),
+ "660F73D301",
+ "psrlq $1, %xmm3",
+ ));
+
+ // ========================================================
+ // XmmRmRImm
+ insns.push((
+ Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false),
+ "660FC2CD02",
+ "cmppd $2, %xmm5, %xmm1",
+ ));
+ insns.push((
+ Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false),
+ "410FC2FF00",
+ "cmpps $0, %xmm15, %xmm7",
+ ));
+
+ // ========================================================
+ // Pertaining to atomics.
+ let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into();
+ // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing
+ // for retention of the apparently-redundant rex prefix in the 8-bit case.
+ let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into();
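+ // For instance, `%sil` can only be named when a REX prefix is present: the otherwise-empty
+ // `40` prefix byte in the expected encoding below exists purely to select %sil (rather than
+ // %dh), and the test checks that it is retained.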
+
+ // A general 8-bit case.
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rbx,
+ dst: am1,
+ },
+ "F0410FB09C9241010000",
+ "lock cmpxchgb %bl, 321(%r10,%rdx,4)",
+ ));
+ // Check redundant rex retention in 8-bit cases.
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rdx,
+ dst: am2.clone(),
+ },
+ "F00FB094F1C7CFFFFF",
+ "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F0400FB0B4F1C7CFFFFF",
+ "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F0440FB094F1C7CFFFFF",
+ "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I8,
+ src: r15,
+ dst: am2.clone(),
+ },
+ "F0440FB0BCF1C7CFFFFF",
+ "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)",
+ ));
+ // 16 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I16,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "66F00FB1B4F1C7CFFFFF",
+ "lock cmpxchgw %si, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I16,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "66F0440FB194F1C7CFFFFF",
+ "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)",
+ ));
+ // 32 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I32,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F00FB1B4F1C7CFFFFF",
+ "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I32,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F0440FB194F1C7CFFFFF",
+ "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)",
+ ));
+ // 64 bit cases
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I64,
+ src: rsi,
+ dst: am2.clone(),
+ },
+ "F0480FB1B4F1C7CFFFFF",
+ "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)",
+ ));
+ insns.push((
+ Inst::LockCmpxchg {
+ ty: types::I64,
+ src: r10,
+ dst: am2.clone(),
+ },
+ "F04C0FB194F1C7CFFFFF",
+ "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)",
+ ));
+
+ // AtomicRmwSeq
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I8, op: inst_common::AtomicRmwOp::Or, },
+ "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF",
+ "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, },
+ "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF",
+ "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, },
+ "418B014989C34D89D3F0450FB1190F85EFFFFFFF",
+ "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+ insns.push((
+ Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, },
+ "498B014989C34D01D3F04D0FB1190F85EFFFFFFF",
+ "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
+ ));
+
+ // Fence
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::MFence,
+ },
+ "0FAEF0",
+ "mfence",
+ ));
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::LFence,
+ },
+ "0FAEE8",
+ "lfence",
+ ));
+ insns.push((
+ Inst::Fence {
+ kind: FenceKind::SFence,
+ },
+ "0FAEF8",
+ "sfence",
+ ));
+
+ // ========================================================
+ // Misc instructions.
+
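+ // Note: this backend uses Inst::Hlt as a debug trap, so it is expected to encode as 0xCC
+ // (int3) rather than as a literal `hlt` (0xF4).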
+ insns.push((Inst::Hlt, "CC", "hlt"));
+
+ let trap_code = TrapCode::UnreachableCodeReached;
+ insns.push((Inst::Ud2 { trap_code }, "0F0B", "ud2 unreachable"));
+
+ // ========================================================
+ // Actually run the tests!
+ let flags = settings::Flags::new(settings::builder());
+
+ use crate::settings::Configurable;
+ let mut isa_flag_builder = x64::settings::builder();
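+ // Some opcodes exercised above are not baseline SSE2: pshufb and pabs{b,w,d} are SSSE3,
+ // and pmulld plus several pmin*/pmax* variants are SSE4.1, so enable those feature sets.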
+ isa_flag_builder.enable("has_ssse3").unwrap();
+ isa_flag_builder.enable("has_sse41").unwrap();
+ let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
+
+ let rru = regs::create_reg_universe_systemv(&flags);
+ let emit_info = EmitInfo::new(flags, isa_flags);
+ for (insn, expected_encoding, expected_printing) in insns {
+ // Check the printed text is as expected.
+ let actual_printing = insn.show_rru(Some(&rru));
+ assert_eq!(expected_printing, actual_printing);
+ let mut sink = test_utils::TestCodeSink::new();
+ let mut buffer = MachBuffer::new();
+
+ insn.emit(&mut buffer, &emit_info, &mut Default::default());
+
+ // Allow one label just after the instruction (so the offset is 0).
+ let label = buffer.get_label();
+ buffer.bind_label(label);
+
+ let buffer = buffer.finish();
+ buffer.emit(&mut sink);
+ let actual_encoding = &sink.stringify();
+ assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing);
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs
new file mode 100644
index 0000000000..1172b22eff
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs
@@ -0,0 +1,2733 @@
+//! This module defines x86_64-specific machine instruction types.
+
+use crate::binemit::{CodeOffset, StackMap};
+use crate::ir::{types, ExternalName, Opcode, SourceLoc, TrapCode, Type};
+use crate::isa::x64::settings as x64_settings;
+use crate::machinst::*;
+use crate::{settings, settings::Flags, CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use regalloc::{
+ PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
+ RegUsageMapper, SpillSlot, VirtualReg, Writable,
+};
+use smallvec::SmallVec;
+use std::fmt;
+use std::string::{String, ToString};
+
+pub mod args;
+mod emit;
+#[cfg(test)]
+mod emit_tests;
+pub mod regs;
+pub mod unwind;
+
+use args::*;
+use regs::{create_reg_universe_systemv, show_ireg_sized};
+
+//=============================================================================
+// Instructions (top level): definition
+
+// Don't build these directly. Instead use the Inst:: functions to create them.
+
+/// Instructions. Destinations are on the RIGHT (a la AT&T syntax).
+#[derive(Clone)]
+pub enum Inst {
+ /// Nops of various sizes, including zero.
+ Nop { len: u8 },
+
+ // =====================================
+ // Integer instructions.
+ /// Integer arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
+ AluRmiR {
+ is_64: bool,
+ op: AluRmiROpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ },
+
+ /// Instructions on GPRs that only read `src` and define `dst` (`dst` is not modified): bsr, etc.
+ UnaryRmR {
+ size: u8, // 2, 4 or 8
+ op: UnaryRmROpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Bitwise not
+ Not {
+ size: u8, // 1, 2, 4 or 8
+ src: Writable<Reg>,
+ },
+
+ /// Integer negation
+ Neg {
+ size: u8, // 1, 2, 4 or 8
+ src: Writable<Reg>,
+ },
+
+ /// Integer quotient and remainder: (div idiv) $rax $rdx (reg addr)
+ Div {
+ size: u8, // 1, 2, 4 or 8
+ signed: bool,
+ divisor: RegMem,
+ },
+
+ /// The high bits (RDX) of a signed or unsigned multiply: RDX:RAX := RAX * rhs.
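+ /// e.g. for `size == 8`, the unsigned form is `mul %rbx`: the low half of the product is
+ /// left in %rax and the high half in %rdx.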
+ MulHi { size: u8, signed: bool, rhs: RegMem },
+
+ /// A synthetic sequence to implement the right inline checks for remainder and division,
+ /// assuming the dividend is in %rax.
+ /// Puts the result back into %rax if is_div, %rdx if !is_div, to mimic what the div
+ /// instruction does.
+ /// The generated code sequence is described in the emit's function match arm for this
+ /// instruction.
+ ///
+ /// Note: %rdx is marked as modified by this instruction, to avoid an early clobber problem
+ /// with the temporary and divisor registers. Make sure to zero %rdx right before this
+ /// instruction, or you might run into regalloc failures where %rdx is live before its first
+ /// def!
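+ ///
+ /// For example, a typical use is: put the dividend in %rax, zero %rdx immediately beforehand
+ /// (as noted above), then emit this sequence; the result is left in %rax for a division or
+ /// %rdx for a remainder.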
+ CheckedDivOrRemSeq {
+ kind: DivOrRemKind,
+ size: u8,
+ /// The divisor operand. Note it's marked as modified so that it gets assigned a register
+ /// different from the temporary.
+ divisor: Writable<Reg>,
+ tmp: Option<Writable<Reg>>,
+ },
+
+ /// Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo)
+ /// or of %al into %ax (filling %ah with the sign of %al): (cbw)
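+ /// e.g. for `size == 4` this is `cdq`, which copies bit 31 of %eax into every bit of %edx.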
+ SignExtendData {
+ size: u8, // 1, 2, 4 or 8
+ },
+
+ /// Constant materialization: (imm32 imm64) reg.
+ /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg64.
+ Imm {
+ dst_is_64: bool,
+ simm64: u64,
+ dst: Writable<Reg>,
+ },
+
+ /// GPR to GPR move: mov (64 32) reg reg.
+ MovRR {
+ is_64: bool,
+ src: Reg,
+ dst: Writable<Reg>,
+ },
+
+ /// Zero-extended loads, except for 64 bits: movz (bl bq wl wq lq) addr reg.
+ /// Note that the lq variant doesn't really exist since the default zero-extend rule makes it
+ /// unnecessary. For that case we emit the equivalent "movl AM, reg32".
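+ /// (The emit tests above show this: the `ExtMode::LQ` form is printed as a plain `movl`.)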
+ MovzxRmR {
+ ext_mode: ExtMode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// A plain 64-bit integer load, since MovzxRmR can't represent that.
+ Mov64MR {
+ src: SyntheticAmode,
+ dst: Writable<Reg>,
+ },
+
+ /// Loads the memory address of addr into dst.
+ LoadEffectiveAddress {
+ addr: SyntheticAmode,
+ dst: Writable<Reg>,
+ },
+
+ /// Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
+ MovsxRmR {
+ ext_mode: ExtMode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Integer stores: mov (b w l q) reg addr.
+ MovRM {
+ size: u8, // 1, 2, 4 or 8.
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// Shifts and rotates: (shl shr sar rol ror) (b w l q) imm reg.
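+ /// e.g. `shift_r(4, ShiftKind::ShiftLeft, None, w_rdi)` prints as `shll %cl, %edi` in the
+ /// emit tests above.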
+ ShiftR {
+ size: u8, // 1, 2, 4 or 8
+ kind: ShiftKind,
+ /// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl".
+ num_bits: Option<u8>,
+ dst: Writable<Reg>,
+ },
+
+ /// SIMD shifts (arithmetic and logical): psra/psrl/psll, with a register, memory, or
+ /// immediate shift amount.
+ XmmRmiReg {
+ opcode: SseOpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ },
+
+ /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg.
+ CmpRmiR {
+ size: u8, // 1, 2, 4 or 8
+ src: RegMemImm,
+ dst: Reg,
+ },
+
+ /// Materializes the requested condition code in the destination reg.
+ Setcc { cc: CC, dst: Writable<Reg> },
+
+ /// Integer conditional move.
+ /// Overwrites the destination register.
+ Cmove {
+ /// Possible values are 2, 4 or 8. Checked in the related factory.
+ size: u8,
+ cc: CC,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ // =====================================
+ // Stack manipulation.
+ /// pushq (reg addr imm)
+ Push64 { src: RegMemImm },
+
+ /// popq reg
+ Pop64 { dst: Writable<Reg> },
+
+ // =====================================
+ // Floating-point operations.
+ /// XMM (scalar or vector) binary op: (add sub and or xor mul ...) (32 64) (reg addr) reg
+ XmmRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
+ /// etc.
+ ///
+ /// This differs from XmmRmR in that the dst register of XmmUnaryRmR is not used in the
+ /// computation of the instruction dst value and so does not have to be a previously valid
+ /// value. This is characteristic of mov instructions.
+ XmmUnaryRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
+ XmmMovRM {
+ op: SseOpcode,
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// XMM (vector) unary op (to move a constant value into an xmm register): movups
+ XmmLoadConst {
+ src: VCodeConstant,
+ dst: Writable<Reg>,
+ ty: Type,
+ },
+
+ /// XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si
+ XmmToGpr {
+ op: SseOpcode,
+ src: Reg,
+ dst: Writable<Reg>,
+ dst_size: OperandSize,
+ },
+
+ /// XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d}
+ GprToXmm {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ src_size: OperandSize,
+ },
+
+ /// Converts an unsigned int64 to a float32/float64.
+ CvtUint64ToFloatSeq {
+ /// Is the target a 64-bit or a 32-bit register?
+ to_f64: bool,
+ /// A copy of the source register, fed by lowering. It is marked as modified during
+ /// register allocation to make sure that the temporary registers differ from the src
+ /// register, since both registers are live at the same time in the generated code
+ /// sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr1: Writable<Reg>,
+ tmp_gpr2: Writable<Reg>,
+ },
+
+ /// Converts a scalar xmm to a signed int32/int64.
+ CvtFloatToSintSeq {
+ dst_size: OperandSize,
+ src_size: OperandSize,
+ is_saturating: bool,
+ /// A copy of the source register, fed by lowering. It is marked as modified during
+ /// register allocation to make sure that the temporary xmm register differs from the src
+ /// register, since both registers are live at the same time in the generated code
+ /// sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ },
+
+ /// Converts a scalar xmm to an unsigned int32/int64.
+ CvtFloatToUintSeq {
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ /// A copy of the source register, fed by lowering, reused as a temporary. It is marked as
+ /// modified during register allocation to make sure that the temporary xmm register
+ /// differs from the src register, since both registers are live at the same time in the
+ /// generated code sequence.
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ },
+
+ /// A sequence to compute min/max with the proper NaN semantics for xmm registers.
+ XmmMinMaxSeq {
+ size: OperandSize,
+ is_min: bool,
+ lhs: Reg,
+ rhs_dst: Writable<Reg>,
+ },
+
+ /// XMM (scalar) conditional move.
+ /// Overwrites the destination register if cc is set.
+ XmmCmove {
+ /// Whether the cmove moves 32 or 64 bits.
+ is_64: bool,
+ cc: CC,
+ src: RegMem,
+ dst: Writable<Reg>,
+ },
+
+ /// Float comparisons/tests: ucomis{s,d} (reg addr) reg; writes %rflags.
+ XmmCmpRmR {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Reg,
+ },
+
+ /// A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg
+ XmmRmRImm {
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ imm: u8,
+ is64: bool,
+ },
+
+ // =====================================
+ // Control flow instructions.
+ /// Direct call: call simm32.
+ CallKnown {
+ dest: ExternalName,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ },
+
+ /// Indirect call: callq (reg mem).
+ CallUnknown {
+ dest: RegMem,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ },
+
+ /// Return.
+ Ret,
+
+ /// A placeholder instruction, generating no code, meaning that a function epilogue must be
+ /// inserted there.
+ EpiloguePlaceholder,
+
+ /// Jump to a known target: jmp simm32.
+ JmpKnown { dst: MachLabel },
+
+ /// One-way conditional branch: jcond cond target.
+ ///
+ /// This instruction is useful when we have conditional jumps depending on more than two
+ /// conditions, see for instance the lowering of Brz/brnz with Fcmp inputs.
+ ///
+ /// A note of caution: in contexts where the branch target is another block, this has to be the
+ /// same successor as the one specified in the terminator branch of the current block.
+ /// Otherwise, this might confuse register allocation by creating new invisible edges.
+ JmpIf { cc: CC, taken: MachLabel },
+
+ /// Two-way conditional branch: jcond cond target target.
+ /// Emitted as a compound sequence; the MachBuffer will shrink it as appropriate.
+ JmpCond {
+ cc: CC,
+ taken: MachLabel,
+ not_taken: MachLabel,
+ },
+
+ /// Jump-table sequence, as one compound instruction (see note in lower.rs for rationale).
+ /// The generated code sequence is described in the emit's function match arm for this
+ /// instruction.
+ /// See comment in lowering about the temporaries signedness.
+ JmpTableSeq {
+ idx: Reg,
+ tmp1: Writable<Reg>,
+ tmp2: Writable<Reg>,
+ default_target: MachLabel,
+ targets: Vec<MachLabel>,
+ targets_for_term: Vec<MachLabel>,
+ },
+
+ /// Indirect jump: jmpq (reg mem).
+ JmpUnknown { target: RegMem },
+
+ /// Traps if the condition code is set.
+ TrapIf { cc: CC, trap_code: TrapCode },
+
+ /// A debug trap.
+ Hlt,
+
+ /// An instruction that will always trigger the illegal instruction exception.
+ Ud2 { trap_code: TrapCode },
+
+ /// Loads an external symbol into a register, with a relocation: movabsq $name, dst
+ LoadExtName {
+ dst: Writable<Reg>,
+ name: Box<ExternalName>,
+ offset: i64,
+ },
+
+ // =====================================
+ // Instructions pertaining to atomic memory accesses.
+ /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions:
+ ///
+ /// `dst` (read) address
+ /// `src` (read) replacement value
+ /// %rax (modified) in: expected value, out: value that was actually at `dst`
+ /// %rflags is written. Do not assume anything about it after the instruction.
+ ///
+ /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as
+ /// they were before.
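+ ///
+ /// For reference, the underlying `cmpxchg` behaves roughly as follows (all atomically):
+ ///
+ /// ```text
+ /// if mem[dst] == %rax { mem[dst] = src; ZF = 1 } else { %rax = mem[dst]; ZF = 0 }
+ /// ```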
+ LockCmpxchg {
+ ty: Type, // I8, I16, I32 or I64
+ src: Reg,
+ dst: SyntheticAmode,
+ },
+
+ /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction.
+ /// This atomically modifies a value in memory and returns the old value. The sequence
+ /// consists of an initial "normal" load from `dst`, followed by a loop which computes the
+ /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native
+ /// instruction `lock cmpxchg{b,w,l,q}`. The loop iterates until the CAS is successful.
+ /// If there is no contention, there will be only one pass through the loop body. The
+ /// sequence does *not* perform any explicit memory fence instructions
+ /// (mfence/sfence/lfence).
+ ///
+ /// Note that the transaction is atomic in the sense that, as observed by some other thread,
+ /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense
+ /// of guaranteeing that no other thread writes to `dst` in between the initial load and the
+ /// CAS -- but that would cause the CAS to fail unless the other thread's last write before
+ /// the CAS wrote the same value that was already there. In other words, this
+ /// implementation suffers (unavoidably) from the A-B-A problem.
+ ///
+ /// This instruction sequence has fixed register uses as follows:
+ ///
+ /// %r9 (read) address
+ /// %r10 (read) second operand for `op`
+ /// %r11 (written) scratch reg; value afterwards has no meaning
+ /// %rax (written) the old value at %r9
+ /// %rflags is written. Do not assume anything about it after the instruction.
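+ ///
+ /// As a rough sketch (see the emit code for the authoritative sequence), the I64 `Add`
+ /// case expands to something like:
+ ///
+ /// ```text
+ /// movq (%r9), %rax
+ /// again:
+ /// movq %rax, %r11
+ /// addq %r10, %r11
+ /// lock cmpxchgq %r11, (%r9)
+ /// jnz again
+ /// ```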
+ AtomicRmwSeq {
+ ty: Type, // I8, I16, I32 or I64
+ op: inst_common::AtomicRmwOp,
+ },
+
+ /// A memory fence (mfence, lfence or sfence).
+ Fence { kind: FenceKind },
+
+ // =====================================
+ // Meta-instructions generating no code.
+ /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
+ /// controls how SyntheticAmode::NominalSPOffset args are lowered.
+ VirtualSPOffsetAdj { offset: i64 },
+
+ /// Provides a way to tell the register allocator that the upcoming sequence of instructions
+ /// will overwrite `dst` so it should be considered as a `def`; use this with care.
+ ///
+ /// This is useful when we have a sequence of instructions whose register usages are nominally
+ /// `mod`s, but such that the combination of operations creates a result that is independent of
+ /// the initial register value. It's thus semantically a `def`, not a `mod`, when all the
+ /// instructions are taken together, so we want to ensure the register is defined (its
+ /// live-range starts) prior to the sequence to keep analyses happy.
+ ///
+ /// One alternative would be a compound instruction that somehow encapsulates the others and
+ /// reports its own `def`s/`use`s/`mod`s; this adds complexity (the instruction list is no
+ /// longer flat) and requires knowledge about semantics and initial-value independence anyway.
+ XmmUninitializedValue { dst: Writable<Reg> },
+}
+
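+/// Returns true iff the low 32 bits of `x`, sign-extended back to 64 bits, reproduce the full
+/// value. For example, 0x7fff_ffff and 0xffff_ffff_8000_0000 do (true), whereas
+/// 0x0000_0000_8000_0000 does not (false).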
+pub(crate) fn low32_will_sign_extend_to_64(x: u64) -> bool {
+ let xs = x as i64;
+ xs == ((xs << 32) >> 32)
+}
+
+impl Inst {
+ fn isa_requirement(&self) -> Option<InstructionSet> {
+ match self {
+            // These instructions are part of the x64 baseline ISA (at most SSE2), which is a
+            // basic requirement in Cranelift, and don't have to be checked.
+ Inst::AluRmiR { .. }
+ | Inst::AtomicRmwSeq { .. }
+ | Inst::CallKnown { .. }
+ | Inst::CallUnknown { .. }
+ | Inst::CheckedDivOrRemSeq { .. }
+ | Inst::Cmove { .. }
+ | Inst::CmpRmiR { .. }
+ | Inst::CvtFloatToSintSeq { .. }
+ | Inst::CvtFloatToUintSeq { .. }
+ | Inst::CvtUint64ToFloatSeq { .. }
+ | Inst::Div { .. }
+ | Inst::EpiloguePlaceholder
+ | Inst::Fence { .. }
+ | Inst::Hlt
+ | Inst::Imm { .. }
+ | Inst::JmpCond { .. }
+ | Inst::JmpIf { .. }
+ | Inst::JmpKnown { .. }
+ | Inst::JmpTableSeq { .. }
+ | Inst::JmpUnknown { .. }
+ | Inst::LoadEffectiveAddress { .. }
+ | Inst::LoadExtName { .. }
+ | Inst::LockCmpxchg { .. }
+ | Inst::Mov64MR { .. }
+ | Inst::MovRM { .. }
+ | Inst::MovRR { .. }
+ | Inst::MovsxRmR { .. }
+ | Inst::MovzxRmR { .. }
+ | Inst::MulHi { .. }
+ | Inst::Neg { .. }
+ | Inst::Not { .. }
+ | Inst::Nop { .. }
+ | Inst::Pop64 { .. }
+ | Inst::Push64 { .. }
+ | Inst::Ret
+ | Inst::Setcc { .. }
+ | Inst::ShiftR { .. }
+ | Inst::SignExtendData { .. }
+ | Inst::TrapIf { .. }
+ | Inst::Ud2 { .. }
+ | Inst::UnaryRmR { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::XmmCmove { .. }
+ | Inst::XmmCmpRmR { .. }
+ | Inst::XmmLoadConst { .. }
+ | Inst::XmmMinMaxSeq { .. }
+ | Inst::XmmUninitializedValue { .. } => None,
+
+ // These use dynamic SSE opcodes.
+ Inst::GprToXmm { op, .. }
+ | Inst::XmmMovRM { op, .. }
+ | Inst::XmmRmiReg { opcode: op, .. }
+ | Inst::XmmRmR { op, .. }
+ | Inst::XmmRmRImm { op, .. }
+ | Inst::XmmToGpr { op, .. }
+ | Inst::XmmUnaryRmR { op, .. } => Some(op.available_from()),
+ }
+ }
+}
+
+// Handy constructors for Insts.
+
+impl Inst {
+ pub(crate) fn nop(len: u8) -> Self {
+ debug_assert!(len <= 16);
+ Self::Nop { len }
+ }
+
+ pub(crate) fn alu_rmi_r(
+ is_64: bool,
+ op: AluRmiROpcode,
+ src: RegMemImm,
+ dst: Writable<Reg>,
+ ) -> Self {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Self::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst,
+ }
+ }
+
+ pub(crate) fn unary_rm_r(
+ size: u8,
+ op: UnaryRmROpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ ) -> Self {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2);
+ Self::UnaryRmR { size, op, src, dst }
+ }
+
+ pub(crate) fn not(size: u8, src: Writable<Reg>) -> Inst {
+ debug_assert_eq!(src.to_reg().get_class(), RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Not { size, src }
+ }
+
+ pub(crate) fn neg(size: u8, src: Writable<Reg>) -> Inst {
+ debug_assert_eq!(src.to_reg().get_class(), RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Neg { size, src }
+ }
+
+ pub(crate) fn div(size: u8, signed: bool, divisor: RegMem) -> Inst {
+ divisor.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ }
+ }
+
+ pub(crate) fn mul_hi(size: u8, signed: bool, rhs: RegMem) -> Inst {
+ rhs.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::MulHi { size, signed, rhs }
+ }
+
+ pub(crate) fn checked_div_or_rem_seq(
+ kind: DivOrRemKind,
+ size: u8,
+ divisor: Writable<Reg>,
+ tmp: Option<Writable<Reg>>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(divisor.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp
+ .map(|tmp| tmp.to_reg().get_class() == RegClass::I64)
+ .unwrap_or(true));
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ tmp,
+ }
+ }
+
+ pub(crate) fn sign_extend_data(size: u8) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ Inst::SignExtendData { size }
+ }
+
+ pub(crate) fn imm(size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+        // Try to generate a 32-bit immediate when the upper 32 bits are zero (which matches
+        // the semantics of movl).
+ let dst_is_64 = size == OperandSize::Size64 && simm64 > u32::max_value() as u64;
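+        // For example (illustrative values): simm64 = 0xffff_ffff still fits the movl form (the
+        // destination is zero-extended), whereas simm64 = 0x1_0000_0000 needs the movabsq form.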
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ }
+ }
+
+ pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst {
+ debug_assert!(src.get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovRR { is_64, src, dst }
+ }
+
+ // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
+ pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUnaryRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ debug_assert!(ty.is_vector() && ty.bits() == 128);
+ Inst::XmmLoadConst { src, dst, ty }
+ }
+
+    /// A convenience helper for unary float operations.
+ pub(crate) fn xmm_unary_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUnaryRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmRmR { op, src, dst }
+ }
+
+ pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmUninitializedValue { dst }
+ }
+
+ pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
+ debug_assert!(src.get_class() == RegClass::V128);
+ Inst::XmmMovRM {
+ op,
+ src,
+ dst: dst.into(),
+ }
+ }
+
+ pub(crate) fn xmm_to_gpr(
+ op: SseOpcode,
+ src: Reg,
+ dst: Writable<Reg>,
+ dst_size: OperandSize,
+ ) -> Inst {
+ debug_assert!(src.get_class() == RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ }
+ }
+
+ pub(crate) fn gpr_to_xmm(
+ op: SseOpcode,
+ src: RegMem,
+ src_size: OperandSize,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::GprToXmm {
+ op,
+ src,
+ dst,
+ src_size,
+ }
+ }
+
+ pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.get_class() == RegClass::V128);
+ Inst::XmmCmpRmR { op, src, dst }
+ }
+
+ pub(crate) fn cvt_u64_to_float_seq(
+ to_f64: bool,
+ src: Writable<Reg>,
+ tmp_gpr1: Writable<Reg>,
+ tmp_gpr2: Writable<Reg>,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp_gpr1.to_reg().get_class() == RegClass::I64);
+ debug_assert!(tmp_gpr2.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::CvtUint64ToFloatSeq {
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ to_f64,
+ }
+ }
+
+ pub(crate) fn cvt_float_to_sint_seq(
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::CvtFloatToSintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ }
+ }
+
+ pub(crate) fn cvt_float_to_uint_seq(
+ src_size: OperandSize,
+ dst_size: OperandSize,
+ is_saturating: bool,
+ src: Writable<Reg>,
+ dst: Writable<Reg>,
+ tmp_gpr: Writable<Reg>,
+ tmp_xmm: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(src.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128);
+ debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::CvtFloatToUintSeq {
+ src_size,
+ dst_size,
+ is_saturating,
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ }
+ }
+
+ pub(crate) fn xmm_min_max_seq(
+ size: OperandSize,
+ is_min: bool,
+ lhs: Reg,
+ rhs_dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert_eq!(lhs.get_class(), RegClass::V128);
+ debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128);
+ Inst::XmmMinMaxSeq {
+ size,
+ is_min,
+ lhs,
+ rhs_dst,
+ }
+ }
+
+ pub(crate) fn xmm_rm_r_imm(
+ op: SseOpcode,
+ src: RegMem,
+ dst: Writable<Reg>,
+ imm: u8,
+ is64: bool,
+ ) -> Inst {
+ Inst::XmmRmRImm {
+ op,
+ src,
+ dst,
+ imm,
+ is64,
+ }
+ }
+
+ pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovzxRmR { ext_mode, src, dst }
+ }
+
+ pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmRmiReg { opcode, src, dst }
+ }
+
+ pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::MovsxRmR { ext_mode, src, dst }
+ }
+
+ pub(crate) fn mov64_m_r(src: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Mov64MR {
+ src: src.into(),
+ dst,
+ }
+ }
+
+    /// A convenience function that allows a `RegMem` to be used as the source of a move.
+ pub(crate) fn mov64_rm_r(src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ match src {
+ RegMem::Reg { reg } => Self::mov_r_r(true, reg, dst),
+ RegMem::Mem { addr } => Self::mov64_m_r(addr, dst),
+ }
+ }
+
+ pub(crate) fn mov_r_m(
+ size: u8, // 1, 2, 4 or 8
+ src: Reg,
+ dst: impl Into<SyntheticAmode>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(src.get_class() == RegClass::I64);
+ Inst::MovRM {
+ size,
+ src,
+ dst: dst.into(),
+ }
+ }
+
+ pub(crate) fn lea(addr: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::LoadEffectiveAddress {
+ addr: addr.into(),
+ dst,
+ }
+ }
+
+ pub(crate) fn shift_r(
+ size: u8,
+ kind: ShiftKind,
+ num_bits: Option<u8>,
+ dst: Writable<Reg>,
+ ) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(if let Some(num_bits) = num_bits {
+ num_bits < size * 8
+ } else {
+ true
+ });
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ }
+ }
+
+    /// Performs a comparison of `dst - src` for operands of size `size`, as per the machine
+    /// instruction's semantics. Be careful with the order of parameters!
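+    /// For example, `cmp_rmi_r(8, RegMemImm::reg(rhs), lhs)` sets the flags according to
+    /// `lhs - rhs`, i.e. the AT&T form `cmpq %rhs, %lhs`.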
+ pub(crate) fn cmp_rmi_r(
+ size: u8, // 1, 2, 4 or 8
+ src: RegMemImm,
+ dst: Reg,
+ ) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
+ debug_assert!(dst.get_class() == RegClass::I64);
+ Inst::CmpRmiR { size, src, dst }
+ }
+
+ pub(crate) fn trap(trap_code: TrapCode) -> Inst {
+        Inst::Ud2 { trap_code }
+ }
+
+ pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Setcc { cc, dst }
+ }
+
+ pub(crate) fn cmove(size: u8, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
+ debug_assert!(size == 8 || size == 4 || size == 2);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Cmove { size, cc, src, dst }
+ }
+
+ pub(crate) fn xmm_cmove(is_64: bool, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
+ src.assert_regclass_is(RegClass::V128);
+ debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ }
+ }
+
+ pub(crate) fn push64(src: RegMemImm) -> Inst {
+ src.assert_regclass_is(RegClass::I64);
+ Inst::Push64 { src }
+ }
+
+ pub(crate) fn pop64(dst: Writable<Reg>) -> Inst {
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+ Inst::Pop64 { dst }
+ }
+
+ pub(crate) fn call_known(
+ dest: ExternalName,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ ) -> Inst {
+ Inst::CallKnown {
+ dest,
+ uses,
+ defs,
+ opcode,
+ }
+ }
+
+ pub(crate) fn call_unknown(
+ dest: RegMem,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: Opcode,
+ ) -> Inst {
+ dest.assert_regclass_is(RegClass::I64);
+ Inst::CallUnknown {
+ dest,
+ uses,
+ defs,
+ opcode,
+ }
+ }
+
+ pub(crate) fn ret() -> Inst {
+ Inst::Ret
+ }
+
+ pub(crate) fn epilogue_placeholder() -> Inst {
+ Inst::EpiloguePlaceholder
+ }
+
+ pub(crate) fn jmp_known(dst: MachLabel) -> Inst {
+ Inst::JmpKnown { dst }
+ }
+
+ pub(crate) fn jmp_if(cc: CC, taken: MachLabel) -> Inst {
+ Inst::JmpIf { cc, taken }
+ }
+
+ pub(crate) fn jmp_cond(cc: CC, taken: MachLabel, not_taken: MachLabel) -> Inst {
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ }
+ }
+
+ pub(crate) fn jmp_unknown(target: RegMem) -> Inst {
+ target.assert_regclass_is(RegClass::I64);
+ Inst::JmpUnknown { target }
+ }
+
+ pub(crate) fn trap_if(cc: CC, trap_code: TrapCode) -> Inst {
+ Inst::TrapIf { cc, trap_code }
+ }
+
+ /// Choose which instruction to use for loading a register value from memory. For loads smaller
+ /// than 64 bits, this method expects a way to extend the value (i.e. [ExtKind::SignExtend],
+ /// [ExtKind::ZeroExtend]); loads with no extension necessary will ignore this.
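+    /// For example, an `I8` load with [ExtKind::ZeroExtend] becomes a `movzbq`, while an `F32`
+    /// load into an XMM register becomes a `movss`.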
+ pub(crate) fn load(
+ ty: Type,
+ from_addr: impl Into<SyntheticAmode>,
+ to_reg: Writable<Reg>,
+ ext_kind: ExtKind,
+ ) -> Inst {
+ let rc = to_reg.to_reg().get_class();
+ match rc {
+ RegClass::I64 => {
+ let ext_mode = match ty.bytes() {
+ 1 => Some(ExtMode::BQ),
+ 2 => Some(ExtMode::WQ),
+ 4 => Some(ExtMode::LQ),
+ 8 => None,
+ _ => unreachable!("the type should never use a scalar load: {}", ty),
+ };
+ if let Some(ext_mode) = ext_mode {
+ // Values smaller than 64 bits must be extended in some way.
+ match ext_kind {
+ ExtKind::SignExtend => {
+ Inst::movsx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg)
+ }
+ ExtKind::ZeroExtend => {
+ Inst::movzx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg)
+ }
+ ExtKind::None => panic!(
+ "expected an extension kind for extension mode: {:?}",
+ ext_mode
+ ),
+ }
+ } else {
+ // 64-bit values can be moved directly.
+ Inst::mov64_m_r(from_addr, to_reg)
+ }
+ }
+ RegClass::V128 => {
+ let opcode = match ty {
+ types::F32 => SseOpcode::Movss,
+ types::F64 => SseOpcode::Movsd,
+ types::F32X4 => SseOpcode::Movups,
+ types::F64X2 => SseOpcode::Movupd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu,
+ _ => unimplemented!("unable to load type: {}", ty),
+ };
+ Inst::xmm_unary_rm_r(opcode, RegMem::mem(from_addr), to_reg)
+ }
+ _ => panic!("unable to generate load for register class: {:?}", rc),
+ }
+ }
+
+ /// Choose which instruction to use for storing a register value to memory.
+ pub(crate) fn store(ty: Type, from_reg: Reg, to_addr: impl Into<SyntheticAmode>) -> Inst {
+ let rc = from_reg.get_class();
+ match rc {
+ RegClass::I64 => {
+ // Always store the full register, to ensure that the high bits are properly set
+ // when doing a full reload.
+ Inst::mov_r_m(8 /* bytes */, from_reg, to_addr)
+ }
+ RegClass::V128 => {
+ let opcode = match ty {
+ types::F32 => SseOpcode::Movss,
+ types::F64 => SseOpcode::Movsd,
+ types::F32X4 => SseOpcode::Movups,
+ types::F64X2 => SseOpcode::Movupd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu,
+ _ => unimplemented!("unable to store type: {}", ty),
+ };
+ Inst::xmm_mov_r_m(opcode, from_reg, to_addr)
+ }
+ _ => panic!("unable to generate store for register class: {:?}", rc),
+ }
+ }
+}
+
+// Inst helpers.
+
+impl Inst {
+ /// In certain cases, instructions of this format can act as a definition of an XMM register,
+ /// producing a value that is independent of its initial value.
+ ///
+ /// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to
+ /// itself will generate all ones as a result, regardless of its value. From the register
+ /// allocator's point of view, we should (i) record the first register, which is normally a
+ /// mod, as a def instead; and (ii) not record the second register as a use, because it is the
+ /// same as the first register (already handled).
+ fn produces_const(&self) -> bool {
+ match self {
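+            // `xor r, r` and `sub r, r` both produce zero regardless of the initial value of `r`.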
+ Self::AluRmiR { op, src, dst, .. } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub)
+ }
+
+ Self::XmmRmR { op, src, dst, .. } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == SseOpcode::Xorps
+ || *op == SseOpcode::Xorpd
+ || *op == SseOpcode::Pxor
+ || *op == SseOpcode::Pcmpeqb
+ || *op == SseOpcode::Pcmpeqw
+ || *op == SseOpcode::Pcmpeqd
+ || *op == SseOpcode::Pcmpeqq)
+ }
+
+ Self::XmmRmRImm {
+ op, src, dst, imm, ..
+ } => {
+ src.to_reg() == Some(dst.to_reg())
+ && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps)
+ && *imm == FcmpImm::Equal.encode()
+ }
+
+ _ => false,
+ }
+ }
+
+ /// Choose which instruction to use for comparing two values for equality.
+ pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to),
+ types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to),
+ types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to),
+ types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to),
+ types::F32X4 => {
+ Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false)
+ }
+ types::F64X2 => {
+ Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false)
+ }
+ _ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise AND on two values.
+ pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
+ _ => unimplemented!("unimplemented type for Inst::and: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise AND NOT on two values.
+ pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
+ _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise OR on two values.
+ pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
+ _ => unimplemented!("unimplemented type for Inst::or: {}", ty),
+ }
+ }
+
+ /// Choose which instruction to use for computing a bitwise XOR on two values.
+ pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
+ match ty {
+ types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
+ types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
+ _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
+ _ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: printing
+
+impl PrettyPrint for Inst {
+ fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+ fn ljustify(s: String) -> String {
+ let w = 7;
+ if s.len() >= w {
+ s
+ } else {
+ let need = usize::min(w, w - s.len());
+ s + &format!("{nil: <width$}", nil = "", width = need)
+ }
+ }
+
+ fn ljustify2(s1: String, s2: String) -> String {
+ ljustify(s1 + &s2)
+ }
+
+ fn suffix_lq(is_64: bool) -> String {
+ (if is_64 { "q" } else { "l" }).to_string()
+ }
+
+ fn size_lq(is_64: bool) -> u8 {
+ if is_64 {
+ 8
+ } else {
+ 4
+ }
+ }
+
+ fn suffix_bwlq(size: u8) -> String {
+ match size {
+ 1 => "b".to_string(),
+ 2 => "w".to_string(),
+ 4 => "l".to_string(),
+ 8 => "q".to_string(),
+ _ => panic!("Inst(x64).show.suffixBWLQ: size={}", size),
+ }
+ }
+
+ match self {
+ Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),
+
+ Inst::AluRmiR {
+ is_64,
+ op,
+ src,
+ dst,
+ } => format!(
+ "{} {}, {}",
+ ljustify2(op.to_string(), suffix_lq(*is_64)),
+ src.show_rru_sized(mb_rru, size_lq(*is_64)),
+ show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)),
+ ),
+
+ Inst::UnaryRmR { src, dst, op, size } => format!(
+ "{} {}, {}",
+ ljustify2(op.to_string(), suffix_bwlq(*size)),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size),
+ ),
+
+ Inst::Not { size, src } => format!(
+ "{} {}",
+ ljustify2("not".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(src.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::Neg { size, src } => format!(
+ "{} {}",
+ ljustify2("neg".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(src.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::Div {
+ size,
+ signed,
+ divisor,
+ ..
+ } => format!(
+ "{} {}",
+ ljustify(if *signed {
+ "idiv".to_string()
+ } else {
+ "div".into()
+ }),
+ divisor.show_rru_sized(mb_rru, *size)
+ ),
+
+ Inst::MulHi {
+ size, signed, rhs, ..
+ } => format!(
+ "{} {}",
+ ljustify(if *signed {
+ "imul".to_string()
+ } else {
+ "mul".to_string()
+ }),
+ rhs.show_rru_sized(mb_rru, *size)
+ ),
+
+ Inst::CheckedDivOrRemSeq {
+ kind,
+ size,
+ divisor,
+ ..
+ } => format!(
+ "{} $rax:$rdx, {}",
+ match kind {
+ DivOrRemKind::SignedDiv => "sdiv",
+ DivOrRemKind::UnsignedDiv => "udiv",
+ DivOrRemKind::SignedRem => "srem",
+ DivOrRemKind::UnsignedRem => "urem",
+ },
+ show_ireg_sized(divisor.to_reg(), mb_rru, *size),
+ ),
+
+ Inst::SignExtendData { size } => match size {
+ 1 => "cbw",
+ 2 => "cwd",
+ 4 => "cdq",
+ 8 => "cqo",
+ _ => unreachable!(),
+ }
+ .into(),
+
+ Inst::XmmUnaryRmR { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, op.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmMovRM { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ show_ireg_sized(*src, mb_rru, 8),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmRmR { op, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmMinMaxSeq {
+ lhs,
+ rhs_dst,
+ is_min,
+ size,
+ } => format!(
+ "{} {}, {}",
+ ljustify2(
+ if *is_min {
+ "xmm min seq ".to_string()
+ } else {
+ "xmm max seq ".to_string()
+ },
+ match size {
+ OperandSize::Size32 => "f32",
+ OperandSize::Size64 => "f64",
+ }
+ .into()
+ ),
+ show_ireg_sized(*lhs, mb_rru, 8),
+ show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!(
+ "{} ${}, {}, {}",
+ ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
+ imm,
+ src.show_rru(mb_rru),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmUninitializedValue { dst } => format!(
+ "{} {}",
+ ljustify("uninit".into()),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::XmmLoadConst { src, dst, .. } => {
+ format!("load_const {:?}, {}", src, dst.show_rru(mb_rru),)
+ }
+
+ Inst::XmmToGpr {
+ op,
+ src,
+ dst,
+ dst_size,
+ } => {
+ let dst_size = match dst_size {
+ OperandSize::Size32 => 4,
+ OperandSize::Size64 => 8,
+ };
+ format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru(mb_rru),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size),
+ )
+ }
+
+ Inst::GprToXmm {
+ op,
+ src,
+ src_size,
+ dst,
+ } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, src_size.to_bytes()),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::XmmCmpRmR { op, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(op.to_string()),
+ src.show_rru_sized(mb_rru, 8),
+ show_ireg_sized(*dst, mb_rru, 8),
+ ),
+
+ Inst::CvtUint64ToFloatSeq {
+ src, dst, to_f64, ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "u64_to_{}_seq",
+ if *to_f64 { "f64" } else { "f32" }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ dst.show_rru(mb_rru),
+ ),
+
+ Inst::CvtFloatToSintSeq {
+ src,
+ dst,
+ src_size,
+ dst_size,
+ ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "cvt_float{}_to_sint{}_seq",
+ if *src_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ },
+ if *dst_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()),
+ ),
+
+ Inst::CvtFloatToUintSeq {
+ src,
+ dst,
+ src_size,
+ dst_size,
+ ..
+ } => format!(
+ "{} {}, {}",
+ ljustify(format!(
+ "cvt_float{}_to_uint{}_seq",
+ if *src_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ },
+ if *dst_size == OperandSize::Size64 {
+ "64"
+ } else {
+ "32"
+ }
+ )),
+ show_ireg_sized(src.to_reg(), mb_rru, 8),
+ show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()),
+ ),
+
+ Inst::Imm {
+ dst_is_64,
+ simm64,
+ dst,
+ } => {
+ if *dst_is_64 {
+ format!(
+ "{} ${}, {}",
+ ljustify("movabsq".to_string()),
+ *simm64 as i64,
+ show_ireg_sized(dst.to_reg(), mb_rru, 8)
+ )
+ } else {
+ format!(
+ "{} ${}, {}",
+ ljustify("movl".to_string()),
+ (*simm64 as u32) as i32,
+ show_ireg_sized(dst.to_reg(), mb_rru, 4)
+ )
+ }
+ }
+
+ Inst::MovRR { is_64, src, dst } => format!(
+ "{} {}, {}",
+ ljustify2("mov".to_string(), suffix_lq(*is_64)),
+ show_ireg_sized(*src, mb_rru, size_lq(*is_64)),
+ show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64))
+ ),
+
+ Inst::MovzxRmR {
+ ext_mode, src, dst, ..
+ } => {
+ if *ext_mode == ExtMode::LQ {
+ format!(
+ "{} {}, {}",
+ ljustify("movl".to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 4)
+ )
+ } else {
+ format!(
+ "{} {}, {}",
+ ljustify2("movz".to_string(), ext_mode.to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
+ )
+ }
+ }
+
+ Inst::Mov64MR { src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify("movq".to_string()),
+ src.show_rru(mb_rru),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::LoadEffectiveAddress { addr, dst } => format!(
+ "{} {}, {}",
+ ljustify("lea".to_string()),
+ addr.show_rru(mb_rru),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::MovsxRmR {
+ ext_mode, src, dst, ..
+ } => format!(
+ "{} {}, {}",
+ ljustify2("movs".to_string(), ext_mode.to_string()),
+ src.show_rru_sized(mb_rru, ext_mode.src_size()),
+ show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
+ ),
+
+ Inst::MovRM { size, src, dst, .. } => format!(
+ "{} {}, {}",
+ ljustify2("mov".to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(*src, mb_rru, *size),
+ dst.show_rru(mb_rru)
+ ),
+
+ Inst::ShiftR {
+ size,
+ kind,
+ num_bits,
+ dst,
+ } => match num_bits {
+ None => format!(
+ "{} %cl, {}",
+ ljustify2(kind.to_string(), suffix_bwlq(*size)),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+
+ Some(num_bits) => format!(
+ "{} ${}, {}",
+ ljustify2(kind.to_string(), suffix_bwlq(*size)),
+ num_bits,
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+ },
+
+ Inst::XmmRmiReg { opcode, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(opcode.to_string()),
+ src.show_rru(mb_rru),
+ dst.to_reg().show_rru(mb_rru)
+ ),
+
+ Inst::CmpRmiR { size, src, dst } => format!(
+ "{} {}, {}",
+ ljustify2("cmp".to_string(), suffix_bwlq(*size)),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(*dst, mb_rru, *size)
+ ),
+
+ Inst::Setcc { cc, dst } => format!(
+ "{} {}",
+ ljustify2("set".to_string(), cc.to_string()),
+ show_ireg_sized(dst.to_reg(), mb_rru, 1)
+ ),
+
+ Inst::Cmove { size, cc, src, dst } => format!(
+ "{} {}, {}",
+ ljustify(format!("cmov{}{}", cc.to_string(), suffix_bwlq(*size))),
+ src.show_rru_sized(mb_rru, *size),
+ show_ireg_sized(dst.to_reg(), mb_rru, *size)
+ ),
+
+ Inst::XmmCmove {
+ is_64,
+ cc,
+ src,
+ dst,
+ } => {
+ let size = if *is_64 { 8 } else { 4 };
+ format!(
+ "j{} $next; mov{} {}, {}; $next: ",
+ cc.invert().to_string(),
+ if *is_64 { "sd" } else { "ss" },
+ src.show_rru_sized(mb_rru, size),
+ show_ireg_sized(dst.to_reg(), mb_rru, size)
+ )
+ }
+
+ Inst::Push64 { src } => {
+ format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
+ }
+
+ Inst::Pop64 { dst } => {
+ format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru))
+ }
+
+ Inst::CallKnown { dest, .. } => format!("{} {:?}", ljustify("call".to_string()), dest),
+
+ Inst::CallUnknown { dest, .. } => format!(
+ "{} *{}",
+ ljustify("call".to_string()),
+ dest.show_rru(mb_rru)
+ ),
+
+ Inst::Ret => "ret".to_string(),
+
+ Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
+
+ Inst::JmpKnown { dst } => {
+ format!("{} {}", ljustify("jmp".to_string()), dst.to_string())
+ }
+
+ Inst::JmpIf { cc, taken } => format!(
+ "{} {}",
+ ljustify2("j".to_string(), cc.to_string()),
+ taken.to_string(),
+ ),
+
+ Inst::JmpCond {
+ cc,
+ taken,
+ not_taken,
+ } => format!(
+ "{} {}; j {}",
+ ljustify2("j".to_string(), cc.to_string()),
+ taken.to_string(),
+ not_taken.to_string()
+ ),
+
+ Inst::JmpTableSeq { idx, .. } => {
+ format!("{} {}", ljustify("br_table".into()), idx.show_rru(mb_rru))
+ }
+
+ Inst::JmpUnknown { target } => format!(
+ "{} *{}",
+ ljustify("jmp".to_string()),
+ target.show_rru(mb_rru)
+ ),
+
+ Inst::TrapIf { cc, trap_code, .. } => {
+ format!("j{} ; ud2 {} ;", cc.invert().to_string(), trap_code)
+ }
+
+ Inst::LoadExtName {
+ dst, name, offset, ..
+ } => format!(
+ "{} {}+{}, {}",
+ ljustify("movaps".into()),
+ name,
+ offset,
+ show_ireg_sized(dst.to_reg(), mb_rru, 8),
+ ),
+
+ Inst::LockCmpxchg { ty, src, dst, .. } => {
+ let size = ty.bytes() as u8;
+ format!("lock cmpxchg{} {}, {}",
+ suffix_bwlq(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru))
+ }
+
+ Inst::AtomicRmwSeq { ty, op, .. } => {
+ format!(
+ "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}",
+ ty.bits(), op)
+ },
+
+ Inst::Fence { kind } => {
+ match kind {
+ FenceKind::MFence => "mfence".to_string(),
+ FenceKind::LFence => "lfence".to_string(),
+ FenceKind::SFence => "sfence".to_string(),
+ }
+ }
+
+ Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
+
+ Inst::Hlt => "hlt".into(),
+
+ Inst::Ud2 { trap_code } => format!("ud2 {}", trap_code),
+ }
+ }
+}
+
+// Temp hook for legacy printing machinery
+impl fmt::Debug for Inst {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ // Print the insn without a Universe :-(
+ write!(fmt, "{}", self.show_rru(None))
+ }
+}
+
+fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
+ // This is a bit subtle. If some register is in the modified set, then it may not be in either
+ // the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
+ // regalloc.rs will "fix" this for us by removing the the modified set from the use and def
+ // sets.
+ match inst {
+ Inst::AluRmiR { src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::Not { src, .. } => {
+ collector.add_mod(*src);
+ }
+ Inst::Neg { src, .. } => {
+ collector.add_mod(*src);
+ }
+ Inst::Div { size, divisor, .. } => {
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ if *size == 1 {
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ } else {
+ collector.add_mod(Writable::from_reg(regs::rdx()));
+ }
+ divisor.get_regs_as_uses(collector);
+ }
+ Inst::MulHi { rhs, .. } => {
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ rhs.get_regs_as_uses(collector);
+ }
+ Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => {
+            // Mark both fixed registers as mods, to avoid an early-clobber problem in codegen
+            // (i.e. the temporary being allocated to one of the fixed registers). This requires
+            // writing the rdx register *before* the instruction, which is not too bad.
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ collector.add_mod(Writable::from_reg(regs::rdx()));
+ collector.add_mod(*divisor);
+ if let Some(tmp) = tmp {
+ collector.add_def(*tmp);
+ }
+ }
+ Inst::SignExtendData { size } => match size {
+ 1 => collector.add_mod(Writable::from_reg(regs::rax())),
+ 2 | 4 | 8 => {
+ collector.add_use(regs::rax());
+ collector.add_def(Writable::from_reg(regs::rdx()));
+ }
+ _ => unreachable!(),
+ },
+ Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::XmmRmR { src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::XmmRmRImm { op, src, dst, .. } => {
+ if inst.produces_const() {
+ // No need to account for src, since src == dst.
+ collector.add_def(*dst);
+ } else if *op == SseOpcode::Pextrb
+ || *op == SseOpcode::Pextrw
+ || *op == SseOpcode::Pextrd
+ || *op == SseOpcode::Pshufd
+ {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ } else {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ }
+ Inst::XmmUninitializedValue { dst } => collector.add_def(*dst),
+ Inst::XmmLoadConst { dst, .. } => collector.add_def(*dst),
+ Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => {
+ collector.add_use(*lhs);
+ collector.add_mod(*rhs_dst);
+ }
+ Inst::XmmRmiReg { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ Inst::XmmMovRM { src, dst, .. } => {
+ collector.add_use(*src);
+ dst.get_regs_as_uses(collector);
+ }
+ Inst::XmmCmpRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_use(*dst);
+ }
+ Inst::Imm { dst, .. } => {
+ collector.add_def(*dst);
+ }
+ Inst::MovRR { src, dst, .. } | Inst::XmmToGpr { src, dst, .. } => {
+ collector.add_use(*src);
+ collector.add_def(*dst);
+ }
+ Inst::GprToXmm { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::CvtUint64ToFloatSeq {
+ src,
+ dst,
+ tmp_gpr1,
+ tmp_gpr2,
+ ..
+ } => {
+ collector.add_mod(*src);
+ collector.add_def(*dst);
+ collector.add_def(*tmp_gpr1);
+ collector.add_def(*tmp_gpr2);
+ }
+ Inst::CvtFloatToSintSeq {
+ src,
+ dst,
+ tmp_xmm,
+ tmp_gpr,
+ ..
+ }
+ | Inst::CvtFloatToUintSeq {
+ src,
+ dst,
+ tmp_gpr,
+ tmp_xmm,
+ ..
+ } => {
+ collector.add_mod(*src);
+ collector.add_def(*dst);
+ collector.add_def(*tmp_gpr);
+ collector.add_def(*tmp_xmm);
+ }
+ Inst::MovzxRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst)
+ }
+ Inst::MovsxRmR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_def(*dst);
+ }
+ Inst::MovRM { src, dst, .. } => {
+ collector.add_use(*src);
+ dst.get_regs_as_uses(collector);
+ }
+ Inst::ShiftR { num_bits, dst, .. } => {
+ if num_bits.is_none() {
+ collector.add_use(regs::rcx());
+ }
+ collector.add_mod(*dst);
+ }
+ Inst::CmpRmiR { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_use(*dst); // yes, really `add_use`
+ }
+ Inst::Setcc { dst, .. } => {
+ collector.add_def(*dst);
+ }
+ Inst::Cmove { src, dst, .. } | Inst::XmmCmove { src, dst, .. } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(*dst);
+ }
+ Inst::Push64 { src } => {
+ src.get_regs_as_uses(collector);
+ collector.add_mod(Writable::from_reg(regs::rsp()));
+ }
+ Inst::Pop64 { dst } => {
+ collector.add_def(*dst);
+ }
+
+ Inst::CallKnown {
+ ref uses, ref defs, ..
+ } => {
+ collector.add_uses(uses);
+ collector.add_defs(defs);
+ }
+
+ Inst::CallUnknown {
+ ref uses,
+ ref defs,
+ dest,
+ ..
+ } => {
+ collector.add_uses(uses);
+ collector.add_defs(defs);
+ dest.get_regs_as_uses(collector);
+ }
+
+ Inst::JmpTableSeq {
+ ref idx,
+ ref tmp1,
+ ref tmp2,
+ ..
+ } => {
+ collector.add_use(*idx);
+ collector.add_def(*tmp1);
+ collector.add_def(*tmp2);
+ }
+
+ Inst::JmpUnknown { target } => {
+ target.get_regs_as_uses(collector);
+ }
+
+ Inst::LoadExtName { dst, .. } => {
+ collector.add_def(*dst);
+ }
+
+ Inst::LockCmpxchg { src, dst, .. } => {
+ dst.get_regs_as_uses(collector);
+ collector.add_use(*src);
+ collector.add_mod(Writable::from_reg(regs::rax()));
+ }
+
+ Inst::AtomicRmwSeq { .. } => {
+ collector.add_use(regs::r9());
+ collector.add_use(regs::r10());
+ collector.add_def(Writable::from_reg(regs::r11()));
+ collector.add_def(Writable::from_reg(regs::rax()));
+ }
+
+ Inst::Ret
+ | Inst::EpiloguePlaceholder
+ | Inst::JmpKnown { .. }
+ | Inst::JmpIf { .. }
+ | Inst::JmpCond { .. }
+ | Inst::Nop { .. }
+ | Inst::TrapIf { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::Hlt
+ | Inst::Ud2 { .. }
+ | Inst::Fence { .. } => {
+ // No registers are used.
+ }
+ }
+}
+
+//=============================================================================
+// Instructions and subcomponents: map_regs
+
+fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
+ if let Some(reg) = r.as_virtual_reg() {
+ let new = m.get_use(reg).unwrap().to_reg();
+ *r = new;
+ }
+}
+
+fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if let Some(reg) = r.to_reg().as_virtual_reg() {
+ let new = m.get_def(reg).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+}
+
+fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
+ if let Some(reg) = r.to_reg().as_virtual_reg() {
+ let new = m.get_mod(reg).unwrap().to_reg();
+ *r = Writable::from_reg(new);
+ }
+}
+
+impl Amode {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ Amode::ImmReg { ref mut base, .. } => map_use(map, base),
+ Amode::ImmRegRegShift {
+ ref mut base,
+ ref mut index,
+ ..
+ } => {
+ map_use(map, base);
+ map_use(map, index);
+ }
+ Amode::RipRelative { .. } => {
+ // RIP isn't involved in regalloc.
+ }
+ }
+ }
+}
+
+impl RegMemImm {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ RegMemImm::Reg { ref mut reg } => map_use(map, reg),
+ RegMemImm::Mem { ref mut addr } => addr.map_uses(map),
+ RegMemImm::Imm { .. } => {}
+ }
+ }
+
+ fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ match self {
+ Self::Reg { reg } => {
+ let mut writable_src = Writable::from_reg(*reg);
+ map_def(mapper, &mut writable_src);
+ *self = Self::reg(writable_src.to_reg());
+ }
+ _ => panic!("unexpected RegMemImm kind in map_src_reg_as_def"),
+ }
+ }
+}
+
+impl RegMem {
+ fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
+ match self {
+ RegMem::Reg { ref mut reg } => map_use(map, reg),
+ RegMem::Mem { ref mut addr, .. } => addr.map_uses(map),
+ }
+ }
+
+ fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ match self {
+ Self::Reg { reg } => {
+ let mut writable_src = Writable::from_reg(*reg);
+ map_def(mapper, &mut writable_src);
+ *self = Self::reg(writable_src.to_reg());
+ }
+ _ => panic!("unexpected RegMem kind in map_src_reg_as_def"),
+ }
+ }
+}
+
+fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
+ // Note this must be carefully synchronized with x64_get_regs.
+ let produces_const = inst.produces_const();
+
+ match inst {
+ // ** Nop
+ Inst::AluRmiR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::Not { src, .. } | Inst::Neg { src, .. } => map_mod(mapper, src),
+ Inst::Div { divisor, .. } => divisor.map_uses(mapper),
+ Inst::MulHi { rhs, .. } => rhs.map_uses(mapper),
+ Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => {
+ map_mod(mapper, divisor);
+ if let Some(tmp) = tmp {
+ map_def(mapper, tmp)
+ }
+ }
+ Inst::SignExtendData { .. } => {}
+ Inst::XmmUnaryRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::UnaryRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::XmmRmRImm {
+ ref op,
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else if *op == SseOpcode::Pextrb
+ || *op == SseOpcode::Pextrw
+ || *op == SseOpcode::Pextrd
+ || *op == SseOpcode::Pshufd
+ {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::XmmRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ if produces_const {
+ src.map_as_def(mapper);
+ map_def(mapper, dst);
+ } else {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ }
+ Inst::XmmRmiReg {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_mod(mapper, dst);
+ }
+ Inst::XmmUninitializedValue { ref mut dst, .. } => {
+ map_def(mapper, dst);
+ }
+ Inst::XmmLoadConst { ref mut dst, .. } => {
+ map_def(mapper, dst);
+ }
+ Inst::XmmMinMaxSeq {
+ ref mut lhs,
+ ref mut rhs_dst,
+ ..
+ } => {
+ map_use(mapper, lhs);
+ map_mod(mapper, rhs_dst);
+ }
+ Inst::XmmMovRM {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+ Inst::XmmCmpRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_use(mapper, dst);
+ }
+ Inst::Imm { ref mut dst, .. } => map_def(mapper, dst),
+ Inst::MovRR {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::XmmToGpr {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ map_def(mapper, dst);
+ }
+ Inst::GprToXmm {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::CvtUint64ToFloatSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_gpr1,
+ ref mut tmp_gpr2,
+ ..
+ } => {
+ map_mod(mapper, src);
+ map_def(mapper, dst);
+ map_def(mapper, tmp_gpr1);
+ map_def(mapper, tmp_gpr2);
+ }
+ Inst::CvtFloatToSintSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_xmm,
+ ref mut tmp_gpr,
+ ..
+ }
+ | Inst::CvtFloatToUintSeq {
+ ref mut src,
+ ref mut dst,
+ ref mut tmp_gpr,
+ ref mut tmp_xmm,
+ ..
+ } => {
+ map_mod(mapper, src);
+ map_def(mapper, dst);
+ map_def(mapper, tmp_gpr);
+ map_def(mapper, tmp_xmm);
+ }
+ Inst::MovzxRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::MovsxRmR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_def(mapper, dst);
+ }
+ Inst::MovRM {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+ Inst::ShiftR { ref mut dst, .. } => {
+ map_mod(mapper, dst);
+ }
+ Inst::CmpRmiR {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_use(mapper, dst);
+ }
+ Inst::Setcc { ref mut dst, .. } => map_def(mapper, dst),
+ Inst::Cmove {
+ ref mut src,
+ ref mut dst,
+ ..
+ }
+ | Inst::XmmCmove {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ src.map_uses(mapper);
+ map_mod(mapper, dst)
+ }
+ Inst::Push64 { ref mut src } => src.map_uses(mapper),
+ Inst::Pop64 { ref mut dst } => {
+ map_def(mapper, dst);
+ }
+
+ Inst::CallKnown {
+ ref mut uses,
+ ref mut defs,
+ ..
+ } => {
+ for r in uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ }
+
+ Inst::CallUnknown {
+ ref mut uses,
+ ref mut defs,
+ ref mut dest,
+ ..
+ } => {
+ for r in uses.iter_mut() {
+ map_use(mapper, r);
+ }
+ for r in defs.iter_mut() {
+ map_def(mapper, r);
+ }
+ dest.map_uses(mapper);
+ }
+
+ Inst::JmpTableSeq {
+ ref mut idx,
+ ref mut tmp1,
+ ref mut tmp2,
+ ..
+ } => {
+ map_use(mapper, idx);
+ map_def(mapper, tmp1);
+ map_def(mapper, tmp2);
+ }
+
+ Inst::JmpUnknown { ref mut target } => target.map_uses(mapper),
+
+ Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst),
+
+ Inst::LockCmpxchg {
+ ref mut src,
+ ref mut dst,
+ ..
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+
+ Inst::Ret
+ | Inst::EpiloguePlaceholder
+ | Inst::JmpKnown { .. }
+ | Inst::JmpCond { .. }
+ | Inst::JmpIf { .. }
+ | Inst::Nop { .. }
+ | Inst::TrapIf { .. }
+ | Inst::VirtualSPOffsetAdj { .. }
+ | Inst::Ud2 { .. }
+ | Inst::Hlt
+ | Inst::AtomicRmwSeq { .. }
+ | Inst::Fence { .. } => {
+ // Instruction doesn't explicitly mention any regs, so it can't have any virtual
+ // regs that we'd need to remap. Hence no action required.
+ }
+ }
+}
+
+//=============================================================================
+// Instructions: misc functions and external interface
+
+impl MachInst for Inst {
+ fn get_regs(&self, collector: &mut RegUsageCollector) {
+ x64_get_regs(&self, collector)
+ }
+
+ fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) {
+ x64_map_regs(self, mapper);
+ }
+
+ fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
+ match self {
+ // Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes
+ // out the upper 32 bits of the destination. For example, we could
+ // conceivably use `movl %reg, %reg` to zero out the top 32 bits of
+ // %reg.
+ Self::MovRR {
+ is_64, src, dst, ..
+ } if *is_64 => Some((*dst, *src)),
+ // Note as well that MOVS[S|D] when used in the `XmmUnaryRmR` context are pure moves of
+ // scalar floating-point values (and annotate `dst` as `def`s to the register allocator)
+ // whereas the same operation in a packed context, e.g. `XMM_RM_R`, is used to merge a
+ // value into the lowest lane of a vector (not a move).
+ Self::XmmUnaryRmR { op, src, dst, .. }
+ if *op == SseOpcode::Movss
+ || *op == SseOpcode::Movsd
+ || *op == SseOpcode::Movaps
+ || *op == SseOpcode::Movapd
+ || *op == SseOpcode::Movups
+ || *op == SseOpcode::Movupd
+ || *op == SseOpcode::Movdqa
+ || *op == SseOpcode::Movdqu =>
+ {
+ if let RegMem::Reg { reg } = src {
+ Some((*dst, *reg))
+ } else {
+ None
+ }
+ }
+ _ => None,
+ }
+ }
+
+ fn is_epilogue_placeholder(&self) -> bool {
+ if let Self::EpiloguePlaceholder = self {
+ true
+ } else {
+ false
+ }
+ }
+
+ fn is_term<'a>(&'a self) -> MachTerminator<'a> {
+ match self {
+ // Interesting cases.
+ &Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
+ &Self::JmpKnown { dst } => MachTerminator::Uncond(dst),
+ &Self::JmpCond {
+ taken, not_taken, ..
+ } => MachTerminator::Cond(taken, not_taken),
+ &Self::JmpTableSeq {
+ ref targets_for_term,
+ ..
+ } => MachTerminator::Indirect(&targets_for_term[..]),
+ // All other cases are boring.
+ _ => MachTerminator::None,
+ }
+ }
+
+ fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
+ let rc_dst = dst_reg.to_reg().get_class();
+ let rc_src = src_reg.get_class();
+ // If this isn't true, we have gone way off the rails.
+ debug_assert!(rc_dst == rc_src);
+ match rc_dst {
+ RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg),
+ RegClass::V128 => {
+ // The Intel optimization manual, in "3.5.1.13 Zero-Latency MOV Instructions",
+ // doesn't include MOVSS/MOVSD as instructions with zero-latency. Use movaps for
+                // those, which may write more lanes than we need, but are specified to have
+ // zero-latency.
+ let opcode = match ty {
+ types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps,
+ types::F64X2 => SseOpcode::Movapd,
+ _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa,
+ _ => unimplemented!("unable to move type: {}", ty),
+ };
+ Inst::xmm_unary_rm_r(opcode, RegMem::reg(src_reg), dst_reg)
+ }
+ _ => panic!("gen_move(x64): unhandled regclass {:?}", rc_dst),
+ }
+ }
+
+ fn gen_zero_len_nop() -> Inst {
+ Inst::Nop { len: 0 }
+ }
+
+ fn gen_nop(preferred_size: usize) -> Inst {
+ Inst::nop((preferred_size % 16) as u8)
+ }
+
+ fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
+ None
+ }
+
+ fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
+ match ty {
+ types::I8
+ | types::I16
+ | types::I32
+ | types::I64
+ | types::B1
+ | types::B8
+ | types::B16
+ | types::B32
+ | types::B64
+ | types::R32
+ | types::R64 => Ok(RegClass::I64),
+ types::F32 | types::F64 => Ok(RegClass::V128),
+ _ if ty.bits() == 128 => Ok(RegClass::V128),
+ types::IFLAGS | types::FFLAGS => Ok(RegClass::I64),
+ _ => Err(CodegenError::Unsupported(format!(
+ "Unexpected SSA-value type: {}",
+ ty
+ ))),
+ }
+ }
+
+ fn gen_jump(label: MachLabel) -> Inst {
+ Inst::jmp_known(label)
+ }
+
+ fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+ to_reg: Writable<Reg>,
+ value: u64,
+ ty: Type,
+ mut alloc_tmp: F,
+ ) -> SmallVec<[Self; 4]> {
+ let mut ret = SmallVec::new();
+ if ty == types::F32 {
+ if value == 0 {
+ ret.push(Inst::xmm_rm_r(
+ SseOpcode::Xorps,
+ RegMem::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ let tmp = alloc_tmp(RegClass::I64, types::I32);
+ ret.push(Inst::imm(OperandSize::Size32, value, tmp));
+
+ ret.push(Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ RegMem::reg(tmp.to_reg()),
+ OperandSize::Size32,
+ to_reg,
+ ));
+ }
+ } else if ty == types::F64 {
+ if value == 0 {
+ ret.push(Inst::xmm_rm_r(
+ SseOpcode::Xorpd,
+ RegMem::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ let tmp = alloc_tmp(RegClass::I64, types::I64);
+ ret.push(Inst::imm(OperandSize::Size64, value, tmp));
+
+ ret.push(Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ RegMem::reg(tmp.to_reg()),
+ OperandSize::Size64,
+ to_reg,
+ ));
+ }
+ } else {
+ // Must be an integer type.
+ debug_assert!(
+ ty == types::B1
+ || ty == types::I8
+ || ty == types::B8
+ || ty == types::I16
+ || ty == types::B16
+ || ty == types::I32
+ || ty == types::B32
+ || ty == types::I64
+ || ty == types::B64
+ || ty == types::R32
+ || ty == types::R64
+ );
+ if value == 0 {
+ ret.push(Inst::alu_rmi_r(
+ ty == types::I64,
+ AluRmiROpcode::Xor,
+ RegMemImm::reg(to_reg.to_reg()),
+ to_reg,
+ ));
+ } else {
+ ret.push(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ value.into(),
+ to_reg,
+ ));
+ }
+ }
+ ret
+ }
+
+ fn reg_universe(flags: &Flags) -> RealRegUniverse {
+ create_reg_universe_systemv(flags)
+ }
+
+ fn worst_case_size() -> CodeOffset {
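+        // 15 bytes is the architectural maximum length of a single x86-64 instruction.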
+ 15
+ }
+
+ fn ref_type_regclass(_: &settings::Flags) -> RegClass {
+ RegClass::I64
+ }
+
+ type LabelUse = LabelUse;
+}
+
+/// State carried between emissions of a sequence of instructions.
+#[derive(Default, Clone, Debug)]
+pub struct EmitState {
+ /// Addend to convert nominal-SP offsets to real-SP offsets at the current
+ /// program point.
+ pub(crate) virtual_sp_offset: i64,
+ /// Offset of FP from nominal-SP.
+ pub(crate) nominal_sp_to_fp: i64,
+ /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
+ stack_map: Option<StackMap>,
+ /// Current source location.
+ cur_srcloc: SourceLoc,
+}
+
+/// Constant state used during emissions of a sequence of instructions.
+pub struct EmitInfo {
+ flags: settings::Flags,
+ isa_flags: x64_settings::Flags,
+}
+
+impl EmitInfo {
+ pub(crate) fn new(flags: settings::Flags, isa_flags: x64_settings::Flags) -> Self {
+ Self { flags, isa_flags }
+ }
+}
+
+impl MachInstEmitInfo for EmitInfo {
+ fn flags(&self) -> &Flags {
+ &self.flags
+ }
+}
+
+impl MachInstEmit for Inst {
+ type State = EmitState;
+ type Info = EmitInfo;
+ type UnwindInfo = unwind::X64UnwindInfo;
+
+ fn emit(&self, sink: &mut MachBuffer<Inst>, info: &Self::Info, state: &mut Self::State) {
+ emit::emit(self, sink, info, state);
+ }
+
+ fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, _: &mut Self::State) -> String {
+ self.show_rru(mb_rru)
+ }
+}
+
+impl MachInstEmitState<Inst> for EmitState {
+ fn new(abi: &dyn ABICallee<I = Inst>) -> Self {
+ EmitState {
+ virtual_sp_offset: 0,
+ nominal_sp_to_fp: abi.frame_size() as i64,
+ stack_map: None,
+ cur_srcloc: SourceLoc::default(),
+ }
+ }
+
+ fn pre_safepoint(&mut self, stack_map: StackMap) {
+ self.stack_map = Some(stack_map);
+ }
+
+ fn pre_sourceloc(&mut self, srcloc: SourceLoc) {
+ self.cur_srcloc = srcloc;
+ }
+}
+
+impl EmitState {
+ fn take_stack_map(&mut self) -> Option<StackMap> {
+ self.stack_map.take()
+ }
+
+ fn clear_post_insn(&mut self) {
+ self.stack_map = None;
+ }
+
+ fn cur_srcloc(&self) -> SourceLoc {
+ self.cur_srcloc
+ }
+}
+
+/// A label-use (internal relocation) in generated code.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LabelUse {
+    /// A 32-bit offset from the location of the relocation itself, added to the existing value
+    /// at that location. Used for control-flow instructions, which interpret the offset as
+    /// relative to the start of the next instruction (so the size of the payload -- 4 bytes --
+    /// is subtracted from the patched-in value).
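+    /// For example (assuming a zero addend already in the buffer), a jump whose rel32 field
+    /// starts at offset 0x10 and whose target label lands at offset 0x40 gets 0x40 - 0x10 - 4 =
+    /// 0x2c patched in.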
+ JmpRel32,
+
+ /// A 32-bit offset from location of relocation itself, added to the existing value at that
+ /// location.
+ PCRel32,
+}
+
+impl MachInstLabelUse for LabelUse {
+ const ALIGN: CodeOffset = 1;
+
+ fn max_pos_range(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x7fff_ffff,
+ }
+ }
+
+ fn max_neg_range(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x8000_0000,
+ }
+ }
+
+ fn patch_size(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 4,
+ }
+ }
+
+ fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
+ let pc_rel = (label_offset as i64) - (use_offset as i64);
+ debug_assert!(pc_rel <= self.max_pos_range() as i64);
+ debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
+ let pc_rel = pc_rel as u32;
+ match self {
+ LabelUse::JmpRel32 => {
+ let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+ let value = pc_rel.wrapping_add(addend).wrapping_sub(4);
+ buffer.copy_from_slice(&value.to_le_bytes()[..]);
+ }
+ LabelUse::PCRel32 => {
+ let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+ let value = pc_rel.wrapping_add(addend);
+ buffer.copy_from_slice(&value.to_le_bytes()[..]);
+ }
+ }
+ }
+
+ fn supports_veneer(self) -> bool {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => false,
+ }
+ }
+
+ fn veneer_size(self) -> CodeOffset {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => 0,
+ }
+ }
+
+ fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
+ match self {
+ LabelUse::JmpRel32 | LabelUse::PCRel32 => {
+ panic!("Veneer not supported for JumpRel32 label-use.");
+ }
+ }
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs
new file mode 100644
index 0000000000..04bc1f09bf
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs
@@ -0,0 +1,289 @@
+//! Registers, the Universe thereof, and printing.
+//!
+//! These are ordered by sequence number, as required in the Universe. The strange ordering is
+//! intended to make callee-save registers available before caller-saved ones. This is a net win
+//! provided that each function makes at least one onward call. It'll be a net loss for leaf
+//! functions, and we should change the ordering in that case, so as to make caller-save regs
+//! available first.
+//!
+//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions?
+//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe
+//! for each function we compile.
+
+use crate::settings;
+use alloc::vec::Vec;
+use regalloc::{
+ PrettyPrint, RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES,
+};
+use std::string::String;
+
+// Hardware encodings for a few registers.
+
+pub const ENC_RBX: u8 = 3;
+pub const ENC_RSP: u8 = 4;
+pub const ENC_RBP: u8 = 5;
+pub const ENC_R12: u8 = 12;
+pub const ENC_R13: u8 = 13;
+pub const ENC_R14: u8 = 14;
+pub const ENC_R15: u8 = 15;
+
+fn gpr(enc: u8, index: u8) -> Reg {
+ Reg::new_real(RegClass::I64, enc, index)
+}
+
+pub(crate) fn r12() -> Reg {
+ gpr(ENC_R12, 16)
+}
+pub(crate) fn r13() -> Reg {
+ gpr(ENC_R13, 17)
+}
+pub(crate) fn r14() -> Reg {
+ gpr(ENC_R14, 18)
+}
+pub(crate) fn rbx() -> Reg {
+ gpr(ENC_RBX, 19)
+}
+pub(crate) fn rsi() -> Reg {
+ gpr(6, 20)
+}
+pub(crate) fn rdi() -> Reg {
+ gpr(7, 21)
+}
+pub(crate) fn rax() -> Reg {
+ gpr(0, 22)
+}
+pub(crate) fn rcx() -> Reg {
+ gpr(1, 23)
+}
+pub(crate) fn rdx() -> Reg {
+ gpr(2, 24)
+}
+pub(crate) fn r8() -> Reg {
+ gpr(8, 25)
+}
+pub(crate) fn r9() -> Reg {
+ gpr(9, 26)
+}
+pub(crate) fn r10() -> Reg {
+ gpr(10, 27)
+}
+pub(crate) fn r11() -> Reg {
+ gpr(11, 28)
+}
+
+pub(crate) fn r15() -> Reg {
+ // r15 is put aside since this is the pinned register.
+ gpr(ENC_R15, 29)
+}
+
+/// The pinned register on this architecture.
+/// It must be the same as SpiderMonkey's HeapReg, as defined in this file:
+/// https://searchfox.org/mozilla-central/source/js/src/jit/x64/Assembler-x64.h#99
+pub(crate) fn pinned_reg() -> Reg {
+ r15()
+}
+
+fn fpr(enc: u8, index: u8) -> Reg {
+ Reg::new_real(RegClass::V128, enc, index)
+}
+
+pub(crate) fn xmm0() -> Reg {
+ fpr(0, 0)
+}
+pub(crate) fn xmm1() -> Reg {
+ fpr(1, 1)
+}
+pub(crate) fn xmm2() -> Reg {
+ fpr(2, 2)
+}
+pub(crate) fn xmm3() -> Reg {
+ fpr(3, 3)
+}
+pub(crate) fn xmm4() -> Reg {
+ fpr(4, 4)
+}
+pub(crate) fn xmm5() -> Reg {
+ fpr(5, 5)
+}
+pub(crate) fn xmm6() -> Reg {
+ fpr(6, 6)
+}
+pub(crate) fn xmm7() -> Reg {
+ fpr(7, 7)
+}
+pub(crate) fn xmm8() -> Reg {
+ fpr(8, 8)
+}
+pub(crate) fn xmm9() -> Reg {
+ fpr(9, 9)
+}
+pub(crate) fn xmm10() -> Reg {
+ fpr(10, 10)
+}
+pub(crate) fn xmm11() -> Reg {
+ fpr(11, 11)
+}
+pub(crate) fn xmm12() -> Reg {
+ fpr(12, 12)
+}
+pub(crate) fn xmm13() -> Reg {
+ fpr(13, 13)
+}
+pub(crate) fn xmm14() -> Reg {
+ fpr(14, 14)
+}
+pub(crate) fn xmm15() -> Reg {
+ fpr(15, 15)
+}
+
+pub(crate) fn rsp() -> Reg {
+ gpr(ENC_RSP, 30)
+}
+pub(crate) fn rbp() -> Reg {
+ gpr(ENC_RBP, 31)
+}
+
+/// Create the register universe for X64.
+///
+/// The ordering of registers matters, as commented in the file doc comment: assumes the
+/// calling-convention is SystemV, at the moment.
+pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse {
+ let mut regs = Vec::<(RealReg, String)>::new();
+ let mut allocable_by_class = [None; NUM_REG_CLASSES];
+
+ let use_pinned_reg = flags.enable_pinned_reg();
+
+ // XMM registers
+ let first_fpr = regs.len();
+ regs.push((xmm0().to_real_reg(), "%xmm0".into()));
+ regs.push((xmm1().to_real_reg(), "%xmm1".into()));
+ regs.push((xmm2().to_real_reg(), "%xmm2".into()));
+ regs.push((xmm3().to_real_reg(), "%xmm3".into()));
+ regs.push((xmm4().to_real_reg(), "%xmm4".into()));
+ regs.push((xmm5().to_real_reg(), "%xmm5".into()));
+ regs.push((xmm6().to_real_reg(), "%xmm6".into()));
+ regs.push((xmm7().to_real_reg(), "%xmm7".into()));
+ regs.push((xmm8().to_real_reg(), "%xmm8".into()));
+ regs.push((xmm9().to_real_reg(), "%xmm9".into()));
+ regs.push((xmm10().to_real_reg(), "%xmm10".into()));
+ regs.push((xmm11().to_real_reg(), "%xmm11".into()));
+ regs.push((xmm12().to_real_reg(), "%xmm12".into()));
+ regs.push((xmm13().to_real_reg(), "%xmm13".into()));
+ regs.push((xmm14().to_real_reg(), "%xmm14".into()));
+ regs.push((xmm15().to_real_reg(), "%xmm15".into()));
+ let last_fpr = regs.len() - 1;
+
+ // Integer regs.
+ let first_gpr = regs.len();
+
+ // Callee-saved, in the SystemV x86_64 ABI.
+ regs.push((r12().to_real_reg(), "%r12".into()));
+ regs.push((r13().to_real_reg(), "%r13".into()));
+ regs.push((r14().to_real_reg(), "%r14".into()));
+
+ regs.push((rbx().to_real_reg(), "%rbx".into()));
+
+ // Caller-saved, in the SystemV x86_64 ABI.
+ regs.push((rsi().to_real_reg(), "%rsi".into()));
+ regs.push((rdi().to_real_reg(), "%rdi".into()));
+ regs.push((rax().to_real_reg(), "%rax".into()));
+ regs.push((rcx().to_real_reg(), "%rcx".into()));
+ regs.push((rdx().to_real_reg(), "%rdx".into()));
+ regs.push((r8().to_real_reg(), "%r8".into()));
+ regs.push((r9().to_real_reg(), "%r9".into()));
+ regs.push((r10().to_real_reg(), "%r10".into()));
+ regs.push((r11().to_real_reg(), "%r11".into()));
+
+ // Other regs, not available to the allocator.
+ debug_assert_eq!(r15(), pinned_reg());
+ let allocable = if use_pinned_reg {
+ // The pinned register is not allocatable in this case, so record the length before adding
+ // it.
+ let len = regs.len();
+ regs.push((r15().to_real_reg(), "%r15/pinned".into()));
+ len
+ } else {
+ regs.push((r15().to_real_reg(), "%r15".into()));
+ regs.len()
+ };
+ let last_gpr = allocable - 1;
+
+ regs.push((rsp().to_real_reg(), "%rsp".into()));
+ regs.push((rbp().to_real_reg(), "%rbp".into()));
+
+ allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
+ first: first_gpr,
+ last: last_gpr,
+ suggested_scratch: Some(r12().get_index()),
+ });
+ allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
+ first: first_fpr,
+ last: last_fpr,
+ suggested_scratch: Some(xmm15().get_index()),
+ });
+
+ // Sanity-check: the index passed to the Reg ctor must match the order in the register list.
+ for (i, reg) in regs.iter().enumerate() {
+ assert_eq!(i, reg.0.get_index());
+ }
+
+ RealRegUniverse {
+ regs,
+ allocable,
+ allocable_by_class,
+ }
+}
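+
+// A small, illustrative check of the ordering described in the file doc comment: callee-saved
+// GPRs are handed out before caller-saved ones, and %rsp/%rbp are listed but never allocatable.
+#[cfg(test)]
+#[test]
+fn reg_universe_ordering_example() {
+ let flags = settings::Flags::new(settings::builder());
+ let universe = create_reg_universe_systemv(&flags);
+ // Callee-saved %r12 comes before caller-saved %rax in the universe ordering.
+ assert!(r12().get_index() < rax().get_index());
+ // %rsp and %rbp sit past the allocatable range.
+ assert!(rsp().get_index() >= universe.allocable);
+ assert!(rbp().get_index() >= universe.allocable);
+}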
+
+/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show its name at some
+/// smaller size (4, 2 or 1 bytes).
+pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
+ let mut s = reg.show_rru(mb_rru);
+
+ if reg.get_class() != RegClass::I64 || size == 8 {
+ // We can't do any better.
+ return s;
+ }
+
+ if reg.is_real() {
+ // Change (e.g.) "rax" into "eax", "ax" or "al" as appropriate. This is something one could
+ // describe diplomatically as "a kludge", but it's only debug code.
+ let remapper = match s.as_str() {
+ "%rax" => Some(["%eax", "%ax", "%al"]),
+ "%rbx" => Some(["%ebx", "%bx", "%bl"]),
+ "%rcx" => Some(["%ecx", "%cx", "%cl"]),
+ "%rdx" => Some(["%edx", "%dx", "%dl"]),
+ "%rsi" => Some(["%esi", "%si", "%sil"]),
+ "%rdi" => Some(["%edi", "%di", "%dil"]),
+ "%rbp" => Some(["%ebp", "%bp", "%bpl"]),
+ "%rsp" => Some(["%esp", "%sp", "%spl"]),
+ "%r8" => Some(["%r8d", "%r8w", "%r8b"]),
+ "%r9" => Some(["%r9d", "%r9w", "%r9b"]),
+ "%r10" => Some(["%r10d", "%r10w", "%r10b"]),
+ "%r11" => Some(["%r11d", "%r11w", "%r11b"]),
+ "%r12" => Some(["%r12d", "%r12w", "%r12b"]),
+ "%r13" => Some(["%r13d", "%r13w", "%r13b"]),
+ "%r14" => Some(["%r14d", "%r14w", "%r14b"]),
+ "%r15" => Some(["%r15d", "%r15w", "%r15b"]),
+ _ => None,
+ };
+ if let Some(smaller_names) = remapper {
+ match size {
+ 4 => s = smaller_names[0].into(),
+ 2 => s = smaller_names[1].into(),
+ 1 => s = smaller_names[2].into(),
+ _ => panic!("show_ireg_sized: real"),
+ }
+ }
+ } else {
+ // Add an "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths.
+ let suffix = match size {
+ 4 => "l",
+ 2 => "w",
+ 1 => "b",
+ _ => panic!("show_ireg_sized: virtual"),
+ };
+ s = s + suffix;
+ }
+
+ s
+}
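+
+// For example (illustrative only, assuming the SystemV universe built above is used for
+// printing): %rax shown at narrower widths becomes %eax, %ax and %al respectively.
+#[cfg(test)]
+#[test]
+fn show_ireg_sized_example() {
+ let flags = settings::Flags::new(settings::builder());
+ let universe = create_reg_universe_systemv(&flags);
+ assert_eq!(show_ireg_sized(rax(), Some(&universe), 8), "%rax");
+ assert_eq!(show_ireg_sized(rax(), Some(&universe), 4), "%eax");
+ assert_eq!(show_ireg_sized(rax(), Some(&universe), 2), "%ax");
+ assert_eq!(show_ireg_sized(rax(), Some(&universe), 1), "%al");
+}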
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs
new file mode 100644
index 0000000000..ffe43930f0
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs
@@ -0,0 +1,125 @@
+use crate::isa::unwind::input::UnwindInfo;
+use crate::isa::x64::inst::{
+ args::{AluRmiROpcode, Amode, RegMemImm, SyntheticAmode},
+ regs, Inst,
+};
+use crate::machinst::{UnwindInfoContext, UnwindInfoGenerator};
+use crate::result::CodegenResult;
+use alloc::vec::Vec;
+use regalloc::Reg;
+
+#[cfg(feature = "unwind")]
+pub(crate) mod systemv;
+
+pub struct X64UnwindInfo;
+
+impl UnwindInfoGenerator<Inst> for X64UnwindInfo {
+ fn create_unwind_info(
+ context: UnwindInfoContext<Inst>,
+ ) -> CodegenResult<Option<UnwindInfo<Reg>>> {
+ use crate::isa::unwind::input::{self, UnwindCode};
+ let mut codes = Vec::new();
+ const WORD_SIZE: u8 = 8;
+
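+ // The prologue scan below recognizes the canonical frame-setup shapes and maps each to a
+ // generic unwind code: `push <reg>` becomes a stack allocation plus a register save,
+ // `mov %rsp, <reg>` sets the frame pointer, `sub/add $imm, %rsp` allocates/deallocates
+ // stack, and `mov <reg>, imm(%rsp)` records a clobber save at that offset.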
+ for i in context.prologue.clone() {
+ let i = i as usize;
+ let inst = &context.insts[i];
+ let offset = context.insts_layout[i];
+
+ match inst {
+ Inst::Push64 {
+ src: RegMemImm::Reg { reg },
+ } => {
+ codes.push((
+ offset,
+ UnwindCode::StackAlloc {
+ size: WORD_SIZE.into(),
+ },
+ ));
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *reg,
+ stack_offset: 0,
+ },
+ ));
+ }
+ Inst::MovRR { src, dst, .. } => {
+ if *src == regs::rsp() {
+ codes.push((offset, UnwindCode::SetFramePointer { reg: dst.to_reg() }));
+ }
+ }
+ Inst::AluRmiR {
+ is_64: true,
+ op: AluRmiROpcode::Sub,
+ src: RegMemImm::Imm { simm32 },
+ dst,
+ ..
+ } if dst.to_reg() == regs::rsp() => {
+ let imm = *simm32;
+ codes.push((offset, UnwindCode::StackAlloc { size: imm }));
+ }
+ Inst::MovRM {
+ src,
+ dst: SyntheticAmode::Real(Amode::ImmReg { simm32, base, .. }),
+ ..
+ } if *base == regs::rsp() => {
+ // `mov reg, imm(rsp)`
+ let imm = *simm32;
+ codes.push((
+ offset,
+ UnwindCode::SaveRegister {
+ reg: *src,
+ stack_offset: imm,
+ },
+ ));
+ }
+ Inst::AluRmiR {
+ is_64: true,
+ op: AluRmiROpcode::Add,
+ src: RegMemImm::Imm { simm32 },
+ dst,
+ ..
+ } if dst.to_reg() == regs::rsp() => {
+ let imm = *simm32;
+ codes.push((offset, UnwindCode::StackDealloc { size: imm }));
+ }
+ _ => {}
+ }
+ }
+
+ let last_epilogue_end = context.len;
+ let epilogues_unwind_codes = context
+ .epilogues
+ .iter()
+ .map(|epilogue| {
+ // TODO add logic to process epilogue instructions instead of
+ // returning an empty array.
+ let end = epilogue.end as usize - 1;
+ let end_offset = context.insts_layout[end];
+ if end_offset == last_epilogue_end {
+ // Do not remember/restore for the very last epilogue.
+ return vec![];
+ }
+
+ let start = epilogue.start as usize;
+ let offset = context.insts_layout[start];
+ vec![
+ (offset, UnwindCode::RememberState),
+ // TODO epilogue instructions
+ (end_offset, UnwindCode::RestoreState),
+ ]
+ })
+ .collect();
+
+ let prologue_size = context.insts_layout[context.prologue.end as usize];
+ Ok(Some(input::UnwindInfo {
+ prologue_size,
+ prologue_unwind_codes: codes,
+ epilogues_unwind_codes,
+ function_size: context.len,
+ word_size: WORD_SIZE,
+ initial_sp_offset: WORD_SIZE,
+ }))
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs
new file mode 100644
index 0000000000..68473a8afb
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs
@@ -0,0 +1,204 @@
+//! Unwind information for System V ABI (x86-64).
+
+use crate::isa::unwind::input;
+use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo};
+use crate::result::CodegenResult;
+use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64};
+use regalloc::{Reg, RegClass};
+
+/// Creates a new x86-64 common information entry (CIE).
+pub fn create_cie() -> CommonInformationEntry {
+ use gimli::write::CallFrameInstruction;
+
+ let mut entry = CommonInformationEntry::new(
+ Encoding {
+ address_size: 8,
+ format: Format::Dwarf32,
+ version: 1,
+ },
+ 1, // Code alignment factor
+ -8, // Data alignment factor
+ X86_64::RA,
+ );
+
+ // Every frame will start with the call frame address (CFA) at RSP+8
+ // It is +8 to account for the push of the return address by the call instruction
+ entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8));
+
+ // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP)
+ entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8));
+
+ entry
+}
+
+/// Map Cranelift registers to their corresponding Gimli registers.
+pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> {
+ // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow
+ const X86_GP_REG_MAP: [gimli::Register; 16] = [
+ X86_64::RAX,
+ X86_64::RCX,
+ X86_64::RDX,
+ X86_64::RBX,
+ X86_64::RSP,
+ X86_64::RBP,
+ X86_64::RSI,
+ X86_64::RDI,
+ X86_64::R8,
+ X86_64::R9,
+ X86_64::R10,
+ X86_64::R11,
+ X86_64::R12,
+ X86_64::R13,
+ X86_64::R14,
+ X86_64::R15,
+ ];
+ const X86_XMM_REG_MAP: [gimli::Register; 16] = [
+ X86_64::XMM0,
+ X86_64::XMM1,
+ X86_64::XMM2,
+ X86_64::XMM3,
+ X86_64::XMM4,
+ X86_64::XMM5,
+ X86_64::XMM6,
+ X86_64::XMM7,
+ X86_64::XMM8,
+ X86_64::XMM9,
+ X86_64::XMM10,
+ X86_64::XMM11,
+ X86_64::XMM12,
+ X86_64::XMM13,
+ X86_64::XMM14,
+ X86_64::XMM15,
+ ];
+
+ match reg.get_class() {
+ RegClass::I64 => {
+ // x86 GP registers have a weird mapping to DWARF registers, so we use a
+ // lookup table.
+ Ok(X86_GP_REG_MAP[reg.get_hw_encoding() as usize])
+ }
+ RegClass::V128 => Ok(X86_XMM_REG_MAP[reg.get_hw_encoding() as usize]),
+ _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")),
+ }
+}
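+
+// A small illustration of the "weird mapping": %rcx has hardware encoding 1 but DWARF register
+// number 2, so the table above cannot simply reuse hardware encodings.
+#[cfg(test)]
+#[test]
+fn map_reg_uses_dwarf_numbering() {
+ use crate::isa::x64::inst::regs;
+ assert_eq!(map_reg(regs::rcx()).ok(), Some(X86_64::RCX));
+ assert_eq!(X86_64::RCX.0, 2);
+}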
+
+pub(crate) fn create_unwind_info(
+ unwind: input::UnwindInfo<Reg>,
+) -> CodegenResult<Option<UnwindInfo>> {
+ struct RegisterMapper;
+ impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
+ fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
+ Ok(map_reg(reg)?.0)
+ }
+ fn sp(&self) -> u16 {
+ X86_64::RSP.0
+ }
+ }
+ let map = RegisterMapper;
+
+ Ok(Some(UnwindInfo::build(unwind, &map)?))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::cursor::{Cursor, FuncCursor};
+ use crate::ir::{
+ types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData,
+ StackSlotKind,
+ };
+ use crate::isa::{lookup, CallConv};
+ use crate::settings::{builder, Flags};
+ use crate::Context;
+ use gimli::write::Address;
+ use std::str::FromStr;
+ use target_lexicon::triple;
+
+ #[test]
+ fn test_simple_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_function(
+ CallConv::SystemV,
+ Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+ ));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(1234))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 13, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6)))] }");
+ }
+
+ fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+ let mut func =
+ Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+ let block0 = func.dfg.make_block();
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().return_(&[]);
+
+ if let Some(stack_slot) = stack_slot {
+ func.stack_slots.push(stack_slot);
+ }
+
+ func
+ }
+
+ #[test]
+ fn test_multi_return_func() {
+ let isa = lookup(triple!("x86_64"))
+ .expect("expect x86 ISA")
+ .finish(Flags::new(builder()));
+
+ let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV));
+
+ context.compile(&*isa).expect("expected compilation");
+
+ let fde = match context
+ .create_unwind_info(isa.as_ref())
+ .expect("can create unwind info")
+ {
+ Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+ info.to_fde(Address::Constant(4321))
+ }
+ _ => panic!("expected unwind information"),
+ };
+
+ assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 23, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (16, RememberState), (18, RestoreState)] }");
+ }
+
+ fn create_multi_return_function(call_conv: CallConv) -> Function {
+ let mut sig = Signature::new(call_conv);
+ sig.params.push(AbiParam::new(types::I32));
+ let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+ let block0 = func.dfg.make_block();
+ let v0 = func.dfg.append_block_param(block0, types::I32);
+ let block1 = func.dfg.make_block();
+ let block2 = func.dfg.make_block();
+
+ let mut pos = FuncCursor::new(&mut func);
+ pos.insert_block(block0);
+ pos.ins().brnz(v0, block2, &[]);
+ pos.ins().jump(block1, &[]);
+
+ pos.insert_block(block1);
+ pos.ins().return_(&[]);
+
+ pos.insert_block(block2);
+ pos.ins().return_(&[]);
+
+ func
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs
new file mode 100644
index 0000000000..0862154360
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs
@@ -0,0 +1,3771 @@
+//! Lowering rules for X64.
+
+use crate::data_value::DataValue;
+use crate::ir::{
+ condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
+ Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
+};
+use crate::isa::x64::abi::*;
+use crate::isa::x64::inst::args::*;
+use crate::isa::x64::inst::*;
+use crate::isa::{x64::X64Backend, CallConv};
+use crate::machinst::lower::*;
+use crate::machinst::*;
+use crate::result::CodegenResult;
+use crate::settings::Flags;
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use cranelift_codegen_shared::condcodes::CondCode;
+use log::trace;
+use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
+use std::convert::TryFrom;
+use target_lexicon::Triple;
+
+/// Context passed to all lowering functions.
+type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
+
+//=============================================================================
+// Helpers for instruction lowering.
+
+fn is_int_or_ref_ty(ty: Type) -> bool {
+ match ty {
+ types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
+ types::R32 => panic!("shouldn't have 32-bit refs on x64"),
+ _ => false,
+ }
+}
+
+fn is_bool_ty(ty: Type) -> bool {
+ match ty {
+ types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
+ types::R32 => panic!("shouldn't have 32-bit refs on x64"),
+ _ => false,
+ }
+}
+
+/// This is target-word-size dependent, and it excludes booleans and reftypes.
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
+ match ty {
+ types::I8 | types::I16 | types::I32 | types::I64 => true,
+ _ => false,
+ }
+}
+
+/// If the specified `input` is a result produced by an instruction with opcode `op`, returns that
+/// producing instruction; otherwise returns `None`.
+// TODO investigate failures with checking against the result index.
+fn matches_input<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ op: Opcode,
+) -> Option<IRInst> {
+ let inputs = ctx.get_input(input.insn, input.input);
+ inputs.inst.and_then(|(src_inst, _)| {
+ let data = ctx.data(src_inst);
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ None
+ })
+}
+
+/// If the specified `input` is a result produced by an instruction whose opcode is any of those
+/// specified in `ops`, returns that producing instruction; otherwise returns `None`.
+fn matches_input_any<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ input: InsnInput,
+ ops: &[Opcode],
+) -> Option<IRInst> {
+ let inputs = ctx.get_input(input.insn, input.input);
+ inputs.inst.and_then(|(src_inst, _)| {
+ let data = ctx.data(src_inst);
+ for &op in ops {
+ if data.opcode() == op {
+ return Some(src_inst);
+ }
+ }
+ None
+ })
+}
+
+fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
+ ctx.use_input_reg(input);
+ input.reg
+}
+
+/// Put the given input into a register, and mark it as used (side-effect).
+fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+ let input = ctx.get_input(spec.insn, spec.input);
+
+ if let Some(c) = input.constant {
+ // Generate constants fresh at each use to minimize long-range register pressure.
+ let ty = ctx.input_ty(spec.insn, spec.input);
+ let from_bits = ty_bits(ty);
+ let masked = if from_bits < 64 {
+ c & ((1u64 << from_bits) - 1)
+ } else {
+ c
+ };
+
+ let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+ for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ })
+ .into_iter()
+ {
+ ctx.emit(inst);
+ }
+ cst_copy.to_reg()
+ } else {
+ lowerinput_to_reg(ctx, input)
+ }
+}
+
+/// An extension specification for `extend_input_to_reg`.
+#[derive(Clone, Copy)]
+enum ExtSpec {
+ ZeroExtendTo32,
+ ZeroExtendTo64,
+ SignExtendTo32,
+ SignExtendTo64,
+}
+
+/// Put the given input into a register, marking it as used, and do a zero- or sign-extension if
+/// required. (This obviously causes side effects.)
+fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+ let requested_size = match ext_spec {
+ ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
+ ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
+ };
+ let input_size = ctx.input_ty(spec.insn, spec.input).bits();
+
+ let requested_ty = if requested_size == 32 {
+ types::I32
+ } else {
+ types::I64
+ };
+
+ let ext_mode = match (input_size, requested_size) {
+ (a, b) if a == b => return put_input_in_reg(ctx, spec),
+ (1, 8) => return put_input_in_reg(ctx, spec),
+ (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)),
+ };
+
+ let src = input_to_reg_mem(ctx, spec);
+ let dst = ctx.alloc_tmp(RegClass::I64, requested_ty);
+ match ext_spec {
+ ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
+ ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
+ }
+ ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
+ ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
+ }
+ }
+ dst.to_reg()
+}
+
+fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
+ // TODO handle memory.
+ RegMem::reg(lowerinput_to_reg(ctx, input))
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
+ let input = ctx.get_input(spec.insn, spec.input);
+ lowerinput_to_reg_mem(ctx, input)
+}
+
+/// Returns the given input as an immediate if it is one that can be properly sign-extended,
+/// without any possible side effect.
+fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
+ input.constant.and_then(|x| {
+ // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
+ // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
+ // constant.
+ if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
+ Some(x as u32)
+ } else {
+ None
+ }
+ })
+}
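+
+// For instance (an illustrative check, not upstream code): an all-ones value survives the 32-bit
+// sign-extension round-trip, while 0xffff_ffff does not, so only the former may be used as a
+// REX.W immediate.
+#[cfg(test)]
+#[test]
+fn sext_imm_example() {
+ assert!(low32_will_sign_extend_to_64(u64::max_value()));
+ assert!(!low32_will_sign_extend_to_64(0xffff_ffff));
+ assert!(low32_will_sign_extend_to_64(0x7fff_ffff));
+}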
+
+fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
+ let input = ctx.get_input(spec.insn, spec.input);
+ let input_ty = ctx.input_ty(spec.insn, spec.input);
+ lowerinput_to_sext_imm(input, input_ty)
+}
+
+fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+ ctx.get_input(spec.insn, spec.input).constant
+}
+
+/// Put the given input into an immediate, a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+ let input = ctx.get_input(spec.insn, spec.input);
+ let input_ty = ctx.input_ty(spec.insn, spec.input);
+ match lowerinput_to_sext_imm(input, input_ty) {
+ Some(x) => RegMemImm::imm(x),
+ None => match lowerinput_to_reg_mem(ctx, input) {
+ RegMem::Reg { reg } => RegMemImm::reg(reg),
+ RegMem::Mem { addr } => RegMemImm::mem(addr),
+ },
+ }
+}
+
+/// Emit an instruction to insert a value `src` into a lane of `dst`.
+fn emit_insert_lane<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ src: RegMem,
+ dst: Writable<Reg>,
+ lane: u8,
+ ty: Type,
+) {
+ if !ty.is_float() {
+ let (sse_op, is64) = match ty.lane_bits() {
+ 8 => (SseOpcode::Pinsrb, false),
+ 16 => (SseOpcode::Pinsrw, false),
+ 32 => (SseOpcode::Pinsrd, false),
+ 64 => (SseOpcode::Pinsrd, true),
+ _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
+ };
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+ } else if ty == types::F32 {
+ let sse_op = SseOpcode::Insertps;
+ // Insert 32 bits from the replacement (source lane 00, in immediate bits 7:6) into the
+ // vector (destination lane shifted into immediate bits 5:4).
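+ // For example, inserting into lane 2 yields the immediate 0b0010_0000 (0x20).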
+ let lane = 0b00_00_00_00 | lane << 4;
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
+ } else if ty == types::F64 {
+ let sse_op = match lane {
+ // Move the lowest quadword in replacement to vector without changing
+ // the upper bits.
+ 0 => SseOpcode::Movsd,
+ // Move the low 64 bits of replacement vector to the high 64 bits of the
+ // vector.
+ 1 => SseOpcode::Movlhps,
+ _ => unreachable!(),
+ };
+ // Here we use the `xmm_rm_r` encoding because it correctly tells the register
+ // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
+ // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
+ ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
+ } else {
+ panic!("unable to emit insertlane for type: {}", ty)
+ }
+}
+
+/// Emits an int comparison instruction.
+///
+/// Note: make sure that there are no instructions modifying the flags between a call to this
+/// function and the use of the flags!
+fn emit_cmp(ctx: Ctx, insn: IRInst) {
+ let ty = ctx.input_ty(insn, 0);
+
+ let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+
+ // TODO Try to commute the operands (and invert the condition) if one is an immediate.
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+
+ // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
+ // us dst - src at the machine instruction level, so invert operands.
+ ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
+}
+
+/// A specification for a fcmp emission.
+enum FcmpSpec {
+ /// Normal flow.
+ Normal,
+
+ /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
+ /// happens with `InvertedEqualOrConditions`.
+ ///
+ /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
+ /// sequence of instructions) that check for an "AND" combination of condition codes; see for
+ /// instance lowering of Select.
+ InvertEqual,
+}
+
+/// This explains how to interpret the results of an fcmp instruction.
+enum FcmpCondResult {
+ /// The given condition code must be set.
+ Condition(CC),
+
+ /// Both condition codes must be set.
+ AndConditions(CC, CC),
+
+ /// Either of the condition codes must be set.
+ OrConditions(CC, CC),
+
+ /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
+ /// of the condition codes must be set, and the user must invert the meaning of the resulting
+ /// condition codes. When the spec is set to `FcmpSpec::Normal`, this case can't be reached.
+ InvertedEqualOrConditions(CC, CC),
+}
+
+/// Emits a float comparison instruction.
+///
+/// Note: make sure that there are no instructions modifying the flags between a call to this
+/// function and the use of the flags!
+fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+ let (flip_operands, inverted_equal) = match cond_code {
+ FloatCC::LessThan
+ | FloatCC::LessThanOrEqual
+ | FloatCC::UnorderedOrGreaterThan
+ | FloatCC::UnorderedOrGreaterThanOrEqual => {
+ cond_code = cond_code.reverse();
+ (true, false)
+ }
+ FloatCC::Equal => {
+ let inverted_equal = match spec {
+ FcmpSpec::Normal => false,
+ FcmpSpec::InvertEqual => {
+ cond_code = FloatCC::NotEqual; // same as .inverse()
+ true
+ }
+ };
+ (false, inverted_equal)
+ }
+ _ => (false, false),
+ };
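+ // For example, `fcmp lt x, y` is handled as `fcmp gt y, x` above, so only the
+ // "greater-than"-style condition codes need to be materialized after the comparison.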
+
+ // A single UCOMISS/UCOMISD comparison sets all the flags we need; the condition
+ // code(s) to test are selected from `cond_code` below.
+ let op = match ctx.input_ty(insn, 0) {
+ types::F32 => SseOpcode::Ucomiss,
+ types::F64 => SseOpcode::Ucomisd,
+ _ => panic!("Bad input type to Fcmp"),
+ };
+
+ let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
+ let (lhs_input, rhs_input) = if flip_operands {
+ (inputs[1], inputs[0])
+ } else {
+ (inputs[0], inputs[1])
+ };
+ let lhs = put_input_in_reg(ctx, lhs_input);
+ let rhs = input_to_reg_mem(ctx, rhs_input);
+ ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+
+ let cond_result = match cond_code {
+ FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
+ FloatCC::NotEqual if inverted_equal => {
+ FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
+ }
+ FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
+ _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
+ };
+
+ cond_result
+}
+
+fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+ let mut sig = Signature::new(call_conv);
+ for i in 0..ctx.num_inputs(insn) {
+ sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
+ }
+ for i in 0..ctx.num_outputs(insn) {
+ sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
+ }
+ if call_conv.extends_baldrdash() {
+ // Adds the special VMContext parameter to the signature.
+ sig.params
+ .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
+ }
+ sig
+}
+
+fn emit_vm_call<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ flags: &Flags,
+ triple: &Triple,
+ libcall: LibCall,
+ insn: IRInst,
+ inputs: SmallVec<[InsnInput; 4]>,
+ outputs: SmallVec<[InsnOutput; 2]>,
+) -> CodegenResult<()> {
+ let extname = ExternalName::LibCall(libcall);
+
+ let dist = if flags.use_colocated_libcalls() {
+ RelocDistance::Near
+ } else {
+ RelocDistance::Far
+ };
+
+ // TODO avoid recreating signatures for every single Libcall function.
+ let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
+ let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);
+ let caller_conv = ctx.abi().call_conv();
+
+ let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv)?;
+
+ abi.emit_stack_pre_adjust(ctx);
+
+ let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
+ assert_eq!(inputs.len() + vm_context, abi.num_args());
+
+ for (i, input) in inputs.iter().enumerate() {
+ let arg_reg = put_input_in_reg(ctx, *input);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ if call_conv.extends_baldrdash() {
+ let vm_context_vreg = ctx
+ .get_vm_context()
+ .expect("should have a VMContext to pass to libcall funcs");
+ abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg);
+ }
+
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = get_output_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ abi.emit_stack_post_adjust(ctx);
+
+ Ok(())
+}
+
+/// If the given input is a shift by a constant amount less than or equal to 3, returns the shifted
+/// input and the shift amount.
+/// The goal is to embed it within an address mode.
+fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ spec: InsnInput,
+) -> Option<(InsnInput, u8)> {
+ matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
+ match input_to_imm(
+ ctx,
+ InsnInput {
+ insn: shift,
+ input: 1,
+ },
+ ) {
+ Some(shift_amt) if shift_amt <= 3 => Some((
+ InsnInput {
+ insn: shift,
+ input: 0,
+ },
+ shift_amt as u8,
+ )),
+ _ => None,
+ }
+ })
+}
+
+/// Lowers an instruction to one of the x86 addressing modes.
+///
+/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
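+///
+/// For example (illustrative): an `iadd` of `x` with an `ishl` of `y` by a constant 2, given a
+/// 16-byte offset, can be lowered to `Amode::imm_reg_reg_shift(16, x, y, 2)`, i.e. the x86
+/// address `16(%x, %y, 4)`.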
+fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode {
+ let flags = ctx
+ .memflags(spec.insn)
+ .expect("Instruction with amode should have memflags");
+
+ // We either have an `iadd` whose operands we can fold into the addressing mode, or some
+ // other input; in both cases we also fold in the final offset.
+ if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
+ debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
+ let add_inputs = &[
+ InsnInput {
+ insn: add,
+ input: 0,
+ },
+ InsnInput {
+ insn: add,
+ input: 1,
+ },
+ ];
+
+ // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
+ // aren't happening in the wasm case. We could do better, given some range analysis.
+ let (base, index, shift) = if let Some((shift_input, shift_amt)) =
+ matches_small_constant_shift(ctx, add_inputs[0])
+ {
+ (
+ put_input_in_reg(ctx, add_inputs[1]),
+ put_input_in_reg(ctx, shift_input),
+ shift_amt,
+ )
+ } else if let Some((shift_input, shift_amt)) =
+ matches_small_constant_shift(ctx, add_inputs[1])
+ {
+ (
+ put_input_in_reg(ctx, add_inputs[0]),
+ put_input_in_reg(ctx, shift_input),
+ shift_amt,
+ )
+ } else {
+ for i in 0..=1 {
+ let input = ctx.get_input(add, i);
+
+ // Try to pierce through uextend.
+ if let Some(uextend) = matches_input(
+ ctx,
+ InsnInput {
+ insn: add,
+ input: i,
+ },
+ Opcode::Uextend,
+ ) {
+ if let Some(cst) = ctx.get_input(uextend, 0).constant {
+ // Zero the upper bits.
+ let input_size = ctx.input_ty(uextend, 0).bits() as u64;
+ let shift: u64 = 64 - input_size;
+ let uext_cst: u64 = (cst << shift) >> shift;
+
+ let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
+ if low32_will_sign_extend_to_64(final_offset as u64) {
+ let base = put_input_in_reg(ctx, add_inputs[1 - i]);
+ return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
+ }
+ }
+ }
+
+ // If it's a constant, add it directly!
+ if let Some(cst) = input.constant {
+ let final_offset = (offset as i64).wrapping_add(cst as i64);
+ if low32_will_sign_extend_to_64(final_offset as u64) {
+ let base = put_input_in_reg(ctx, add_inputs[1 - i]);
+ return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
+ }
+ }
+ }
+
+ (
+ put_input_in_reg(ctx, add_inputs[0]),
+ put_input_in_reg(ctx, add_inputs[1]),
+ 0,
+ )
+ };
+
+ return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags);
+ }
+
+ let input = put_input_in_reg(ctx, spec);
+ Amode::imm_reg(offset as u32, input).with_flags(flags)
+}
+
+//=============================================================================
+// Top-level instruction lowering entry point, for one instruction.
+
+/// Actually codegen an instruction's results into registers.
+fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
+ ctx: &mut C,
+ insn: IRInst,
+ flags: &Flags,
+ triple: &Triple,
+) -> CodegenResult<()> {
+ let op = ctx.data(insn).opcode();
+
+ let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
+ .map(|i| InsnInput { insn, input: i })
+ .collect();
+ let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
+ .map(|i| InsnOutput { insn, output: i })
+ .collect();
+
+ let ty = if outputs.len() > 0 {
+ Some(ctx.output_ty(insn, 0))
+ } else {
+ None
+ };
+
+ match op {
+ Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
+ let value = ctx
+ .get_constant(insn)
+ .expect("constant value for iconst et al");
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::Iadd
+ | Opcode::IaddIfcout
+ | Opcode::SaddSat
+ | Opcode::UaddSat
+ | Opcode::Isub
+ | Opcode::SsubSat
+ | Opcode::UsubSat
+ | Opcode::Imul
+ | Opcode::AvgRound
+ | Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor => {
+ let ty = ty.unwrap();
+ if ty.lane_count() > 1 {
+ let sse_op = match op {
+ Opcode::Iadd => match ty {
+ types::I8X16 => SseOpcode::Paddb,
+ types::I16X8 => SseOpcode::Paddw,
+ types::I32X4 => SseOpcode::Paddd,
+ types::I64X2 => SseOpcode::Paddq,
+ _ => panic!("Unsupported type for packed iadd instruction: {}", ty),
+ },
+ Opcode::SaddSat => match ty {
+ types::I8X16 => SseOpcode::Paddsb,
+ types::I16X8 => SseOpcode::Paddsw,
+ _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty),
+ },
+ Opcode::UaddSat => match ty {
+ types::I8X16 => SseOpcode::Paddusb,
+ types::I16X8 => SseOpcode::Paddusw,
+ _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty),
+ },
+ Opcode::Isub => match ty {
+ types::I8X16 => SseOpcode::Psubb,
+ types::I16X8 => SseOpcode::Psubw,
+ types::I32X4 => SseOpcode::Psubd,
+ types::I64X2 => SseOpcode::Psubq,
+ _ => panic!("Unsupported type for packed isub instruction: {}", ty),
+ },
+ Opcode::SsubSat => match ty {
+ types::I8X16 => SseOpcode::Psubsb,
+ types::I16X8 => SseOpcode::Psubsw,
+ _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty),
+ },
+ Opcode::UsubSat => match ty {
+ types::I8X16 => SseOpcode::Psubusb,
+ types::I16X8 => SseOpcode::Psubusw,
+ _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
+ },
+ Opcode::Imul => match ty {
+ types::I16X8 => SseOpcode::Pmullw,
+ types::I32X4 => SseOpcode::Pmulld,
+ types::I64X2 => {
+ // Note for I64X2 we describe a lane A as being composed of a
+ // 32-bit upper half "Ah" and a 32-bit lower half "Al".
+ // The 64-bit product can then be written, long-hand in 32-bit pieces, as:
+ // Ah Al
+ // * Bh Bl
+ // -----
+ // Al * Bl
+ // + (Ah * Bl) << 32
+ // + (Al * Bh) << 32
+ //
+ // So for each lane we will compute:
+ // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
+ //
+ // Note, the algorithm will use pmuludq, which operates directly on
+ // the lower 32-bit (Al or Bl) of a lane and writes the result
+ // to the full 64-bits of the lane of the destination. For this
+ // reason we don't need shifts to isolate the lower 32-bits, however
+ // we will need to use shifts to isolate the high 32-bits when doing
+ // calculations, i.e. Ah == A >> 32
+ //
+ // The full sequence then is as follows:
+ // A' = A
+ // A' = A' >> 32
+ // A' = Ah' * Bl
+ // B' = B
+ // B' = B' >> 32
+ // B' = Bh' * Al
+ // B' = B' + A'
+ // B' = B' << 32
+ // A' = A
+ // A' = Al' * Bl
+ // A' = A' + B'
+ // dst = A'
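+ //
+ // As a concrete check with illustrative numbers: for A = 2*2^32 + 3 and
+ // B = 5*2^32 + 7, the formula gives (3*7) + ((2*7 + 3*5) << 32), i.e.
+ // 21 + (29 << 32), which is exactly A*B modulo 2^64.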
+
+ // Get inputs rhs=A and lhs=B and the dst register
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // A' = A
+ let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2);
+ ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
+
+ // A' = A' >> 32
+ // A' = Ah' * Bl
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psrlq,
+ RegMemImm::imm(32),
+ rhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pmuludq,
+ RegMem::reg(lhs.clone()),
+ rhs_1,
+ ));
+
+ // B' = B
+ let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2);
+ ctx.emit(Inst::gen_move(lhs_1, lhs, ty));
+
+ // B' = B' >> 32
+ // B' = Bh' * Al
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psrlq,
+ RegMemImm::imm(32),
+ lhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
+
+ // B' = B' + A'
+ // B' = B' << 32
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddq,
+ RegMem::reg(rhs_1.to_reg()),
+ lhs_1,
+ ));
+ ctx.emit(Inst::xmm_rmi_reg(
+ SseOpcode::Psllq,
+ RegMemImm::imm(32),
+ lhs_1,
+ ));
+
+ // A' = A
+ // A' = Al' * Bl
+ // A' = A' + B'
+ // dst = A'
+ ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pmuludq,
+ RegMem::reg(lhs.clone()),
+ rhs_1,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddq,
+ RegMem::reg(lhs_1.to_reg()),
+ rhs_1,
+ ));
+ ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
+ return Ok(());
+ }
+ _ => panic!("Unsupported type for packed imul instruction: {}", ty),
+ },
+ Opcode::AvgRound => match ty {
+ types::I8X16 => SseOpcode::Pavgb,
+ types::I16X8 => SseOpcode::Pavgw,
+ _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
+ },
+ Opcode::Band => match ty {
+ types::F32X4 => SseOpcode::Andps,
+ types::F64X2 => SseOpcode::Andpd,
+ _ => SseOpcode::Pand,
+ },
+ Opcode::Bor => match ty {
+ types::F32X4 => SseOpcode::Orps,
+ types::F64X2 => SseOpcode::Orpd,
+ _ => SseOpcode::Por,
+ },
+ Opcode::Bxor => match ty {
+ types::F32X4 => SseOpcode::Xorps,
+ types::F64X2 => SseOpcode::Xorpd,
+ _ => SseOpcode::Pxor,
+ },
+ _ => panic!("Unsupported packed instruction: {}", op),
+ };
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Move the `lhs` to the same register as `dst`.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ } else {
+ let is_64 = ty == types::I64;
+ let alu_op = match op {
+ Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
+ Opcode::Isub => AluRmiROpcode::Sub,
+ Opcode::Imul => AluRmiROpcode::Mul,
+ Opcode::Band => AluRmiROpcode::And,
+ Opcode::Bor => AluRmiROpcode::Or,
+ Opcode::Bxor => AluRmiROpcode::Xor,
+ _ => unreachable!(),
+ };
+
+ let (lhs, rhs) = match op {
+ Opcode::Iadd
+ | Opcode::IaddIfcout
+ | Opcode::Imul
+ | Opcode::Band
+ | Opcode::Bor
+ | Opcode::Bxor => {
+ // For commutative operations, try to commute operands if one is an
+ // immediate.
+ if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
+ (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+ } else {
+ (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem_imm(ctx, inputs[1]),
+ )
+ }
+ }
+ Opcode::Isub => (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem_imm(ctx, inputs[1]),
+ ),
+ _ => unreachable!(),
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::mov_r_r(true, lhs, dst));
+ ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst));
+ }
+ }
+
+ Opcode::BandNot => {
+ let ty = ty.unwrap();
+ debug_assert!(ty.is_vector() && ty.bytes() == 16);
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let sse_op = match ty {
+ types::F32X4 => SseOpcode::Andnps,
+ types::F64X2 => SseOpcode::Andnpd,
+ _ => SseOpcode::Pandn,
+ };
+ // Note the flipping of operands: the `rhs` operand is used as the destination instead
+ // of the `lhs` as in the other bit operations above (e.g. `band`).
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
+ }
+
+ Opcode::Iabs => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if ty.is_vector() {
+ let opcode = match ty {
+ types::I8X16 => SseOpcode::Pabsb,
+ types::I16X8 => SseOpcode::Pabsw,
+ types::I32X4 => SseOpcode::Pabsd,
+ _ => panic!("Unsupported type for packed iabs instruction: {}", ty),
+ };
+ ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst));
+ } else {
+ unimplemented!("iabs is unimplemented for non-vector type: {}", ty);
+ }
+ }
+
+ Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ if ty.is_vector() {
+ let sse_op = match op {
+ Opcode::Imax => match ty {
+ types::I8X16 => SseOpcode::Pmaxsb,
+ types::I16X8 => SseOpcode::Pmaxsw,
+ types::I32X4 => SseOpcode::Pmaxsd,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Umax => match ty {
+ types::I8X16 => SseOpcode::Pmaxub,
+ types::I16X8 => SseOpcode::Pmaxuw,
+ types::I32X4 => SseOpcode::Pmaxud,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Imin => match ty {
+ types::I8X16 => SseOpcode::Pminsb,
+ types::I16X8 => SseOpcode::Pminsw,
+ types::I32X4 => SseOpcode::Pminsd,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ Opcode::Umin => match ty {
+ types::I8X16 => SseOpcode::Pminub,
+ types::I16X8 => SseOpcode::Pminuw,
+ types::I32X4 => SseOpcode::Pminud,
+ _ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
+ },
+ _ => unreachable!("This is a bug: the external and internal `match op` should be over the same opcodes."),
+ };
+
+ // Move the `lhs` to the same register as `dst`.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ } else {
+ panic!("Unsupported type for {} instruction: {}", op, ty);
+ }
+ }
+
+ Opcode::Bnot => {
+ let ty = ty.unwrap();
+ let size = ty.bytes() as u8;
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ if ty.is_vector() {
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp));
+ ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
+ } else if ty.is_bool() {
+ unimplemented!("bool bnot")
+ } else {
+ ctx.emit(Inst::not(size, dst));
+ }
+ }
+
+ Opcode::Bitselect => {
+ let ty = ty.unwrap();
+ let condition = put_input_in_reg(ctx, inputs[0]);
+ let if_true = put_input_in_reg(ctx, inputs[1]);
+ let if_false = input_to_reg_mem(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ if ty.is_vector() {
+ let tmp1 = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::gen_move(tmp1, if_true, ty));
+ ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));
+
+ let tmp2 = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::gen_move(tmp2, condition, ty));
+ ctx.emit(Inst::and_not(ty, if_false, tmp2));
+
+ ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
+ ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
+ } else {
+ unimplemented!("scalar bitselect")
+ }
+ }
+
+ Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
+ let dst_ty = ctx.output_ty(insn, 0);
+ debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
+
+ let (size, lhs) = match dst_ty {
+ types::I8 | types::I16 => match op {
+ Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])),
+ Opcode::Ushr => (
+ 4,
+ extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
+ ),
+ Opcode::Sshr => (
+ 4,
+ extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
+ ),
+ Opcode::Rotl | Opcode::Rotr => {
+ (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0]))
+ }
+ _ => unreachable!(),
+ },
+ types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])),
+ _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
+ };
+
+ let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant {
+ // Mask count, according to Cranelift's semantics.
+ let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
+ (Some(cst), None)
+ } else {
+ (None, Some(put_input_in_reg(ctx, inputs[1])))
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let shift_kind = match op {
+ Opcode::Ishl => ShiftKind::ShiftLeft,
+ Opcode::Ushr => ShiftKind::ShiftRightLogical,
+ Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
+ Opcode::Rotl => ShiftKind::RotateLeft,
+ Opcode::Rotr => ShiftKind::RotateRight,
+ _ => unreachable!(),
+ };
+
+ let w_rcx = Writable::from_reg(regs::rcx());
+ ctx.emit(Inst::mov_r_r(true, lhs, dst));
+ if count.is_none() {
+ ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
+ }
+ ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
+ }
+
+ Opcode::Ineg => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ if ty.is_vector() {
+ // Zeroes out a register and then does a packed subtraction
+ // of the input from that register.
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+
+ let subtract_opcode = match ty {
+ types::I8X16 => SseOpcode::Psubb,
+ types::I16X8 => SseOpcode::Psubw,
+ types::I32X4 => SseOpcode::Psubd,
+ types::I64X2 => SseOpcode::Psubq,
+ _ => panic!("Unsupported type for Ineg instruction, found {}", ty),
+ };
+
+ // Note we must zero out a tmp instead of using the destination register, since
+ // the destination could be an alias for the source input register.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ ));
+ ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ } else {
+ let size = ty.bytes() as u8;
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ ctx.emit(Inst::neg(size, dst));
+ }
+ }
+
+ Opcode::Clz => {
+ // TODO when the x86 flags have use_lzcnt, we can use LZCNT.
+
+ // General formula using bit-scan reverse (BSR):
+ // mov -1, %dst
+ // bsr %src, %tmp
+ // cmovz %dst, %tmp
+ // mov $(size_bits - 1), %dst
+ // sub %tmp, %dst
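+ //
+ // For example, a 32-bit src of 0x0000_00f0 gives BSR = 7, so dst = 31 - 7 = 24;
+ // for src == 0 the CMOVZ substitutes -1, giving dst = 31 - (-1) = 32.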
+
+ let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
+ a if a == types::I32 || a == types::I64 => (None, a),
+ _ => unreachable!(),
+ };
+
+ let src = if let Some(ext_spec) = ext_spec {
+ RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
+ } else {
+ input_to_reg_mem(ctx, inputs[0])
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let tmp = ctx.alloc_tmp(RegClass::I64, ty);
+ ctx.emit(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ u64::max_value(),
+ dst,
+ ));
+
+ ctx.emit(Inst::unary_rm_r(
+ ty.bytes() as u8,
+ UnaryRmROpcode::Bsr,
+ src,
+ tmp,
+ ));
+
+ ctx.emit(Inst::cmove(
+ ty.bytes() as u8,
+ CC::Z,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+
+ ctx.emit(Inst::imm(
+ OperandSize::from_bytes(ty.bytes()),
+ ty.bits() as u64 - 1,
+ dst,
+ ));
+
+ ctx.emit(Inst::alu_rmi_r(
+ ty == types::I64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+
+ Opcode::Ctz => {
+ // TODO when the x86 flags have use_bmi1, we can use TZCNT.
+
+ // General formula using bit-scan forward (BSF):
+ // bsf %src, %dst
+ // mov $(size_bits), %tmp
+ // cmovz %tmp, %dst
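+ //
+ // For example, a 32-bit src of 0b1000 gives BSF = 3, which is already the answer;
+ // for src == 0 the CMOVZ substitutes size_bits, i.e. 32.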
+ let ty = ctx.input_ty(insn, 0);
+ let ty = if ty.bits() < 32 { types::I32 } else { ty };
+ debug_assert!(ty == types::I32 || ty == types::I64);
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let tmp = ctx.alloc_tmp(RegClass::I64, ty);
+ ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp));
+
+ ctx.emit(Inst::unary_rm_r(
+ ty.bytes() as u8,
+ UnaryRmROpcode::Bsf,
+ src,
+ dst,
+ ));
+
+ ctx.emit(Inst::cmove(
+ ty.bytes() as u8,
+ CC::Z,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+
+ Opcode::Popcnt => {
+ // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction.
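+ //
+ // Without a native POPCNT we emit the classic SWAR sequence below: three
+ // shift/mask/subtract rounds with the 0x7777... mask leave each nibble holding its
+ // own bit count, a shift-by-4 plus an 0x0F0F... mask folds nibble pairs into bytes,
+ // and a multiply by 0x0101... accumulates all byte counts into the top byte, which
+ // the final shift extracts.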
+
+ let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
+ a if a == types::I32 || a == types::I64 => (None, a),
+ _ => unreachable!(),
+ };
+
+ let src = if let Some(ext_spec) = ext_spec {
+ RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
+ } else {
+ input_to_reg_mem(ctx, inputs[0])
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ if ty == types::I64 {
+ let is_64 = true;
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let cst = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ // mov src, tmp1
+ ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // mov 0x7777_7777_7777_7777, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
+
+ // andq cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // mov src, tmp2
+ ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and cst, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // mov tmp2, dst
+ ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+ // shr $4, dst
+ ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst));
+
+ // add tmp2, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ dst,
+ ));
+
+ // mov $0x0F0F_0F0F_0F0F_0F0F, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
+
+ // and cst, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::reg(cst.to_reg()),
+ dst,
+ ));
+
+ // mov $0x0101_0101_0101_0101, cst
+ ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
+
+ // mul cst, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Mul,
+ RegMemImm::reg(cst.to_reg()),
+ dst,
+ ));
+
+ // shr $56, dst
+ ctx.emit(Inst::shift_r(
+ 8,
+ ShiftKind::ShiftRightLogical,
+ Some(56),
+ dst,
+ ));
+ } else {
+ assert_eq!(ty, types::I32);
+ let is_64 = false;
+
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ // mov src, tmp1
+ ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // andq $0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // mov src, tmp2
+ ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and 0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // shr $1, tmp1
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(1),
+ tmp1,
+ ));
+
+ // and $0x7777_7777, tmp1
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x77777777),
+ tmp1,
+ ));
+
+ // sub tmp1, tmp2
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Sub,
+ RegMemImm::reg(tmp1.to_reg()),
+ tmp2,
+ ));
+
+ // mov tmp2, dst
+ ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+ // shr $4, dst
+ ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst));
+
+ // add tmp2, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Add,
+ RegMemImm::reg(tmp2.to_reg()),
+ dst,
+ ));
+
+ // and $0x0F0F_0F0F, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::And,
+ RegMemImm::imm(0x0F0F0F0F),
+ dst,
+ ));
+
+ // mul $0x0101_0101, dst
+ ctx.emit(Inst::alu_rmi_r(
+ is_64,
+ AluRmiROpcode::Mul,
+ RegMemImm::imm(0x01010101),
+ dst,
+ ));
+
+ // shr $24, dst
+ ctx.emit(Inst::shift_r(
+ 4,
+ ShiftKind::ShiftRightLogical,
+ Some(24),
+ dst,
+ ));
+ }
+ }
+
+ Opcode::IsNull | Opcode::IsInvalid => {
+ // Null references are represented by the constant value 0; invalid references are
+ // represented by the constant value -1. See `define_reftypes()` in
+ // `meta/src/isa/x86/encodings.rs` to confirm.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ let imm = match op {
+ Opcode::IsNull => {
+ // TODO could use `test src, src` for IsNull
+ 0
+ }
+ Opcode::IsInvalid => {
+ // We can do a 32-bit comparison even in 64-bit mode, as the constant is then
+ // sign-extended.
+ 0xffffffff
+ }
+ _ => unreachable!(),
+ };
+ ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src));
+ ctx.emit(Inst::setcc(CC::Z, dst));
+ }
+
+ Opcode::Uextend
+ | Opcode::Sextend
+ | Opcode::Bint
+ | Opcode::Breduce
+ | Opcode::Bextend
+ | Opcode::Ireduce => {
+ let src_ty = ctx.input_ty(insn, 0);
+ let dst_ty = ctx.output_ty(insn, 0);
+
+ // Sextend requires a sign-extended move, but all the other opcodes are simply a move
+ // from a zero-extended source. Here is why this works, in each case:
+ //
+ // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to
+ // zero-extend here.
+ //
+ // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so
+ // again, this is a zero-extend / no-op.
+ //
+ // - Ireduce: changing width of an integer. Smaller ints are stored with undefined
+ // high-order bits, so we can simply do a copy.
+
+ if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend {
+ // As a particular x64 pattern-matching opportunity: all the 32-bit ALU opcodes
+ // zero-extend into the upper 32 bits, so we don't even need to generate a
+ // zero-extending move in this case.
+ // TODO add loads and shifts here.
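+ //
+ // For example, `uextend.i64` of an `iadd.i32` can simply reuse the add's result
+ // register, because the 32-bit add already cleared bits 63:32.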
+ if let Some(_) = matches_input_any(
+ ctx,
+ inputs[0],
+ &[
+ Opcode::Iadd,
+ Opcode::IaddIfcout,
+ Opcode::Isub,
+ Opcode::Imul,
+ Opcode::Band,
+ Opcode::Bor,
+ Opcode::Bxor,
+ ],
+ ) {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, types::I64));
+ return Ok(());
+ }
+ }
+
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
+ assert_eq!(
+ src_ty.bits() < dst_ty.bits(),
+ ext_mode.is_some(),
+ "unexpected extension: {} -> {}",
+ src_ty,
+ dst_ty
+ );
+
+ if let Some(ext_mode) = ext_mode {
+ if op == Opcode::Sextend {
+ ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst));
+ } else {
+ ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst));
+ }
+ } else {
+ ctx.emit(Inst::mov64_rm_r(src, dst));
+ }
+ }
+
+ Opcode::Icmp => {
+ let condcode = ctx.data(insn).cond_code().unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.input_ty(insn, 0);
+ if !ty.is_vector() {
+ emit_cmp(ctx, insn);
+ let cc = CC::from_intcc(condcode);
+ ctx.emit(Inst::setcc(cc, dst));
+ } else {
+ assert_eq!(ty.bits(), 128);
+ let eq = |ty| match ty {
+ types::I8X16 => SseOpcode::Pcmpeqb,
+ types::I16X8 => SseOpcode::Pcmpeqw,
+ types::I32X4 => SseOpcode::Pcmpeqd,
+ types::I64X2 => SseOpcode::Pcmpeqq,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let gt = |ty| match ty {
+ types::I8X16 => SseOpcode::Pcmpgtb,
+ types::I16X8 => SseOpcode::Pcmpgtw,
+ types::I32X4 => SseOpcode::Pcmpgtd,
+ types::I64X2 => SseOpcode::Pcmpgtq,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let maxu = |ty| match ty {
+ types::I8X16 => SseOpcode::Pmaxub,
+ types::I16X8 => SseOpcode::Pmaxuw,
+ types::I32X4 => SseOpcode::Pmaxud,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let mins = |ty| match ty {
+ types::I8X16 => SseOpcode::Pminsb,
+ types::I16X8 => SseOpcode::Pminsw,
+ types::I32X4 => SseOpcode::Pminsd,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+ let minu = |ty| match ty {
+ types::I8X16 => SseOpcode::Pminub,
+ types::I16X8 => SseOpcode::Pminuw,
+ types::I32X4 => SseOpcode::Pminud,
+ _ => panic!(
+ "Unable to find an instruction for {} for type: {}",
+ condcode, ty
+ ),
+ };
+
+ // Here we decide which operand to use as the read/write `dst` (ModRM reg field)
+ // and which to use as the read `input` (ModRM r/m field). In the normal case we
+ // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for
+ // the less-than cases so that we can reuse the greater-than implementation.
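+ // (For example, a signed less-than is computed as a greater-than with the operands
+ // swapped, since SSE provides the PCMPGT* family but no packed less-than instruction.)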
+ let input = match condcode {
+ IntCC::SignedLessThan
+ | IntCC::SignedLessThanOrEqual
+ | IntCC::UnsignedLessThan
+ | IntCC::UnsignedLessThanOrEqual => {
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ lhs
+ }
+ _ => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+ rhs
+ }
+ };
+
+ match condcode {
+ IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
+ IntCC::NotEqual => {
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+ // Emit all 1s into the `tmp` register.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+ // Invert the result of the `PCMPEQ*`.
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+ }
+ IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
+ ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
+ }
+ IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
+ ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+ }
+ IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
+ ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+ // Emit all 1s into the `tmp` register.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+ // Invert the result of the `PCMPEQ*`.
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+ }
+ IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
+ ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
+ ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+ }
+ _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
+ }
+ }
+ }
+
+ Opcode::Fcmp => {
+ let cond_code = ctx.data(insn).fp_cond_code().unwrap();
+ let input_ty = ctx.input_ty(insn, 0);
+ if !input_ty.is_vector() {
+ // Unordered is returned by setting ZF, PF, CF <- 111
+ // Greater than by ZF, PF, CF <- 000
+ // Less than by ZF, PF, CF <- 001
+ // Equal by ZF, PF, CF <- 100
+ //
+ // Checking the result of comiss is somewhat annoying because you don't have setcc
+ // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
+ // gt, etc) *and* orderedness.
+ //
+ // That can mean more than one setcc check followed by a logical "and" or "or" to
+ // combine them, in some cases. However, since a set parity bit means the result was
+ // considered unordered, and an unordered result also sets both the ZF and CF flag
+ // bits, we can get away with using one setcc for most condition codes.
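+ // For example, after a comis{s,d}, `GreaterThan` holds iff ZF=0 and CF=0, which a
+ // single setcc for CC::NBE captures, and an unordered result (ZF=PF=CF=1) correctly
+ // yields false. Conditions like `Equal`, on the other hand, need ZF=1 *and* PF=0,
+ // hence the AndConditions arm below with two setcc instructions combined by an AND.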
+
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit(Inst::setcc(cc, dst));
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, dst));
+ ctx.emit(Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::And,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, dst));
+ ctx.emit(Inst::alu_rmi_r(
+ false,
+ AluRmiROpcode::Or,
+ RegMemImm::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ }
+ } else {
+ let op = match input_ty {
+ types::F32X4 => SseOpcode::Cmpps,
+ types::F64X2 => SseOpcode::Cmppd,
+ _ => panic!("Bad input type to fcmp: {}", input_ty),
+ };
+
+ // Since some packed comparisons are not available, some of the condition codes
+ // must be inverted, with a corresponding `flip` of the operands.
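+ // (CMPPS/CMPPD only encode the eq/lt/le/unord/neq/nlt/nle/ord predicates, so e.g.
+ // `GreaterThan(a, b)` is lowered as `LessThan(b, a)` with the operands swapped.)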
+ let (imm, flip) = match cond_code {
+ FloatCC::GreaterThan => (FcmpImm::LessThan, true),
+ FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
+ FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
+ FloatCC::UnorderedOrLessThanOrEqual => {
+ (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
+ }
+ FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
+ panic!("unsupported float condition code: {}", cond_code)
+ }
+ _ => (FcmpImm::from(cond_code), false),
+ };
+
+ // Determine the operands of the comparison, possibly by flipping them.
+ let (lhs, rhs) = if flip {
+ (
+ put_input_in_reg(ctx, inputs[1]),
+ input_to_reg_mem(ctx, inputs[0]),
+ )
+ } else {
+ (
+ put_input_in_reg(ctx, inputs[0]),
+ input_to_reg_mem(ctx, inputs[1]),
+ )
+ };
+
+ // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, lhs, input_ty));
+
+ // Emit the comparison.
+ ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
+ }
+ }
+
+ Opcode::FallthroughReturn | Opcode::Return => {
+ for i in 0..ctx.num_inputs(insn) {
+ let src_reg = put_input_in_reg(ctx, inputs[i]);
+ let retval_reg = ctx.retval(i);
+ let ty = ctx.input_ty(insn, i);
+ ctx.emit(Inst::gen_move(retval_reg, src_reg, ty));
+ }
+ // N.B.: the Ret itself is generated by the ABI.
+ }
+
+ Opcode::Call | Opcode::CallIndirect => {
+ let caller_conv = ctx.abi().call_conv();
+ let (mut abi, inputs) = match op {
+ Opcode::Call => {
+ let (extname, dist) = ctx.call_target(insn).unwrap();
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len(), sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ X64ABICaller::from_func(sig, &extname, dist, caller_conv)?,
+ &inputs[..],
+ )
+ }
+
+ Opcode::CallIndirect => {
+ let ptr = put_input_in_reg(ctx, inputs[0]);
+ let sig = ctx.call_sig(insn).unwrap();
+ assert_eq!(inputs.len() - 1, sig.params.len());
+ assert_eq!(outputs.len(), sig.returns.len());
+ (
+ X64ABICaller::from_ptr(sig, ptr, op, caller_conv)?,
+ &inputs[1..],
+ )
+ }
+
+ _ => unreachable!(),
+ };
+
+ abi.emit_stack_pre_adjust(ctx);
+ assert_eq!(inputs.len(), abi.num_args());
+ for (i, input) in inputs.iter().enumerate() {
+ let arg_reg = put_input_in_reg(ctx, *input);
+ abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
+ }
+ abi.emit_call(ctx);
+ for (i, output) in outputs.iter().enumerate() {
+ let retval_reg = get_output_reg(ctx, *output);
+ abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
+ }
+ abi.emit_stack_post_adjust(ctx);
+ }
+
+ Opcode::Debugtrap => {
+ ctx.emit(Inst::Hlt);
+ }
+
+ Opcode::Trap | Opcode::ResumableTrap => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+ ctx.emit_safepoint(Inst::Ud2 { trap_code });
+ }
+
+ Opcode::Trapif | Opcode::Trapff => {
+ let trap_code = ctx.data(insn).trap_code().unwrap();
+
+ if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
+ let cond_code = ctx.data(insn).cond_code().unwrap();
+ // The flags must not have been clobbered by any other instruction between the
+ // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
+ // simply use the flags here.
+ let cc = CC::from_intcc(cond_code);
+
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
+ } else if op == Opcode::Trapif {
+ let cond_code = ctx.data(insn).cond_code().unwrap();
+ let cc = CC::from_intcc(cond_code);
+
+ // Verification ensures that the input is always a single-def ifcmp.
+ let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+ emit_cmp(ctx, ifcmp);
+
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
+ } else {
+ let cond_code = ctx.data(insn).fp_cond_code().unwrap();
+
+ // Verification ensures that the input is always a single-def ffcmp.
+ let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
+
+ match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc })
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ // A bit unfortunate, but materialize the flags in their own register, and
+ // check against this.
+ let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::setcc(cc1, tmp));
+ ctx.emit(Inst::setcc(cc2, tmp2));
+ ctx.emit(Inst::alu_rmi_r(
+ false, /* is_64 */
+ AluRmiROpcode::And,
+ RegMemImm::reg(tmp.to_reg()),
+ tmp2,
+ ));
+ ctx.emit_safepoint(Inst::TrapIf {
+ trap_code,
+ cc: CC::NZ,
+ });
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 });
+ ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 });
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ };
+ };
+ }
+
+ Opcode::F64const => {
+ // TODO use cmpeqpd for all 1s.
+ let value = ctx.get_constant(insn).unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::F32const => {
+ // TODO use cmpeqps for all 1s.
+ let value = ctx.get_constant(insn).unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ }
+
+ Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ ctx.emit(Inst::gen_move(dst, lhs, ty));
+
+ // Note: min and max can't be handled here, because of the way Cranelift defines them:
+ // if any operand is a NaN, they must return the NaN operand, while the x86 machine
+ // instruction will return the second operand if either operand is a NaN.
+ let sse_op = match ty {
+ types::F32 => match op {
+ Opcode::Fadd => SseOpcode::Addss,
+ Opcode::Fsub => SseOpcode::Subss,
+ Opcode::Fmul => SseOpcode::Mulss,
+ Opcode::Fdiv => SseOpcode::Divss,
+ _ => unreachable!(),
+ },
+ types::F64 => match op {
+ Opcode::Fadd => SseOpcode::Addsd,
+ Opcode::Fsub => SseOpcode::Subsd,
+ Opcode::Fmul => SseOpcode::Mulsd,
+ Opcode::Fdiv => SseOpcode::Divsd,
+ _ => unreachable!(),
+ },
+ types::F32X4 => match op {
+ Opcode::Fadd => SseOpcode::Addps,
+ Opcode::Fsub => SseOpcode::Subps,
+ Opcode::Fmul => SseOpcode::Mulps,
+ Opcode::Fdiv => SseOpcode::Divps,
+ _ => unreachable!(),
+ },
+ types::F64X2 => match op {
+ Opcode::Fadd => SseOpcode::Addpd,
+ Opcode::Fsub => SseOpcode::Subpd,
+ Opcode::Fmul => SseOpcode::Mulpd,
+ Opcode::Fdiv => SseOpcode::Divpd,
+ _ => unreachable!(),
+ },
+ _ => panic!(
+ "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
+ ty
+ ),
+ };
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ }
+
+ Opcode::Fmin | Opcode::Fmax => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let is_min = op == Opcode::Fmin;
+ let output_ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, rhs, output_ty));
+ if !output_ty.is_vector() {
+ let op_size = match output_ty {
+ types::F32 => OperandSize::Size32,
+ types::F64 => OperandSize::Size64,
+ _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
+ };
+ ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
+ } else {
+ // X64's implementation of floating point min and floating point max does not
+ // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
+ // scalar approach we use jumps to handle cases where NaN and +0 propagation is
+ // not consistent with what is needed. However for packed floating point min and
+ // floating point max we implement a different approach to avoid the sequence
+ // of jumps that would be required on a per-lane basis. Because we do not need to
+ // lower labels and jumps, but do need `ctx` for creating temporaries, we implement
+ // the lowering here in lower.rs instead of emit.rs, as is done for the scalar case.
+ // The outline of the approach is as follows:
+ //
+ // First we perform the Min/Max in both directions. This is because, in the
+ // case of an operand's lane containing a NaN, or in the case of the lanes of the
+ // two operands containing 0 but with mismatched signs, x64 will return the second
+ // operand regardless of its contents. So in order to make sure we capture NaNs and
+ // normalize NaNs and 0 values, we perform the operation in both directions and merge
+ // the results. Then we normalize the results through operations that create a mask
+ // for the lanes containing NaNs; we use that mask to adjust NaNs to quiet NaNs and
+ // normalize 0s.
+ //
+ // The following sequence is generated for min:
+ //
+ // movap{s,d} %lhs, %tmp
+ // minp{s,d} %dst, %tmp
+ // minp{s,d} %lhs, %dst
+ // orp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+ // orp{s,d} %dst, %tmp
+ // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+ //
+ // and for max the sequence is:
+ //
+ // movap{s,d} %lhs, %tmp
+ // maxp{s,d} %dst, %tmp
+ // maxp{s,d} %lhs, %dst
+ // xorp{s,d} %tmp, %dst
+ // orp{s,d} %dst, %tmp
+ // subp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+ // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+
+ if is_min {
+ let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
+ match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Minps,
+ SseOpcode::Orps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Minpd,
+ SseOpcode::Orpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform min in reverse direction
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
+
+ // Perform min in original direction
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
+
+ // X64 handles propagation of -0's and NaNs differently between left and right
+ // operands. After doing the min in both directions, this OR will
+ // guarantee capture of -0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
+
+ // Compare unordered to create mask for lanes containing NaNs and then use
+ // that mask to saturate the NaN containing lanes in the tmp register with 1s.
+ // TODO: Would a check for NaN and then a jump be better here in the
+ // common case than continuing on to normalize NaNs that might not exist?
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // The dst register holds a mask for lanes containing NaNs.
+ // We take that mask and shift in preparation for creating a different mask
+ // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+ // number of least significant bits. We shift right each lane by 10 bits
+ // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+ // 11 exp. + 1 MSB sig.) for F64X2.
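+ // (Illustration for F32X4: a NaN lane's mask is all 1s, so it becomes 0x003F_FFFF
+ // after the shift; the ANDN below then leaves 0xFFC0_0000 in that lane, a quiet NaN,
+ // while a non-NaN lane's mask is 0, so tmp passes through unchanged there.)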
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do an and-not (ANDN) with the tmp register to produce the final
+ // results in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ } else {
+ let (
+ mov_op,
+ max_op,
+ xor_op,
+ or_op,
+ sub_op,
+ cmp_op,
+ shift_op,
+ shift_by,
+ andn_op,
+ ) = match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Maxps,
+ SseOpcode::Xorps,
+ SseOpcode::Orps,
+ SseOpcode::Subps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Maxpd,
+ SseOpcode::Xorpd,
+ SseOpcode::Orpd,
+ SseOpcode::Subpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp.
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform max in reverse direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Perform max in original direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+
+ // Get the difference between the two results and store in tmp.
+ // Max uses a different approach than min to account for potential
+ // discrepancies with plus/minus 0.
+ ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+
+ // X64 handles propagation of -0's and NaNs differently between left and right
+ // operands. After doing the max in both directions, this OR will
+ // guarantee capture of 0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Capture NaNs and sign discrepancies.
+ ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Compare unordered to create mask for lanes containing NaNs and then use
+ // that mask to saturate the NaN containing lanes in the tmp register with 1s.
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+
+ // The dst register holds a mask for lanes containing NaNs.
+ // We take that mask and shift in preparation for creating a different mask
+ // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+ // number of least significant bits. We shift right each lane by 10 bits
+ // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+ // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do an and-not (ANDN) with the tmp register to produce the final
+ // results in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ }
+ }
+ }
+
+ Opcode::FminPseudo | Opcode::FmaxPseudo => {
+ let lhs = input_to_reg_mem(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ let sse_opcode = match (ty, op) {
+ (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
+ (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
+ (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
+ (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
+ _ => unimplemented!("unsupported type {} for {}", ty, op),
+ };
+ ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
+ }
+
+ Opcode::Sqrt => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ let sse_op = match ty {
+ types::F32 => SseOpcode::Sqrtss,
+ types::F64 => SseOpcode::Sqrtsd,
+ types::F32X4 => SseOpcode::Sqrtps,
+ types::F64X2 => SseOpcode::Sqrtpd,
+ _ => panic!(
+ "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
+ ty
+ ),
+ };
+
+ ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
+ }
+
+ Opcode::Fpromote => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
+ }
+
+ Opcode::Fdemote => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
+ }
+
+ Opcode::FcvtFromSint => {
+ let output_ty = ty.unwrap();
+ if !output_ty.is_vector() {
+ let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
+ types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
+ types::I32 => (None, OperandSize::Size32),
+ types::I64 => (None, OperandSize::Size64),
+ _ => unreachable!(),
+ };
+
+ let src = match ext_spec {
+ Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
+ None => input_to_reg_mem(ctx, inputs[0]),
+ };
+
+ let opcode = if output_ty == types::F32 {
+ SseOpcode::Cvtsi2ss
+ } else {
+ assert_eq!(output_ty, types::F64);
+ SseOpcode::Cvtsi2sd
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
+ } else {
+ let ty = ty.unwrap();
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let opcode = match ctx.input_ty(insn, 0) {
+ types::I32X4 => SseOpcode::Cvtdq2ps,
+ _ => {
+ unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op)
+ }
+ };
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
+ }
+ }
+
+ Opcode::FcvtFromUint => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+
+ let input_ty = ctx.input_ty(insn, 0);
+ if !ty.is_vector() {
+ match input_ty {
+ types::I8 | types::I16 | types::I32 => {
+ // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
+ // do a signed conversion (which won't overflow).
+ let opcode = if ty == types::F32 {
+ SseOpcode::Cvtsi2ss
+ } else {
+ assert_eq!(ty, types::F64);
+ SseOpcode::Cvtsi2sd
+ };
+
+ let src = RegMem::reg(extend_input_to_reg(
+ ctx,
+ inputs[0],
+ ExtSpec::ZeroExtendTo64,
+ ));
+ ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
+ }
+
+ types::I64 => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+
+ let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
+ ctx.emit(Inst::gen_move(src_copy, src, types::I64));
+
+ let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ ctx.emit(Inst::cvt_u64_to_float_seq(
+ ty == types::F64,
+ src_copy,
+ tmp_gpr1,
+ tmp_gpr2,
+ dst,
+ ));
+ }
+ _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
+ };
+ } else {
+ // Converting packed unsigned integers to packed floats requires a few steps.
+ // There is no single-instruction lowering for converting packed unsigned integers,
+ // but there is one for converting packed signed integers to float (cvtdq2ps). In the
+ // steps below we isolate the upper half (16 bits) and lower half (16 bits) of each
+ // lane and then convert each half separately using cvtdq2ps, which is meant for
+ // signed integers. In order for this to work for the upper half, we must first shift
+ // those bits right by 1 (divide by 2) so that the most significant bit is 0 and the
+ // value is not treated as signed; after the conversion we double the value. Finally
+ // we add the two converted values, where the addition performs the correct rounding.
+ //
+ // Sequence:
+ // -> A = 0xffffffff
+ // -> Ah = 0xffff0000
+ // -> Al = 0x0000ffff
+ // -> Convert(Al) // Convert int to float
+ // -> Ah = Ah >> 1 // Shift right 1 to ensure Ah's conversion isn't treated as signed
+ // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
+ // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
+ // -> dst = Ah + Al // Add the two floats together
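+ // For example, a lane holding 0x8000_0001 splits into Ah = 0x8000_0000 and
+ // Al = 0x0000_0001; Ah >> 1 = 0x4000_0000 converts exactly to 2^30, doubling it
+ // restores 2^31, and adding Al's converted 1.0 gives 2^31 + 1, with the final
+ // addition applying the correct float rounding.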
+
+ assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Create a temporary register
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(src),
+ tmp,
+ ));
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ // Get the low 16 bits
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
+
+ // Get the high 16 bits
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
+
+ // Convert the low 16 bits
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
+
+ // Shift the high bits by 1, convert, and double to get the correct value.
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Addps,
+ RegMem::reg(dst.to_reg()),
+ dst,
+ ));
+
+ // Add together the two converted values.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Addps,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ }
+ }
+
+ Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let input_ty = ctx.input_ty(insn, 0);
+ if !input_ty.is_vector() {
+ let src_size = if input_ty == types::F32 {
+ OperandSize::Size32
+ } else {
+ assert_eq!(input_ty, types::F64);
+ OperandSize::Size64
+ };
+
+ let output_ty = ty.unwrap();
+ let dst_size = if output_ty == types::I32 {
+ OperandSize::Size32
+ } else {
+ assert_eq!(output_ty, types::I64);
+ OperandSize::Size64
+ };
+
+ let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
+ let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
+
+ let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty);
+ ctx.emit(Inst::gen_move(src_copy, src, input_ty));
+
+ let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty);
+ let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty);
+
+ if to_signed {
+ ctx.emit(Inst::cvt_float_to_sint_seq(
+ src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
+ ));
+ } else {
+ ctx.emit(Inst::cvt_float_to_uint_seq(
+ src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
+ ));
+ }
+ } else {
+ if op == Opcode::FcvtToSintSat {
+ // Sets destination to zero if float is NaN
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
+ ctx.emit(Inst::xmm_unary_rm_r(
+ SseOpcode::Movapd,
+ RegMem::reg(src),
+ tmp,
+ ));
+ ctx.emit(Inst::gen_move(dst, src, input_ty));
+ let cond = FcmpImm::from(FloatCC::Equal);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Cmpps,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ cond.encode(),
+ false,
+ ));
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Andps,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+
+ // Sets top bit of tmp if float is positive
+ // Setting up to set top bit on negative float values
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+
+ // Convert the packed float to packed doubleword.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Cvttps2dq,
+ RegMem::reg(dst.to_reg()),
+ dst,
+ ));
+
+ // Set top bit only if < 0
+ // Saturate lane with sign (top) bit.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pand,
+ RegMem::reg(dst.to_reg()),
+ tmp,
+ ));
+ ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
+
+ // On overflow 0x80000000 is returned to a lane.
+ // Below sets positive overflow lanes to 0x7FFFFFFF
+ // Keeps negative overflow lanes as is.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pxor,
+ RegMem::reg(tmp.to_reg()),
+ dst,
+ ));
+ } else if op == Opcode::FcvtToUintSat {
+ unimplemented!("f32x4.convert_i32x4_u");
+ } else {
+ // Since this branch is also guarded by a check for vector types,
+ // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here,
+ // because vector variants of those opcodes do not exist. The first two
+ // branches cover all reachable cases.
+ unreachable!();
+ }
+ }
+ }
+
+ Opcode::Bitcast => {
+ let input_ty = ctx.input_ty(insn, 0);
+ let output_ty = ctx.output_ty(insn, 0);
+ match (input_ty, output_ty) {
+ (types::F32, types::I32) => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Movd,
+ src,
+ dst,
+ OperandSize::Size32,
+ ));
+ }
+ (types::I32, types::F32) => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(
+ SseOpcode::Movd,
+ src,
+ OperandSize::Size32,
+ dst,
+ ));
+ }
+ (types::F64, types::I64) => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Movq,
+ src,
+ dst,
+ OperandSize::Size64,
+ ));
+ }
+ (types::I64, types::F64) => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gpr_to_xmm(
+ SseOpcode::Movq,
+ src,
+ OperandSize::Size64,
+ dst,
+ ));
+ }
+ _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
+ }
+ }
+
+ Opcode::Fabs | Opcode::Fneg => {
+ let src = input_to_reg_mem(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // In both cases, generate a constant and apply a single binary instruction:
+ // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
+ // src with it.
+ // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
+ // src with it.
+ let output_ty = ty.unwrap();
+ if !output_ty.is_vector() {
+ let (val, opcode) = match output_ty {
+ types::F32 => match op {
+ Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
+ Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
+ _ => unreachable!(),
+ },
+ types::F64 => match op {
+ Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
+ Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
+ _ => unreachable!(),
+ },
+ _ => panic!("unexpected type {:?} for Fabs", output_ty),
+ };
+
+ for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+
+ ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
+ } else {
+ // Eventually vector constants should be available in `gen_constant` and this block
+ // can be merged with the one above (TODO).
+ if output_ty.bits() == 128 {
+ // Move the `src` to the same register as `dst`; this may not emit an actual move
+ // but ensures that the registers are the same to match x86's read-write operand
+ // encoding.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(dst, src, output_ty));
+
+ // Generate an all 1s constant in an XMM register. This uses CMPPS but could
+ // have used CMPPD with the same effect.
+ let tmp = ctx.alloc_tmp(RegClass::V128, output_ty);
+ let cond = FcmpImm::from(FloatCC::Equal);
+ let cmpps = Inst::xmm_rm_r_imm(
+ SseOpcode::Cmpps,
+ RegMem::reg(tmp.to_reg()),
+ tmp,
+ cond.encode(),
+ false,
+ );
+ ctx.emit(cmpps);
+
+ // Shift the all 1s constant to generate the mask.
+ let lane_bits = output_ty.lane_bits();
+ let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
+ (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
+ (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
+ (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
+ (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
+ _ => unreachable!(
+ "unexpected opcode and lane size: {:?}, {} bits",
+ op, lane_bits
+ ),
+ };
+ let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
+ ctx.emit(shift);
+
+ // Apply shifted mask (XOR or AND).
+ let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
+ ctx.emit(mask);
+ } else {
+ panic!("unexpected type {:?} for Fabs", output_ty);
+ }
+ }
+ }
+
+ Opcode::Fcopysign => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+
+ let ty = ty.unwrap();
+
+ // We're going to generate the following sequence:
+ //
+ // movabs $INT_MIN, tmp_gpr1
+ // mov{d,q} tmp_gpr1, tmp_xmm1
+ // movap{s,d} tmp_xmm1, dst
+ // andnp{s,d} src_1, dst
+ // movap{s,d} src_2, tmp_xmm2
+ // andp{s,d} tmp_xmm1, tmp_xmm2
+ // orp{s,d} tmp_xmm2, dst
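+ // In effect this computes dst = (lhs & !SIGN_MASK) | (rhs & SIGN_MASK): the ANDN
+ // keeps the magnitude of the first operand while the AND/OR splice in the sign bit
+ // of the second.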
+
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32);
+
+ let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
+ types::F32 => (
+ 0x8000_0000,
+ SseOpcode::Movaps,
+ SseOpcode::Andnps,
+ SseOpcode::Andps,
+ SseOpcode::Orps,
+ ),
+ types::F64 => (
+ 0x8000_0000_0000_0000,
+ SseOpcode::Movapd,
+ SseOpcode::Andnpd,
+ SseOpcode::Andpd,
+ SseOpcode::Orpd,
+ ),
+ _ => {
+ panic!("unexpected type {:?} for copysign", ty);
+ }
+ };
+
+ for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| {
+ ctx.alloc_tmp(reg_class, ty)
+ }) {
+ ctx.emit(inst);
+ }
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2));
+ ctx.emit(Inst::xmm_rm_r(
+ and_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ tmp_xmm2,
+ ));
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
+ }
+
+ Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
+ // TODO use ROUNDSS/ROUNDSD after sse4.1.
+
+ // Lower to VM calls when there's no access to SSE4.1.
+ let ty = ty.unwrap();
+ let libcall = match (ty, op) {
+ (types::F32, Opcode::Ceil) => LibCall::CeilF32,
+ (types::F64, Opcode::Ceil) => LibCall::CeilF64,
+ (types::F32, Opcode::Floor) => LibCall::FloorF32,
+ (types::F64, Opcode::Floor) => LibCall::FloorF64,
+ (types::F32, Opcode::Nearest) => LibCall::NearestF32,
+ (types::F64, Opcode::Nearest) => LibCall::NearestF64,
+ (types::F32, Opcode::Trunc) => LibCall::TruncF32,
+ (types::F64, Opcode::Trunc) => LibCall::TruncF64,
+ _ => panic!(
+ "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
+ ty, op
+ ),
+ };
+
+ emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
+ }
+
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32
+ | Opcode::LoadComplex
+ | Opcode::Uload8Complex
+ | Opcode::Sload8Complex
+ | Opcode::Uload16Complex
+ | Opcode::Sload16Complex
+ | Opcode::Uload32Complex
+ | Opcode::Sload32Complex => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+
+ let elem_ty = match op {
+ Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
+ types::I8
+ }
+ Opcode::Sload16
+ | Opcode::Uload16
+ | Opcode::Sload16Complex
+ | Opcode::Uload16Complex => types::I16,
+ Opcode::Sload32
+ | Opcode::Uload32
+ | Opcode::Sload32Complex
+ | Opcode::Uload32Complex => types::I32,
+ Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
+ _ => unimplemented!(),
+ };
+
+ let ext_mode = ExtMode::new(elem_ty.bits(), 64);
+
+ let sign_extend = match op {
+ Opcode::Sload8
+ | Opcode::Sload8Complex
+ | Opcode::Sload16
+ | Opcode::Sload16Complex
+ | Opcode::Sload32
+ | Opcode::Sload32Complex => true,
+ _ => false,
+ };
+
+ let amode = match op {
+ Opcode::Load
+ | Opcode::Uload8
+ | Opcode::Sload8
+ | Opcode::Uload16
+ | Opcode::Sload16
+ | Opcode::Uload32
+ | Opcode::Sload32 => {
+ assert_eq!(inputs.len(), 1, "only one input for load operands");
+ lower_to_amode(ctx, inputs[0], offset)
+ }
+
+ Opcode::LoadComplex
+ | Opcode::Uload8Complex
+ | Opcode::Sload8Complex
+ | Opcode::Uload16Complex
+ | Opcode::Sload16Complex
+ | Opcode::Uload32Complex
+ | Opcode::Sload32Complex => {
+ assert_eq!(
+ inputs.len(),
+ 2,
+ "can't handle more than two inputs in complex load"
+ );
+ let base = put_input_in_reg(ctx, inputs[0]);
+ let index = put_input_in_reg(ctx, inputs[1]);
+ let shift = 0;
+ let flags = ctx.memflags(insn).expect("load should have memflags");
+ Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
+ }
+
+ _ => unreachable!(),
+ };
+
+ let dst = get_output_reg(ctx, outputs[0]);
+ let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
+ match (sign_extend, is_xmm) {
+ (true, false) => {
+ // The load is sign-extended only when the output size is lower than 64 bits,
+ // so ext-mode is defined in this case.
+ ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
+ }
+ (false, false) => {
+ if elem_ty.bytes() == 8 {
+ // Use a plain load.
+ ctx.emit(Inst::mov64_m_r(amode, dst))
+ } else {
+ // Use a zero-extended load.
+ ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
+ }
+ }
+ (_, true) => {
+ ctx.emit(match elem_ty {
+ types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
+ types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
+ _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
+ Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
+ } // TODO Specialize for different types: MOVUPD, MOVDQU
+ _ => unreachable!("unexpected type for load: {:?}", elem_ty),
+ });
+ }
+ }
+ }
+
+ Opcode::Store
+ | Opcode::Istore8
+ | Opcode::Istore16
+ | Opcode::Istore32
+ | Opcode::StoreComplex
+ | Opcode::Istore8Complex
+ | Opcode::Istore16Complex
+ | Opcode::Istore32Complex => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+
+ let elem_ty = match op {
+ Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
+ Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
+ Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
+ Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
+ _ => unreachable!(),
+ };
+
+ let addr = match op {
+ Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
+ assert_eq!(inputs.len(), 2, "only one input for store memory operands");
+ lower_to_amode(ctx, inputs[1], offset)
+ }
+
+ Opcode::StoreComplex
+ | Opcode::Istore8Complex
+ | Opcode::Istore16Complex
+ | Opcode::Istore32Complex => {
+ assert_eq!(
+ inputs.len(),
+ 3,
+ "can't handle more than two inputs in complex store"
+ );
+ let base = put_input_in_reg(ctx, inputs[1]);
+ let index = put_input_in_reg(ctx, inputs[2]);
+ let shift = 0;
+ let flags = ctx.memflags(insn).expect("store should have memflags");
+ Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
+ }
+
+ _ => unreachable!(),
+ };
+
+ let src = put_input_in_reg(ctx, inputs[0]);
+
+ ctx.emit(match elem_ty {
+ types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr),
+ types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr),
+ _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
+ // TODO Specialize for different types: MOVUPD, MOVDQU, etc.
+ Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr)
+ }
+ _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr),
+ });
+ }
+
+ Opcode::AtomicRmw => {
+ // This is a simple, general-case atomic update, based on a loop involving
+ // `cmpxchg`. Note that we could do much better than this in the case where the old
+ // value at the location (that is to say, the SSA `Value` computed by this CLIF
+ // instruction) is not required. In that case, we could instead implement this
+ // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
+ // the case where the old value is required, for the `add` and `sub` cases, we can
+ // use the single instruction `lock xadd`. However, those improvements have been
+ // left for another day.
+ // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
+ let dst = get_output_reg(ctx, outputs[0]);
+ let mut addr = put_input_in_reg(ctx, inputs[0]);
+ let mut arg2 = put_input_in_reg(ctx, inputs[1]);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ // Make sure that both args are in virtual regs, since in effect we have to do a
+ // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
+ // guaranteed safe if either is in a real reg.
+ addr = ctx.ensure_in_vreg(addr, types::I64);
+ arg2 = ctx.ensure_in_vreg(arg2, types::I64);
+
+ // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
+ // operates at whatever width is specified by `ty`, so there's no need to
+ // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::r9()),
+ addr,
+ types::I64,
+ ));
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::r10()),
+ arg2,
+ types::I64,
+ ));
+
+ // Now the AtomicRmwSeq (pseudo-) instruction itself
+ let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
+ ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op });
+
+ // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+ }
+
+ Opcode::AtomicCas => {
+ // This is very similar to, but not identical to, the `AtomicRmw` case. As with
+ // `AtomicRmw`, there's no need to zero-extend narrow values here.
+ let dst = get_output_reg(ctx, outputs[0]);
+ let addr = lower_to_amode(ctx, inputs[0], 0);
+ let expected = put_input_in_reg(ctx, inputs[1]);
+ let replacement = put_input_in_reg(ctx, inputs[2]);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ // Move the expected value into %rax. Because there's only one fixed register on
+ // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
+ // `AtomicRmw` case.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ expected,
+ types::I64,
+ ));
+ ctx.emit(Inst::LockCmpxchg {
+ ty: ty_access,
+ src: replacement,
+ dst: addr.into(),
+ });
+ // And finally, copy the old value at the location to its destination reg.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
+ }
+
+ Opcode::AtomicLoad => {
+ // This is a normal load. The x86-TSO memory model provides sufficient sequencing
+ // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
+ // need for any fence instructions.
+ let data = get_output_reg(ctx, outputs[0]);
+ let addr = lower_to_amode(ctx, inputs[0], 0);
+ let ty_access = ty.unwrap();
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ let rm = RegMem::mem(addr);
+ if ty_access == types::I64 {
+ ctx.emit(Inst::mov64_rm_r(rm, data));
+ } else {
+ let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!(
+ "invalid extension during AtomicLoad: {} -> {}",
+ ty_access.bits(),
+ 64
+ ));
+ ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data));
+ }
+ }
+
+ Opcode::AtomicStore => {
+ // This is a normal store, followed by an `mfence` instruction.
+ let data = put_input_in_reg(ctx, inputs[0]);
+ let addr = lower_to_amode(ctx, inputs[1], 0);
+ let ty_access = ctx.input_ty(insn, 0);
+ assert!(is_valid_atomic_transaction_ty(ty_access));
+
+ ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr));
+ ctx.emit(Inst::Fence {
+ kind: FenceKind::MFence,
+ });
+ }
+
+ Opcode::Fence => {
+ ctx.emit(Inst::Fence {
+ kind: FenceKind::MFence,
+ });
+ }
+
+ Opcode::FuncAddr => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let (extname, _) = ctx.call_target(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ dst,
+ name: Box::new(extname),
+ offset: 0,
+ });
+ }
+
+ Opcode::SymbolValue => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
+ let extname = extname.clone();
+ ctx.emit(Inst::LoadExtName {
+ dst,
+ name: Box::new(extname),
+ offset,
+ });
+ }
+
+ Opcode::StackAddr => {
+ let (stack_slot, offset) = match *ctx.data(insn) {
+ InstructionData::StackLoad {
+ opcode: Opcode::StackAddr,
+ stack_slot,
+ offset,
+ } => (stack_slot, offset),
+ _ => unreachable!(),
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+ let offset: i32 = offset.into();
+ let inst = ctx
+ .abi()
+ .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
+ ctx.emit(inst);
+ }
+
+ Opcode::Select => {
+ let flag_input = inputs[0];
+ if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
+ let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
+
+ // For equal, we flip the operands, because we can't test a conjunction of
+ // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment.
+ let (lhs_input, rhs_input) = match cond_code {
+ FloatCC::Equal => (inputs[2], inputs[1]),
+ _ => (inputs[1], inputs[2]),
+ };
+
+ let ty = ctx.output_ty(insn, 0);
+ let rhs = put_input_in_reg(ctx, rhs_input);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 {
+ // Special case: since the higher bits are undefined per CLIF semantics, we
+ // can just apply a 32-bit cmove here. Force inputs into registers, to
+ // avoid partial spilling out-of-bounds with memory accesses, though.
+ // Sign-extend operands to 32, then do a cmove of size 4.
+ RegMem::reg(put_input_in_reg(ctx, lhs_input))
+ } else {
+ input_to_reg_mem(ctx, lhs_input)
+ };
+
+ // We request inversion of Equal to NotEqual here: taking LHS if equal would mean
+ // take it if both CC::NP and CC::Z are set, the conjunction of which can't be
+ // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
+ // select operation, and invert the equal to a not-equal here.
+ let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
+
+ if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
+ // Keep this sync'd with the lowering of the select inputs above.
+ assert_eq!(cond_code, FloatCC::Equal);
+ }
+
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+
+ match fcmp_results {
+ FcmpCondResult::Condition(cc) => {
+ if is_int_or_ref_ty(ty) {
+ let size = u8::max(ty.bytes() as u8, 4);
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ } else {
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+ FcmpCondResult::AndConditions(_, _) => {
+ unreachable!(
+ "can't AND with select; see above comment about inverting equal"
+ );
+ }
+ FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
+ | FcmpCondResult::OrConditions(cc1, cc2) => {
+ if is_int_or_ref_ty(ty) {
+ let size = u8::max(ty.bytes() as u8, 4);
+ ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
+ ctx.emit(Inst::cmove(size, cc2, lhs, dst));
+ } else {
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst));
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst));
+ }
+ }
+ }
+ } else {
+ let ty = ty.unwrap();
+
+ let mut size = ty.bytes() as u8;
+ let lhs = if is_int_or_ref_ty(ty) {
+ if size < 4 {
+ // Special case: since the higher bits are undefined per CLIF semantics, we
+ // can just apply a 32-bit cmove here. Force inputs into registers, to
+ // avoid partial spilling out-of-bounds with memory accesses, though.
+ size = 4;
+ RegMem::reg(put_input_in_reg(ctx, inputs[1]))
+ } else {
+ input_to_reg_mem(ctx, inputs[1])
+ }
+ } else {
+ input_to_reg_mem(ctx, inputs[1])
+ };
+
+ let rhs = put_input_in_reg(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
+ emit_cmp(ctx, icmp);
+ let cond_code = ctx.data(icmp).cond_code().unwrap();
+ CC::from_intcc(cond_code)
+ } else {
+ // The input is a boolean value; compare it against zero.
+ let size = ctx.input_ty(insn, 0).bytes() as u8;
+ let test = put_input_in_reg(ctx, flag_input);
+ ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test));
+ CC::NZ
+ };
+
+ // This doesn't affect the flags.
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+
+ if is_int_or_ref_ty(ty) {
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ } else {
+ debug_assert!(ty == types::F32 || ty == types::F64);
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+ }
+
+ Opcode::Selectif | Opcode::SelectifSpectreGuard => {
+ let lhs = input_to_reg_mem(ctx, inputs[1]);
+ let rhs = put_input_in_reg(ctx, inputs[2]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ctx.output_ty(insn, 0);
+
+ // Verification ensures that the input is always a single-def ifcmp.
+ let cmp_insn = ctx
+ .get_input(inputs[0].insn, inputs[0].input)
+ .inst
+ .unwrap()
+ .0;
+ debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
+ emit_cmp(ctx, cmp_insn);
+
+ let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap());
+
+ if is_int_or_ref_ty(ty) {
+ let size = ty.bytes() as u8;
+ if size == 1 {
+ // Sign-extend operands to 32, then do a cmove of size 4.
+ let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
+ ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se));
+ ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst));
+ ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
+ } else {
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::cmove(size, cc, lhs, dst));
+ }
+ } else {
+ debug_assert!(ty == types::F32 || ty == types::F64);
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
+ }
+ }
+
+ Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
+ let kind = match op {
+ Opcode::Udiv => DivOrRemKind::UnsignedDiv,
+ Opcode::Sdiv => DivOrRemKind::SignedDiv,
+ Opcode::Urem => DivOrRemKind::UnsignedRem,
+ Opcode::Srem => DivOrRemKind::SignedRem,
+ _ => unreachable!(),
+ };
+ let is_div = kind.is_div();
+
+ let input_ty = ctx.input_ty(insn, 0);
+ let size = input_ty.bytes() as u8;
+
+ let dividend = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ dividend,
+ input_ty,
+ ));
+
+ if flags.avoid_div_traps() {
+ // A vcode meta-instruction is used to lower the inline checks, since they embed
+ // pc-relative offsets that must not change, thus requiring regalloc to not
+ // interfere by introducing spills and reloads.
+ //
+ // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
+ // regalloc is aware of the coalescing opportunity between rax/rdx and the
+ // destination register.
+ let divisor = put_input_in_reg(ctx, inputs[1]);
+
+ let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
+ ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));
+
+ let tmp = if op == Opcode::Sdiv && size == 8 {
+ Some(ctx.alloc_tmp(RegClass::I64, types::I64))
+ } else {
+ None
+ };
+ // TODO use xor
+ ctx.emit(Inst::imm(
+ OperandSize::Size32,
+ 0,
+ Writable::from_reg(regs::rdx()),
+ ));
+ ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
+ } else {
+ let divisor = input_to_reg_mem(ctx, inputs[1]);
+
+ // Fill in the high parts:
+ if kind.is_signed() {
+ // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
+ // signed opcodes.
+ ctx.emit(Inst::sign_extend_data(size));
+ } else if input_ty == types::I8 {
+ ctx.emit(Inst::movzx_rm_r(
+ ExtMode::BL,
+ RegMem::reg(regs::rax()),
+ Writable::from_reg(regs::rax()),
+ ));
+ } else {
+ // zero for unsigned opcodes.
+ ctx.emit(Inst::imm(
+ OperandSize::Size64,
+ 0,
+ Writable::from_reg(regs::rdx()),
+ ));
+ }
+
+ // Emit the actual idiv.
+ ctx.emit(Inst::div(size, kind.is_signed(), divisor));
+ }
+
+ // Move the result back into the destination reg.
+ if is_div {
+ // The quotient is in rax.
+ ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
+ } else {
+ // The remainder is in rdx.
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
+ }
+ }
+
+ Opcode::Umulhi | Opcode::Smulhi => {
+ let input_ty = ctx.input_ty(insn, 0);
+ let size = input_ty.bytes() as u8;
+
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = input_to_reg_mem(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+
+ // Move lhs in %rax.
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::rax()),
+ lhs,
+ input_ty,
+ ));
+
+ // Emit the actual mul or imul.
+ let signed = op == Opcode::Smulhi;
+ ctx.emit(Inst::mul_hi(size, signed, rhs));
+
+ // Read the result from the high part (stored in %rdx).
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
+ }
+
+ Opcode::GetPinnedReg => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
+ }
+
+ Opcode::SetPinnedReg => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ ctx.emit(Inst::gen_move(
+ Writable::from_reg(regs::pinned_reg()),
+ src,
+ types::I64,
+ ));
+ }
+
+ Opcode::Vconst => {
+ let used_constant = if let &InstructionData::UnaryConst {
+ constant_handle, ..
+ } = ctx.data(insn)
+ {
+ ctx.use_constant(VCodeConstantData::Pool(
+ constant_handle,
+ ctx.get_constant_data(constant_handle).clone(),
+ ))
+ } else {
+ unreachable!("vconst should always have unary_const format")
+ };
+ // TODO use Inst::gen_constant() instead.
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::xmm_load_const(used_constant, dst, ty));
+ }
+
+ Opcode::RawBitcast => {
+ // A raw_bitcast is just a mechanism for correcting the type of V128 values (see
+ // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
+ // instruction should emit no machine code but a move is necessary to give the register
+ // allocator a definition for the output virtual register.
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ }
+
+ Opcode::Shuffle => {
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let lhs_ty = ctx.input_ty(insn, 0);
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let mask = match ctx.get_immediate(insn) {
+ Some(DataValue::V128(bytes)) => bytes.to_vec(),
+ _ => unreachable!("shuffle should always have a 16-byte immediate"),
+ };
+
+ // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
+ // 1 in the most significant position zeroes the lane.
+ let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
+
+ ctx.emit(Inst::gen_move(dst, rhs, ty));
+ if rhs == lhs {
+ // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
+ // register. We statically build `constructed_mask` to zero out any unknown lane
+ // indices (this may not be completely necessary: verification could reject incorrect
+ // mask values) and fix the indexes to all point to the `dst` vector.
+ let constructed_mask = mask
+ .iter()
+ // If the mask is greater than 15 it still may be referring to a lane in b.
+ .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+ .map(zero_unknown_lane_index)
+ .collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
+ // After loading the constructed mask in a temporary register, we use this to
+ // shuffle the `dst` register (remember that, in this case, it is the same as
+ // `src` so we disregard this register).
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
+ } else {
+ // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
+ // them together. This is necessary due to PSHUFB semantics. As in the case above,
+ // we build the `constructed_mask` for each case statically.
+
+ // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
+ let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty);
+ ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
+ let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
+
+ // PSHUFB the second argument, placing zeroes for unused lanes.
+ let constructed_mask = mask
+ .iter()
+ .map(|b| b.wrapping_sub(16))
+ .map(zero_unknown_lane_index)
+ .collect();
+ let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
+ let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
+
+ // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
+ // is not important).
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
+
+ // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
+ }
+ }
+
+ Opcode::Swizzle => {
+            // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
+            // requiring mask indexes greater than 15 to produce a zero in the corresponding
+            // output lane. For the spec discussion, see
+            // https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the Wasm
+            // SIMD semantics for this instruction.
+            // The instruction format maps to variables like: %dst = swizzle %src, %mask
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
+
+ // Inform the register allocator that `src` and `dst` should be in the same register.
+ ctx.emit(Inst::gen_move(dst, src, ty));
+
+ // Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
+ let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16);
+ static ZERO_MASK_VALUE: [u8; 16] = [
+ 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+ 0x70, 0x70,
+ ];
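+            // PADDUSB (unsigned saturating add) of 0x70 to each mask byte maps valid indices
+            // 0..=15 into 0x70..=0x7F, leaving the top bit clear and the low nibble intact so
+            // PSHUFB still selects the intended lane, while any out-of-range index (>= 16) ends
+            // up with the top bit set (saturating at 0xFF), which PSHUFB treats as "zero this lane".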
+ let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
+ ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
+
+ // Use the `zero_mask` on a writable `swizzle_mask`.
+ let swizzle_mask = Writable::from_reg(swizzle_mask);
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Paddusb,
+ RegMem::from(zero_mask),
+ swizzle_mask,
+ ));
+
+ // Shuffle `dst` using the fixed-up `swizzle_mask`.
+ ctx.emit(Inst::xmm_rm_r(
+ SseOpcode::Pshufb,
+ RegMem::from(swizzle_mask),
+ dst,
+ ));
+ }
+
+ Opcode::Insertlane => {
+ // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let in_vec = put_input_in_reg(ctx, inputs[0]);
+ let src_ty = ctx.input_ty(insn, 1);
+ debug_assert!(!src_ty.is_vector());
+ let src = input_to_reg_mem(ctx, inputs[1]);
+ let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ debug_assert!(lane < ty.lane_count() as u8);
+
+ ctx.emit(Inst::gen_move(dst, in_vec, ty));
+ emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
+ }
+
+ Opcode::Extractlane => {
+ // The instruction format maps to variables like: %dst = extractlane %src, %lane
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
+ *imm
+ } else {
+ unreachable!();
+ };
+ debug_assert!(lane < src_ty.lane_count() as u8);
+
+ if !ty.is_float() {
+ let (sse_op, w_bit) = match ty.lane_bits() {
+ 8 => (SseOpcode::Pextrb, false),
+ 16 => (SseOpcode::Pextrw, false),
+ 32 => (SseOpcode::Pextrd, false),
+ 64 => (SseOpcode::Pextrd, true),
+ _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
+ };
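+                // Setting `w_bit` sets REX.W on the encoding, turning the 32-bit PEXTRD into its
+                // 64-bit form, PEXTRQ.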
+ let src = RegMem::reg(src);
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
+ } else {
+ if lane == 0 {
+                    // Lane 0 needs no shuffle: the float is already in the low lane, so a plain
+                    // move suffices. The upper bits will remain unchanged; for correctness, this
+                    // relies on Cranelift type checking to avoid using those bits.
+ ctx.emit(Inst::gen_move(dst, src, ty));
+ } else {
+ // Otherwise, shuffle the bits in `lane` to the lowest lane.
+ let sse_op = SseOpcode::Pshufd;
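+                    // PSHUFD's immediate holds four 2-bit fields; field i selects which 32-bit
+                    // source lane is copied into destination lane i.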
+ let mask = match src_ty {
+ // Move the value at `lane` to lane 0, copying existing value at lane 0 to
+ // other lanes. Again, this relies on Cranelift type checking to avoid
+ // using those bits.
+ types::F32X4 => 0b00_00_00_00 | lane,
+                        // Move the value at `lane` (which must be 1, given the `if` statement
+                        // above) to lane 0 and leave lane 1 unchanged. The Cranelift type checking
+                        // assumption also applies here.
+ types::F64X2 => 0b11_10_11_10,
+ _ => unreachable!(),
+ };
+ let src = RegMem::reg(src);
+ ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+ }
+ }
+ }
+
+ Opcode::Splat | Opcode::LoadSplat => {
+ let ty = ty.unwrap();
+ assert_eq!(ty.bits(), 128);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert!(src_ty.bits() < 128);
+
+ let src = match op {
+ Opcode::Splat => input_to_reg_mem(ctx, inputs[0]),
+ Opcode::LoadSplat => {
+ let offset = ctx.data(insn).load_store_offset().unwrap();
+ let amode = lower_to_amode(ctx, inputs[0], offset);
+ RegMem::mem(amode)
+ }
+ _ => unreachable!(),
+ };
+ let dst = get_output_reg(ctx, outputs[0]);
+
+            // Splat will overwrite every lane of `dst`, but it takes several instructions to do
+            // so; with a multi-instruction sequence there is no good way to declare `dst` a `def`
+            // except with the following pseudo-instruction.
+ ctx.emit(Inst::xmm_uninit_value(dst));
+
+ // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
+ // and VPBROADCAST*.
+ match ty.lane_bits() {
+ 8 => {
+ emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+ // Initialize a register with all 0s.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+ // Shuffle the lowest byte lane to all other lanes.
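+                    // (A PSHUFB mask of all zeroes selects byte 0 of the source for every
+                    // destination lane, broadcasting the inserted byte across the vector.)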
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
+ }
+ 16 => {
+ emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
+ emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+ // Shuffle the lowest two lanes to all other lanes.
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Pshufd,
+ RegMem::from(dst),
+ dst,
+ 0,
+ false,
+ ))
+ }
+ 32 => {
+ emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+ // Shuffle the lowest lane to all other lanes.
+ ctx.emit(Inst::xmm_rm_r_imm(
+ SseOpcode::Pshufd,
+ RegMem::from(dst),
+ dst,
+ 0,
+ false,
+ ))
+ }
+ 64 => {
+ emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
+ emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+ }
+ _ => panic!("Invalid type to splat: {}", ty),
+ }
+ }
+
+ Opcode::VanyTrue => {
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = put_input_in_reg(ctx, inputs[0]);
+ // Set the ZF if the result is all zeroes.
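+            // (PTEST ANDs its two operands and sets ZF when the result is zero; with the same
+            // register for both operands, ZF is set exactly when the whole vector is zero.)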
+ ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
+ // If the ZF is not set, place a 1 in `dst`.
+ ctx.emit(Inst::setcc(CC::NZ, dst));
+ }
+
+ Opcode::VallTrue => {
+ let ty = ty.unwrap();
+ let dst = get_output_reg(ctx, outputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ assert_eq!(src_ty.bits(), 128);
+ let src = input_to_reg_mem(ctx, inputs[0]);
+
+ let eq = |ty: Type| match ty.lane_bits() {
+ 8 => SseOpcode::Pcmpeqb,
+ 16 => SseOpcode::Pcmpeqw,
+ 32 => SseOpcode::Pcmpeqd,
+ 64 => SseOpcode::Pcmpeqq,
+ _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
+ };
+
+ // Initialize a register with all 0s.
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty);
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+            // Compare `src` against the zeroed register: lanes of `tmp` become all 1s exactly
+            // where the corresponding lane of `src` is zero.
+ ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
+ // Set the ZF if the result is all zeroes.
+ ctx.emit(Inst::xmm_cmp_rm_r(
+ SseOpcode::Ptest,
+ RegMem::from(tmp),
+ tmp.to_reg(),
+ ));
+ // If the ZF is set, place a 1 in `dst`.
+ ctx.emit(Inst::setcc(CC::Z, dst));
+ }
+
+ Opcode::VhighBits => {
+ let src = put_input_in_reg(ctx, inputs[0]);
+ let src_ty = ctx.input_ty(insn, 0);
+ debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
+ let dst = get_output_reg(ctx, outputs[0]);
+ debug_assert!(dst.to_reg().get_class() == RegClass::I64);
+
+ // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
+ // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
+ // the instruction can access additional registers when used with a REX.R prefix. The
+ // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
+ // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
+ // unnecessary (`OperandSize` is used for setting/clearing REX.W).
+ let size = OperandSize::Size32;
+
+ match src_ty {
+ types::I8X16 | types::B8X16 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
+ }
+ types::I32X4 | types::B32X4 | types::F32X4 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
+ }
+ types::I64X2 | types::B64X2 | types::F64X2 => {
+ ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
+ }
+ types::I16X8 | types::B16X8 => {
+ // There is no x86 instruction for extracting the high bit of 16-bit lanes so
+ // here we:
+ // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
+ // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+ // - use PMOVMSKB to gather the high bits; now we have duplicates, though
+ // - shift away the bottom 8 high bits to remove the duplicates.
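+                    // Signed saturation preserves each word's sign bit, so PMOVMSKB on the
+                    // packed result yields the 8 desired bits twice; the shift below discards
+                    // the redundant copy.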
+ let tmp = ctx.alloc_tmp(RegClass::V128, src_ty);
+ ctx.emit(Inst::gen_move(tmp, src, src_ty));
+ ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
+ ctx.emit(Inst::xmm_to_gpr(
+ SseOpcode::Pmovmskb,
+ tmp.to_reg(),
+ dst,
+ size,
+ ));
+ ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst));
+ }
+ _ => unimplemented!("unknown input type {} for {}", src_ty, op),
+ }
+ }
+
+ Opcode::IaddImm
+ | Opcode::ImulImm
+ | Opcode::UdivImm
+ | Opcode::SdivImm
+ | Opcode::UremImm
+ | Opcode::SremImm
+ | Opcode::IrsubImm
+ | Opcode::IaddCin
+ | Opcode::IaddIfcin
+ | Opcode::IaddCout
+ | Opcode::IaddCarry
+ | Opcode::IaddIfcarry
+ | Opcode::IsubBin
+ | Opcode::IsubIfbin
+ | Opcode::IsubBout
+ | Opcode::IsubIfbout
+ | Opcode::IsubBorrow
+ | Opcode::IsubIfborrow
+ | Opcode::BandImm
+ | Opcode::BorImm
+ | Opcode::BxorImm
+ | Opcode::RotlImm
+ | Opcode::RotrImm
+ | Opcode::IshlImm
+ | Opcode::UshrImm
+ | Opcode::SshrImm => {
+ panic!("ALU+imm and ALU+carry ops should not appear here!");
+ }
+ _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
+ }
+
+ Ok(())
+}
+
+//=============================================================================
+// Lowering-backend trait implementation.
+
+impl LowerBackend for X64Backend {
+ type MInst = Inst;
+
+ fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
+ lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple)
+ }
+
+ fn lower_branch_group<C: LowerCtx<I = Inst>>(
+ &self,
+ ctx: &mut C,
+ branches: &[IRInst],
+ targets: &[MachLabel],
+ fallthrough: Option<MachLabel>,
+ ) -> CodegenResult<()> {
+ // A block should end with at most two branches. The first may be a
+ // conditional branch; a conditional branch can be followed only by an
+ // unconditional branch or fallthrough. Otherwise, if only one branch,
+ // it may be an unconditional branch, a fallthrough, a return, or a
+ // trap. These conditions are verified by `is_ebb_basic()` during the
+ // verifier pass.
+ assert!(branches.len() <= 2);
+
+ if branches.len() == 2 {
+ // Must be a conditional branch followed by an unconditional branch.
+ let op0 = ctx.data(branches[0]).opcode();
+ let op1 = ctx.data(branches[1]).opcode();
+
+ trace!(
+ "lowering two-branch group: opcodes are {:?} and {:?}",
+ op0,
+ op1
+ );
+ assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
+
+ let taken = targets[0];
+ let not_taken = match op1 {
+ Opcode::Jump => targets[1],
+ Opcode::Fallthrough => fallthrough.unwrap(),
+ _ => unreachable!(), // assert above.
+ };
+
+ match op0 {
+ Opcode::Brz | Opcode::Brnz => {
+ let flag_input = InsnInput {
+ insn: branches[0],
+ input: 0,
+ };
+
+ let src_ty = ctx.input_ty(branches[0], 0);
+
+ if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
+ emit_cmp(ctx, icmp);
+
+ let cond_code = ctx.data(icmp).cond_code().unwrap();
+ let cond_code = if op0 == Opcode::Brz {
+ cond_code.inverse()
+ } else {
+ cond_code
+ };
+
+ let cc = CC::from_intcc(cond_code);
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
+ let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
+ let cond_code = if op0 == Opcode::Brz {
+ cond_code.inverse()
+ } else {
+ cond_code
+ };
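+                            // `emit_fcmp` may decompose the comparison into two condition codes:
+                            // `AndConditions` means the branch is taken only when both hold, so
+                            // jump to `not_taken` as soon as either inverted condition holds;
+                            // `OrConditions` means it is taken when either one holds.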
+ match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
+ FcmpCondResult::Condition(cc) => {
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ }
+ FcmpCondResult::AndConditions(cc1, cc2) => {
+ ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
+ ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
+ }
+ FcmpCondResult::OrConditions(cc1, cc2) => {
+ ctx.emit(Inst::jmp_if(cc1, taken));
+ ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
+ }
+ FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+ }
+ } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
+ let src = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ );
+ let cc = match op0 {
+ Opcode::Brz => CC::Z,
+ Opcode::Brnz => CC::NZ,
+ _ => unreachable!(),
+ };
+ let size_bytes = src_ty.bytes() as u8;
+ ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src));
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else {
+ unimplemented!("brz/brnz with non-int type {:?}", src_ty);
+ }
+ }
+
+ Opcode::BrIcmp => {
+ let src_ty = ctx.input_ty(branches[0], 0);
+ if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
+ let lhs = put_input_in_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ );
+ let rhs = input_to_reg_mem_imm(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 1,
+ },
+ );
+ let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap());
+ let byte_size = src_ty.bytes() as u8;
+ // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
+ // us dst - src at the machine instruction level, so invert operands.
+ ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs));
+ ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
+ } else {
+ unimplemented!("bricmp with non-int type {:?}", src_ty);
+ }
+ }
+
+ _ => panic!("unexpected branch opcode: {:?}", op0),
+ }
+ } else {
+ assert_eq!(branches.len(), 1);
+
+ // Must be an unconditional branch or trap.
+ let op = ctx.data(branches[0]).opcode();
+ match op {
+ Opcode::Jump | Opcode::Fallthrough => {
+ ctx.emit(Inst::jmp_known(targets[0]));
+ }
+
+ Opcode::BrTable => {
+ let jt_size = targets.len() - 1;
+ assert!(jt_size <= u32::max_value() as usize);
+ let jt_size = jt_size as u32;
+
+ let idx = extend_input_to_reg(
+ ctx,
+ InsnInput {
+ insn: branches[0],
+ input: 0,
+ },
+ ExtSpec::ZeroExtendTo32,
+ );
+
+ // Bounds-check (compute flags from idx - jt_size) and branch to default.
+ ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx));
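+                    // The flags computed here are consumed by the `JmpTableSeq` pseudo-instruction
+                    // below, which branches to `default_target` when the index is out of range.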
+
+ // Emit the compound instruction that does:
+ //
+ // lea $jt, %rA
+ // movsbl [%rA, %rIndex, 2], %rB
+ // add %rB, %rA
+ // j *%rA
+ // [jt entries]
+ //
+ // This must be *one* instruction in the vcode because we cannot allow regalloc
+ // to insert any spills/fills in the middle of the sequence; otherwise, the
+ // lea PC-rel offset to the jumptable would be incorrect. (The alternative
+ // is to introduce a relocation pass for inlined jumptables, which is much
+ // worse.)
+
+ // This temporary is used as a signed integer of 64-bits (to hold addresses).
+ let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
+ // This temporary is used as a signed integer of 32-bits (for the wasm-table
+ // index) and then 64-bits (address addend). The small lie about the I64 type
+ // is benign, since the temporary is dead after this instruction (and its
+ // Cranelift type is thus unused).
+ let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
+
+ let targets_for_term: Vec<MachLabel> = targets.to_vec();
+ let default_target = targets[0];
+
+ let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect();
+
+ ctx.emit(Inst::JmpTableSeq {
+ idx,
+ tmp1,
+ tmp2,
+ default_target,
+ targets: jt_targets,
+ targets_for_term,
+ });
+ }
+
+ _ => panic!("Unknown branch type {:?}", op),
+ }
+ }
+
+ Ok(())
+ }
+
+ fn maybe_pinned_reg(&self) -> Option<Reg> {
+ Some(regs::pinned_reg())
+ }
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs
new file mode 100644
index 0000000000..fd4444498d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs
@@ -0,0 +1,149 @@
+//! X86_64 Instruction Set Architecture.
+
+use self::inst::EmitInfo;
+
+use super::TargetIsa;
+use crate::ir::{condcodes::IntCC, Function};
+use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings};
+use crate::isa::Builder as IsaBuilder;
+use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
+use crate::result::CodegenResult;
+use crate::settings::{self as shared_settings, Flags};
+use alloc::boxed::Box;
+use regalloc::{PrettyPrint, RealRegUniverse};
+use target_lexicon::Triple;
+
+mod abi;
+mod inst;
+mod lower;
+mod settings;
+
+/// An X64 backend.
+pub(crate) struct X64Backend {
+ triple: Triple,
+ flags: Flags,
+ x64_flags: x64_settings::Flags,
+ reg_universe: RealRegUniverse,
+}
+
+impl X64Backend {
+ /// Create a new X64 backend with the given (shared) flags.
+ fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self {
+ let reg_universe = create_reg_universe_systemv(&flags);
+ Self {
+ triple,
+ flags,
+ x64_flags,
+ reg_universe,
+ }
+ }
+
+ fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> {
+ // This performs lowering to VCode, register-allocates the code, computes
+ // block layout and finalizes branches. The result is ready for binary emission.
+ let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone());
+ let abi = Box::new(abi::X64ABICallee::new(&func, flags)?);
+ compile::compile::<Self>(&func, self, abi, emit_info)
+ }
+}
+
+impl MachBackend for X64Backend {
+ fn compile_function(
+ &self,
+ func: &Function,
+ want_disasm: bool,
+ ) -> CodegenResult<MachCompileResult> {
+ let flags = self.flags();
+ let vcode = self.compile_vcode(func, flags.clone())?;
+
+ let buffer = vcode.emit();
+ let buffer = buffer.finish();
+ let frame_size = vcode.frame_size();
+ let unwind_info = vcode.unwind_info()?;
+
+ let disasm = if want_disasm {
+ Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags))))
+ } else {
+ None
+ };
+
+ Ok(MachCompileResult {
+ buffer,
+ frame_size,
+ disasm,
+ unwind_info,
+ })
+ }
+
+ fn flags(&self) -> &Flags {
+ &self.flags
+ }
+
+ fn name(&self) -> &'static str {
+ "x64"
+ }
+
+ fn triple(&self) -> Triple {
+ self.triple.clone()
+ }
+
+ fn reg_universe(&self) -> &RealRegUniverse {
+ &self.reg_universe
+ }
+
+ fn unsigned_add_overflow_condition(&self) -> IntCC {
+ // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
+ // overflow of an add.
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ fn unsigned_sub_overflow_condition(&self) -> IntCC {
+        // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on
+ // underflow of a subtract (carry is borrow for subtract).
+ IntCC::UnsignedGreaterThanOrEqual
+ }
+
+ #[cfg(feature = "unwind")]
+ fn emit_unwind_info(
+ &self,
+ result: &MachCompileResult,
+ kind: crate::machinst::UnwindInfoKind,
+ ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+ use crate::isa::unwind::UnwindInfo;
+ use crate::machinst::UnwindInfoKind;
+ Ok(match (result.unwind_info.as_ref(), kind) {
+ (Some(info), UnwindInfoKind::SystemV) => {
+ inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV)
+ }
+ (Some(_info), UnwindInfoKind::Windows) => {
+ //TODO inst::unwind::winx64::create_unwind_info(info.clone())?.map(|u| UnwindInfo::WindowsX64(u))
+ None
+ }
+ _ => None,
+ })
+ }
+
+ #[cfg(feature = "unwind")]
+ fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+ Some(inst::unwind::systemv::create_cie())
+ }
+}
+
+/// Create a new `isa::Builder`.
+pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
+ IsaBuilder {
+ triple,
+ setup: x64_settings::builder(),
+ constructor: isa_constructor,
+ }
+}
+
+fn isa_constructor(
+ triple: Triple,
+ shared_flags: Flags,
+ builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+ let isa_flags = x64_settings::Flags::new(&shared_flags, builder);
+ let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
+ Box::new(TargetIsaAdapter::new(backend))
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs
new file mode 100644
index 0000000000..c5371bb132
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs
@@ -0,0 +1,9 @@
+//! x86 Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/x86/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs"));