author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit     2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree       b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen-meta/src/isa
parent     Initial commit. (diff)
Adding upstream version 86.0.1. (upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen-meta/src/isa')
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs            88
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs            79
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/mod.rs                  67
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs     431
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs           134
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs       279
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs      2726
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs    723
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs        829
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs              88
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs         721
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs        3445
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs        43
-rw-r--r--  third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs        135
14 files changed, 9788 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs
new file mode 100644
index 0000000000..f699ece8eb
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs
@@ -0,0 +1,88 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::recipes::Recipes;
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder};
+
+use crate::shared::Definitions as SharedDefinitions;
+
+fn define_settings(_shared: &SettingGroup) -> SettingGroup {
+ let setting = SettingGroupBuilder::new("arm32");
+ setting.build()
+}
+
+fn define_regs() -> IsaRegs {
+ let mut regs = IsaRegsBuilder::new();
+
+ let builder = RegBankBuilder::new("FloatRegs", "s")
+ .units(64)
+ .track_pressure(true);
+ let float_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("IntRegs", "r")
+ .units(16)
+ .track_pressure(true);
+ let int_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("FlagRegs", "")
+ .units(1)
+ .names(vec!["nzcv"])
+ .track_pressure(false);
+ let flag_reg = regs.add_bank(builder);
+
+ let builder = RegClassBuilder::new_toplevel("S", float_regs).count(32);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("D", float_regs).width(2);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("Q", float_regs).width(4);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
+ regs.add_class(builder);
+
+ regs.build()
+}
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+ let settings = define_settings(&shared_defs.settings);
+ let regs = define_regs();
+
+ let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+
+ // CPU modes for 32-bit ARM and Thumb2.
+ let mut a32 = CpuMode::new("A32");
+ let mut t32 = CpuMode::new("T32");
+
+ // TODO refine these.
+ let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags");
+ a32.legalize_default(narrow_flags);
+ t32.legalize_default(narrow_flags);
+
+ // Make sure the expand transform group is used so that its code gets generated.
+ let expand = shared_defs.transform_groups.by_name("expand");
+ a32.legalize_monomorphic(expand);
+
+ let cpu_modes = vec![a32, t32];
+
+ // TODO implement arm32 recipes.
+ let recipes = Recipes::new();
+
+ // TODO implement arm32 encodings and predicates.
+ let encodings_predicates = InstructionPredicateMap::new();
+
+ TargetIsa::new(
+ "arm32",
+ inst_group,
+ settings,
+ regs,
+ recipes,
+ cpu_modes,
+ encodings_predicates,
+ )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs
new file mode 100644
index 0000000000..5d8bc76fc4
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs
@@ -0,0 +1,79 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::recipes::Recipes;
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder};
+
+use crate::shared::Definitions as SharedDefinitions;
+
+fn define_settings(_shared: &SettingGroup) -> SettingGroup {
+ let setting = SettingGroupBuilder::new("arm64");
+ setting.build()
+}
+
+fn define_registers() -> IsaRegs {
+ let mut regs = IsaRegsBuilder::new();
+
+ // The `x31` regunit serves as the stack pointer / zero register depending on context. We
+ // reserve it and don't model the difference.
+ let builder = RegBankBuilder::new("IntRegs", "x")
+ .units(32)
+ .track_pressure(true);
+ let int_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("FloatRegs", "v")
+ .units(32)
+ .track_pressure(true);
+ let float_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("FlagRegs", "")
+ .units(1)
+ .names(vec!["nzcv"])
+ .track_pressure(false);
+ let flag_reg = regs.add_bank(builder);
+
+ let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FPR", float_regs);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
+ regs.add_class(builder);
+
+ regs.build()
+}
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+ let settings = define_settings(&shared_defs.settings);
+ let regs = define_registers();
+
+ let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+
+ let mut a64 = CpuMode::new("A64");
+
+ // TODO refine these.
+ let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
+ let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags");
+ a64.legalize_monomorphic(expand_flags);
+ a64.legalize_default(narrow_flags);
+
+ let cpu_modes = vec![a64];
+
+ // TODO implement arm64 recipes.
+ let recipes = Recipes::new();
+
+ // TODO implement arm64 encodings and predicates.
+ let encodings_predicates = InstructionPredicateMap::new();
+
+ TargetIsa::new(
+ "arm64",
+ inst_group,
+ settings,
+ regs,
+ recipes,
+ cpu_modes,
+ encodings_predicates,
+ )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs
new file mode 100644
index 0000000000..ed8db85f0d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs
@@ -0,0 +1,67 @@
+//! Define supported ISAs; includes ISA-specific instructions, encodings, registers, settings, etc.
+use crate::cdsl::isa::TargetIsa;
+use crate::shared::Definitions as SharedDefinitions;
+use std::fmt;
+
+mod arm32;
+mod arm64;
+mod riscv;
+pub(crate) mod x86;
+
+/// Represents a known ISA target.
+#[derive(PartialEq, Copy, Clone)]
+pub enum Isa {
+ Riscv,
+ X86,
+ Arm32,
+ Arm64,
+}
+
+impl Isa {
+ /// Creates an ISA target from its name.
+ pub fn from_name(name: &str) -> Option<Self> {
+ Isa::all()
+ .iter()
+ .cloned()
+ .find(|isa| isa.to_string() == name)
+ }
+
+ /// Creates an ISA target from an architecture name (as used in target triples).
+ pub fn from_arch(arch: &str) -> Option<Self> {
+ match arch {
+ "riscv" => Some(Isa::Riscv),
+ "aarch64" => Some(Isa::Arm64),
+ x if ["x86_64", "i386", "i586", "i686"].contains(&x) => Some(Isa::X86),
+ x if x.starts_with("arm") || arch.starts_with("thumb") => Some(Isa::Arm32),
+ _ => None,
+ }
+ }
+
+ /// Returns all supported ISA targets.
+ pub fn all() -> &'static [Isa] {
+ &[Isa::Riscv, Isa::X86, Isa::Arm32, Isa::Arm64]
+ }
+}
+
+impl fmt::Display for Isa {
+ // These names should be kept in sync with the crate features.
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ Isa::Riscv => write!(f, "riscv"),
+ Isa::X86 => write!(f, "x86"),
+ Isa::Arm32 => write!(f, "arm32"),
+ Isa::Arm64 => write!(f, "arm64"),
+ }
+ }
+}
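// Editor's sketch (not part of the upstream diff): a minimal check of how the
// two lookups above differ. `from_name` matches the `Display` names ("x86",
// "arm64", ...), while `from_arch` matches architecture strings as they appear
// in target triples ("x86_64", "aarch64", "thumbv7", ...).
#[cfg(test)]
mod isa_lookup_sketch {
    use super::Isa;

    #[test]
    fn name_and_arch_lookups() {
        assert!(Isa::from_name("x86") == Some(Isa::X86));
        assert!(Isa::from_name("x86_64").is_none());
        assert!(Isa::from_arch("x86_64") == Some(Isa::X86));
        assert!(Isa::from_arch("aarch64") == Some(Isa::Arm64));
        assert!(Isa::from_arch("thumbv7") == Some(Isa::Arm32));
        assert!(Isa::from_arch("mips") == None);
    }
}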
+
+pub(crate) fn define(isas: &[Isa], shared_defs: &mut SharedDefinitions) -> Vec<TargetIsa> {
+ isas.iter()
+ .map(|isa| match isa {
+ Isa::Riscv => riscv::define(shared_defs),
+ Isa::X86 => x86::define(shared_defs),
+ Isa::Arm32 => arm32::define(shared_defs),
+ Isa::Arm64 => arm64::define(shared_defs),
+ })
+ .collect()
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs
new file mode 100644
index 0000000000..c255ddb483
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs
@@ -0,0 +1,431 @@
+use crate::cdsl::ast::{Apply, Expr, Literal, VarPool};
+use crate::cdsl::encodings::{Encoding, EncodingBuilder};
+use crate::cdsl::instructions::{
+ Bindable, BoundInstruction, InstSpec, InstructionPredicateNode, InstructionPredicateRegistry,
+};
+use crate::cdsl::recipes::{EncodingRecipeNumber, Recipes};
+use crate::cdsl::settings::SettingGroup;
+
+use crate::shared::types::Bool::B1;
+use crate::shared::types::Float::{F32, F64};
+use crate::shared::types::Int::{I16, I32, I64, I8};
+use crate::shared::types::Reference::{R32, R64};
+use crate::shared::Definitions as SharedDefinitions;
+
+use super::recipes::RecipeGroup;
+
+pub(crate) struct PerCpuModeEncodings<'defs> {
+ pub inst_pred_reg: InstructionPredicateRegistry,
+ pub enc32: Vec<Encoding>,
+ pub enc64: Vec<Encoding>,
+ recipes: &'defs Recipes,
+}
+
+impl<'defs> PerCpuModeEncodings<'defs> {
+ fn new(recipes: &'defs Recipes) -> Self {
+ Self {
+ inst_pred_reg: InstructionPredicateRegistry::new(),
+ enc32: Vec::new(),
+ enc64: Vec::new(),
+ recipes,
+ }
+ }
+ fn enc(
+ &self,
+ inst: impl Into<InstSpec>,
+ recipe: EncodingRecipeNumber,
+ bits: u16,
+ ) -> EncodingBuilder {
+ EncodingBuilder::new(inst.into(), recipe, bits)
+ }
+ fn add32(&mut self, encoding: EncodingBuilder) {
+ self.enc32
+ .push(encoding.build(self.recipes, &mut self.inst_pred_reg));
+ }
+ fn add64(&mut self, encoding: EncodingBuilder) {
+ self.enc64
+ .push(encoding.build(self.recipes, &mut self.inst_pred_reg));
+ }
+}
+
+// The low 7 bits of a RISC-V instruction are the base opcode. All 32-bit instructions have 0b11 as
+// the two low bits, with bits 6:2 determining the base opcode.
+//
+// Encbits for the 32-bit recipes are opcode[6:2] | (funct3 << 5) | ...
+// The functions below encode the encbits; a worked example follows `lui_bits` below.
+
+fn load_bits(funct3: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ funct3 << 5
+}
+
+fn store_bits(funct3: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ 0b01000 | (funct3 << 5)
+}
+
+fn branch_bits(funct3: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ 0b11000 | (funct3 << 5)
+}
+
+fn jalr_bits() -> u16 {
+ // This previously accepted a 3-bit funct3 argument and used the following formula:
+ // 0b11001 | (funct3 << 5)
+ 0b11001
+}
+
+fn jal_bits() -> u16 {
+ 0b11011
+}
+
+fn opimm_bits(funct3: u16, funct7: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ 0b00100 | (funct3 << 5) | (funct7 << 8)
+}
+
+fn opimm32_bits(funct3: u16, funct7: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ 0b00110 | (funct3 << 5) | (funct7 << 8)
+}
+
+fn op_bits(funct3: u16, funct7: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ assert!(funct7 <= 0b111_1111);
+ 0b01100 | (funct3 << 5) | (funct7 << 8)
+}
+
+fn op32_bits(funct3: u16, funct7: u16) -> u16 {
+ assert!(funct3 <= 0b111);
+ assert!(funct7 <= 0b111_1111);
+ 0b01110 | (funct3 << 5) | (funct7 << 8)
+}
+
+fn lui_bits() -> u16 {
+ 0b01101
+}
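// Editor's sketch (not part of the upstream diff): a few worked values for the
// encbits layout described above. `op_bits` and friends fill in opcode[6:2],
// `funct3` lands in bits 5..8, and `funct7` in bits 8 and up.
#[cfg(test)]
mod encbits_sketch {
    use super::{lui_bits, op_bits, opimm_bits};

    #[test]
    fn worked_examples() {
        // ADD: base opcode OP, so opcode[6:2] = 0b01100, funct3 = 0, funct7 = 0.
        assert_eq!(op_bits(0b000, 0b000_0000), 0b01100);
        // ADDI: base opcode OP-IMM, opcode[6:2] = 0b00100.
        assert_eq!(opimm_bits(0b000, 0), 0b00100);
        // SRLI: funct3 = 0b101 shifts into bits 5..8 of the encbits.
        assert_eq!(opimm_bits(0b101, 0), 0b101_00100);
        // LUI has no funct fields; only opcode[6:2] = 0b01101.
        assert_eq!(lui_bits(), 0b01101);
    }
}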
+
+pub(crate) fn define<'defs>(
+ shared_defs: &'defs SharedDefinitions,
+ isa_settings: &SettingGroup,
+ recipes: &'defs RecipeGroup,
+) -> PerCpuModeEncodings<'defs> {
+ // Instruction shorthands.
+ let shared = &shared_defs.instructions;
+
+ let band = shared.by_name("band");
+ let band_imm = shared.by_name("band_imm");
+ let bor = shared.by_name("bor");
+ let bor_imm = shared.by_name("bor_imm");
+ let br_icmp = shared.by_name("br_icmp");
+ let brz = shared.by_name("brz");
+ let brnz = shared.by_name("brnz");
+ let bxor = shared.by_name("bxor");
+ let bxor_imm = shared.by_name("bxor_imm");
+ let call = shared.by_name("call");
+ let call_indirect = shared.by_name("call_indirect");
+ let copy = shared.by_name("copy");
+ let copy_nop = shared.by_name("copy_nop");
+ let copy_to_ssa = shared.by_name("copy_to_ssa");
+ let fill = shared.by_name("fill");
+ let fill_nop = shared.by_name("fill_nop");
+ let iadd = shared.by_name("iadd");
+ let iadd_imm = shared.by_name("iadd_imm");
+ let iconst = shared.by_name("iconst");
+ let icmp = shared.by_name("icmp");
+ let icmp_imm = shared.by_name("icmp_imm");
+ let imul = shared.by_name("imul");
+ let ishl = shared.by_name("ishl");
+ let ishl_imm = shared.by_name("ishl_imm");
+ let isub = shared.by_name("isub");
+ let jump = shared.by_name("jump");
+ let regmove = shared.by_name("regmove");
+ let spill = shared.by_name("spill");
+ let sshr = shared.by_name("sshr");
+ let sshr_imm = shared.by_name("sshr_imm");
+ let ushr = shared.by_name("ushr");
+ let ushr_imm = shared.by_name("ushr_imm");
+ let return_ = shared.by_name("return");
+
+ // Recipe shorthands, prefixed with r_.
+ let r_copytossa = recipes.by_name("copytossa");
+ let r_fillnull = recipes.by_name("fillnull");
+ let r_icall = recipes.by_name("Icall");
+ let r_icopy = recipes.by_name("Icopy");
+ let r_ii = recipes.by_name("Ii");
+ let r_iicmp = recipes.by_name("Iicmp");
+ let r_iret = recipes.by_name("Iret");
+ let r_irmov = recipes.by_name("Irmov");
+ let r_iz = recipes.by_name("Iz");
+ let r_gp_sp = recipes.by_name("GPsp");
+ let r_gp_fi = recipes.by_name("GPfi");
+ let r_r = recipes.by_name("R");
+ let r_ricmp = recipes.by_name("Ricmp");
+ let r_rshamt = recipes.by_name("Rshamt");
+ let r_sb = recipes.by_name("SB");
+ let r_sb_zero = recipes.by_name("SBzero");
+ let r_stacknull = recipes.by_name("stacknull");
+ let r_u = recipes.by_name("U");
+ let r_uj = recipes.by_name("UJ");
+ let r_uj_call = recipes.by_name("UJcall");
+
+ // Predicate shorthands.
+ let use_m = isa_settings.predicate_by_name("use_m");
+
+ // Definitions.
+ let mut e = PerCpuModeEncodings::new(&recipes.recipes);
+
+ // Basic arithmetic binary instructions are encoded in an R-type instruction.
+ for &(inst, inst_imm, f3, f7) in &[
+ (iadd, Some(iadd_imm), 0b000, 0b000_0000),
+ (isub, None, 0b000, 0b010_0000),
+ (bxor, Some(bxor_imm), 0b100, 0b000_0000),
+ (bor, Some(bor_imm), 0b110, 0b000_0000),
+ (band, Some(band_imm), 0b111, 0b000_0000),
+ ] {
+ e.add32(e.enc(inst.bind(I32), r_r, op_bits(f3, f7)));
+ e.add64(e.enc(inst.bind(I64), r_r, op_bits(f3, f7)));
+
+ // Immediate versions for add/xor/or/and.
+ if let Some(inst_imm) = inst_imm {
+ e.add32(e.enc(inst_imm.bind(I32), r_ii, opimm_bits(f3, 0)));
+ e.add64(e.enc(inst_imm.bind(I64), r_ii, opimm_bits(f3, 0)));
+ }
+ }
+
+ // 32-bit ops in RV64.
+ e.add64(e.enc(iadd.bind(I32), r_r, op32_bits(0b000, 0b000_0000)));
+ e.add64(e.enc(isub.bind(I32), r_r, op32_bits(0b000, 0b010_0000)));
+ // There are no andiw/oriw/xoriw variations.
+ e.add64(e.enc(iadd_imm.bind(I32), r_ii, opimm32_bits(0b000, 0)));
+
+ // Use iadd_imm with %x0 to materialize constants.
+ e.add32(e.enc(iconst.bind(I32), r_iz, opimm_bits(0b0, 0)));
+ e.add64(e.enc(iconst.bind(I32), r_iz, opimm_bits(0b0, 0)));
+ e.add64(e.enc(iconst.bind(I64), r_iz, opimm_bits(0b0, 0)));
+
+ // Dynamic shifts have the same masking semantics as the clif base instructions.
+ for &(inst, inst_imm, f3, f7) in &[
+ (ishl, ishl_imm, 0b1, 0b0),
+ (ushr, ushr_imm, 0b101, 0b0),
+ (sshr, sshr_imm, 0b101, 0b10_0000),
+ ] {
+ e.add32(e.enc(inst.bind(I32).bind(I32), r_r, op_bits(f3, f7)));
+ e.add64(e.enc(inst.bind(I64).bind(I64), r_r, op_bits(f3, f7)));
+ e.add64(e.enc(inst.bind(I32).bind(I32), r_r, op32_bits(f3, f7)));
+ // Allow i32 shift amounts in 64-bit shifts.
+ e.add64(e.enc(inst.bind(I64).bind(I32), r_r, op_bits(f3, f7)));
+ e.add64(e.enc(inst.bind(I32).bind(I64), r_r, op32_bits(f3, f7)));
+
+ // Immediate shifts.
+ e.add32(e.enc(inst_imm.bind(I32), r_rshamt, opimm_bits(f3, f7)));
+ e.add64(e.enc(inst_imm.bind(I64), r_rshamt, opimm_bits(f3, f7)));
+ e.add64(e.enc(inst_imm.bind(I32), r_rshamt, opimm32_bits(f3, f7)));
+ }
+
+ // Signed and unsigned integer 'less than'. There are no 'w' variants for comparing 32-bit
+ // numbers in RV64.
+ {
+ let mut var_pool = VarPool::new();
+
+ // Helper that creates an instruction predicate for an instruction in the icmp family.
+ let mut icmp_instp = |bound_inst: &BoundInstruction,
+ intcc_field: &'static str|
+ -> InstructionPredicateNode {
+ let x = var_pool.create("x");
+ let y = var_pool.create("y");
+ let cc = Literal::enumerator_for(&shared_defs.imm.intcc, intcc_field);
+ Apply::new(
+ bound_inst.clone().into(),
+ vec![Expr::Literal(cc), Expr::Var(x), Expr::Var(y)],
+ )
+ .inst_predicate(&var_pool)
+ .unwrap()
+ };
+
+ let icmp_i32 = icmp.bind(I32);
+ let icmp_i64 = icmp.bind(I64);
+ e.add32(
+ e.enc(icmp_i32.clone(), r_ricmp, op_bits(0b010, 0b000_0000))
+ .inst_predicate(icmp_instp(&icmp_i32, "slt")),
+ );
+ e.add64(
+ e.enc(icmp_i64.clone(), r_ricmp, op_bits(0b010, 0b000_0000))
+ .inst_predicate(icmp_instp(&icmp_i64, "slt")),
+ );
+
+ e.add32(
+ e.enc(icmp_i32.clone(), r_ricmp, op_bits(0b011, 0b000_0000))
+ .inst_predicate(icmp_instp(&icmp_i32, "ult")),
+ );
+ e.add64(
+ e.enc(icmp_i64.clone(), r_ricmp, op_bits(0b011, 0b000_0000))
+ .inst_predicate(icmp_instp(&icmp_i64, "ult")),
+ );
+
+ // Immediate variants.
+ let icmp_i32 = icmp_imm.bind(I32);
+ let icmp_i64 = icmp_imm.bind(I64);
+ e.add32(
+ e.enc(icmp_i32.clone(), r_iicmp, opimm_bits(0b010, 0))
+ .inst_predicate(icmp_instp(&icmp_i32, "slt")),
+ );
+ e.add64(
+ e.enc(icmp_i64.clone(), r_iicmp, opimm_bits(0b010, 0))
+ .inst_predicate(icmp_instp(&icmp_i64, "slt")),
+ );
+
+ e.add32(
+ e.enc(icmp_i32.clone(), r_iicmp, opimm_bits(0b011, 0))
+ .inst_predicate(icmp_instp(&icmp_i32, "ult")),
+ );
+ e.add64(
+ e.enc(icmp_i64.clone(), r_iicmp, opimm_bits(0b011, 0))
+ .inst_predicate(icmp_instp(&icmp_i64, "ult")),
+ );
+ }
+
+ // Integer constants with the low 12 bits clear are materialized by lui.
+ e.add32(e.enc(iconst.bind(I32), r_u, lui_bits()));
+ e.add64(e.enc(iconst.bind(I32), r_u, lui_bits()));
+ e.add64(e.enc(iconst.bind(I64), r_u, lui_bits()));
+
+ // "M" Standard Extension for Integer Multiplication and Division.
+ // Gated by the `use_m` flag.
+ e.add32(
+ e.enc(imul.bind(I32), r_r, op_bits(0b000, 0b0000_0001))
+ .isa_predicate(use_m),
+ );
+ e.add64(
+ e.enc(imul.bind(I64), r_r, op_bits(0b000, 0b0000_0001))
+ .isa_predicate(use_m),
+ );
+ e.add64(
+ e.enc(imul.bind(I32), r_r, op32_bits(0b000, 0b0000_0001))
+ .isa_predicate(use_m),
+ );
+
+ // Control flow.
+
+ // Unconditional branches.
+ e.add32(e.enc(jump, r_uj, jal_bits()));
+ e.add64(e.enc(jump, r_uj, jal_bits()));
+ e.add32(e.enc(call, r_uj_call, jal_bits()));
+ e.add64(e.enc(call, r_uj_call, jal_bits()));
+
+ // Conditional branches.
+ {
+ let mut var_pool = VarPool::new();
+
+ // Helper that creates an instruction predicate for an instruction in the icmp family.
+ let mut br_icmp_instp = |bound_inst: &BoundInstruction,
+ intcc_field: &'static str|
+ -> InstructionPredicateNode {
+ let x = var_pool.create("x");
+ let y = var_pool.create("y");
+ let dest = var_pool.create("dest");
+ let args = var_pool.create("args");
+ let cc = Literal::enumerator_for(&shared_defs.imm.intcc, intcc_field);
+ Apply::new(
+ bound_inst.clone().into(),
+ vec![
+ Expr::Literal(cc),
+ Expr::Var(x),
+ Expr::Var(y),
+ Expr::Var(dest),
+ Expr::Var(args),
+ ],
+ )
+ .inst_predicate(&var_pool)
+ .unwrap()
+ };
+
+ let br_icmp_i32 = br_icmp.bind(I32);
+ let br_icmp_i64 = br_icmp.bind(I64);
+ for &(cond, f3) in &[
+ ("eq", 0b000),
+ ("ne", 0b001),
+ ("slt", 0b100),
+ ("sge", 0b101),
+ ("ult", 0b110),
+ ("uge", 0b111),
+ ] {
+ e.add32(
+ e.enc(br_icmp_i32.clone(), r_sb, branch_bits(f3))
+ .inst_predicate(br_icmp_instp(&br_icmp_i32, cond)),
+ );
+ e.add64(
+ e.enc(br_icmp_i64.clone(), r_sb, branch_bits(f3))
+ .inst_predicate(br_icmp_instp(&br_icmp_i64, cond)),
+ );
+ }
+ }
+
+ for &(inst, f3) in &[(brz, 0b000), (brnz, 0b001)] {
+ e.add32(e.enc(inst.bind(I32), r_sb_zero, branch_bits(f3)));
+ e.add64(e.enc(inst.bind(I64), r_sb_zero, branch_bits(f3)));
+ e.add32(e.enc(inst.bind(B1), r_sb_zero, branch_bits(f3)));
+ e.add64(e.enc(inst.bind(B1), r_sb_zero, branch_bits(f3)));
+ }
+
+ // Returns are a special case of `jalr`, using %x1 to hold the return address.
+ // The return address is provided by a special-purpose `link` return value that
+ // is added by legalize_signature().
+ e.add32(e.enc(return_, r_iret, jalr_bits()));
+ e.add64(e.enc(return_, r_iret, jalr_bits()));
+ e.add32(e.enc(call_indirect.bind(I32), r_icall, jalr_bits()));
+ e.add64(e.enc(call_indirect.bind(I64), r_icall, jalr_bits()));
+
+ // Spill and fill.
+ e.add32(e.enc(spill.bind(I32), r_gp_sp, store_bits(0b010)));
+ e.add64(e.enc(spill.bind(I32), r_gp_sp, store_bits(0b010)));
+ e.add64(e.enc(spill.bind(I64), r_gp_sp, store_bits(0b011)));
+ e.add32(e.enc(fill.bind(I32), r_gp_fi, load_bits(0b010)));
+ e.add64(e.enc(fill.bind(I32), r_gp_fi, load_bits(0b010)));
+ e.add64(e.enc(fill.bind(I64), r_gp_fi, load_bits(0b011)));
+
+ // No-op fills, created by late-stage redundant-fill removal.
+ for &ty in &[I64, I32] {
+ e.add64(e.enc(fill_nop.bind(ty), r_fillnull, 0));
+ e.add32(e.enc(fill_nop.bind(ty), r_fillnull, 0));
+ }
+ e.add64(e.enc(fill_nop.bind(B1), r_fillnull, 0));
+ e.add32(e.enc(fill_nop.bind(B1), r_fillnull, 0));
+
+ // Register copies.
+ e.add32(e.enc(copy.bind(I32), r_icopy, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy.bind(I64), r_icopy, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy.bind(I32), r_icopy, opimm32_bits(0b000, 0)));
+
+ e.add32(e.enc(regmove.bind(I32), r_irmov, opimm_bits(0b000, 0)));
+ e.add64(e.enc(regmove.bind(I64), r_irmov, opimm_bits(0b000, 0)));
+ e.add64(e.enc(regmove.bind(I32), r_irmov, opimm32_bits(0b000, 0)));
+
+ e.add32(e.enc(copy.bind(B1), r_icopy, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy.bind(B1), r_icopy, opimm_bits(0b000, 0)));
+ e.add32(e.enc(regmove.bind(B1), r_irmov, opimm_bits(0b000, 0)));
+ e.add64(e.enc(regmove.bind(B1), r_irmov, opimm_bits(0b000, 0)));
+
+ // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
+ // into a no-op.
+ // The same encoding is generated for both the 64- and 32-bit architectures.
+ for &ty in &[I64, I32, I16, I8] {
+ e.add32(e.enc(copy_nop.bind(ty), r_stacknull, 0));
+ e.add64(e.enc(copy_nop.bind(ty), r_stacknull, 0));
+ }
+ for &ty in &[F64, F32] {
+ e.add32(e.enc(copy_nop.bind(ty), r_stacknull, 0));
+ e.add64(e.enc(copy_nop.bind(ty), r_stacknull, 0));
+ }
+
+ // Copy-to-SSA
+ e.add32(e.enc(copy_to_ssa.bind(I32), r_copytossa, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy_to_ssa.bind(I64), r_copytossa, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy_to_ssa.bind(I32), r_copytossa, opimm32_bits(0b000, 0)));
+ e.add32(e.enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0)));
+ e.add32(e.enc(copy_to_ssa.bind(R32), r_copytossa, opimm_bits(0b000, 0)));
+ e.add64(e.enc(copy_to_ssa.bind(R64), r_copytossa, opimm_bits(0b000, 0)));
+
+ e
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs
new file mode 100644
index 0000000000..801e61a3d2
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs
@@ -0,0 +1,134 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::InstructionGroupBuilder;
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
+
+use crate::shared::types::Float::{F32, F64};
+use crate::shared::types::Int::{I32, I64};
+use crate::shared::Definitions as SharedDefinitions;
+
+mod encodings;
+mod recipes;
+
+fn define_settings(shared: &SettingGroup) -> SettingGroup {
+ let mut setting = SettingGroupBuilder::new("riscv");
+
+ let supports_m = setting.add_bool(
+ "supports_m",
+ "CPU supports the 'M' extension (mul/div)",
+ false,
+ );
+ let supports_a = setting.add_bool(
+ "supports_a",
+ "CPU supports the 'A' extension (atomics)",
+ false,
+ );
+ let supports_f = setting.add_bool(
+ "supports_f",
+ "CPU supports the 'F' extension (float)",
+ false,
+ );
+ let supports_d = setting.add_bool(
+ "supports_d",
+ "CPU supports the 'D' extension (double)",
+ false,
+ );
+
+ let enable_m = setting.add_bool(
+ "enable_m",
+ "Enable the use of 'M' instructions if available",
+ true,
+ );
+
+ setting.add_bool(
+ "enable_e",
+ "Enable the 'RV32E' instruction set with only 16 registers",
+ false,
+ );
+
+ let shared_enable_atomics = shared.get_bool("enable_atomics");
+ let shared_enable_float = shared.get_bool("enable_float");
+ let shared_enable_simd = shared.get_bool("enable_simd");
+
+ setting.add_predicate("use_m", predicate!(supports_m && enable_m));
+ setting.add_predicate("use_a", predicate!(supports_a && shared_enable_atomics));
+ setting.add_predicate("use_f", predicate!(supports_f && shared_enable_float));
+ setting.add_predicate("use_d", predicate!(supports_d && shared_enable_float));
+ setting.add_predicate(
+ "full_float",
+ predicate!(shared_enable_simd && supports_f && supports_d),
+ );
+
+ setting.build()
+}
+
+fn define_registers() -> IsaRegs {
+ let mut regs = IsaRegsBuilder::new();
+
+ let builder = RegBankBuilder::new("IntRegs", "x")
+ .units(32)
+ .track_pressure(true);
+ let int_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("FloatRegs", "f")
+ .units(32)
+ .track_pressure(true);
+ let float_regs = regs.add_bank(builder);
+
+ let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FPR", float_regs);
+ regs.add_class(builder);
+
+ regs.build()
+}
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+ let settings = define_settings(&shared_defs.settings);
+ let regs = define_registers();
+
+ let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+
+ // CPU modes for 32-bit and 64-bit operation.
+ let mut rv_32 = CpuMode::new("RV32");
+ let mut rv_64 = CpuMode::new("RV64");
+
+ let expand = shared_defs.transform_groups.by_name("expand");
+ let narrow_no_flags = shared_defs.transform_groups.by_name("narrow_no_flags");
+
+ rv_32.legalize_monomorphic(expand);
+ rv_32.legalize_default(narrow_no_flags);
+ rv_32.legalize_type(I32, expand);
+ rv_32.legalize_type(F32, expand);
+ rv_32.legalize_type(F64, expand);
+
+ rv_64.legalize_monomorphic(expand);
+ rv_64.legalize_default(narrow_no_flags);
+ rv_64.legalize_type(I32, expand);
+ rv_64.legalize_type(I64, expand);
+ rv_64.legalize_type(F32, expand);
+ rv_64.legalize_type(F64, expand);
+
+ let recipes = recipes::define(shared_defs, &regs);
+
+ let encodings = encodings::define(shared_defs, &settings, &recipes);
+ rv_32.set_encodings(encodings.enc32);
+ rv_64.set_encodings(encodings.enc64);
+ let encodings_predicates = encodings.inst_pred_reg.extract();
+
+ let recipes = recipes.collect();
+
+ let cpu_modes = vec![rv_32, rv_64];
+
+ TargetIsa::new(
+ "riscv",
+ inst_group,
+ settings,
+ regs,
+ recipes,
+ cpu_modes,
+ encodings_predicates,
+ )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs
new file mode 100644
index 0000000000..47acdbb042
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs
@@ -0,0 +1,279 @@
+use std::collections::HashMap;
+
+use crate::cdsl::instructions::InstructionPredicate;
+use crate::cdsl::recipes::{EncodingRecipeBuilder, EncodingRecipeNumber, Recipes, Stack};
+use crate::cdsl::regs::IsaRegs;
+use crate::shared::Definitions as SharedDefinitions;
+
+/// A helper to create recipes and look them up when defining the RISC-V encodings.
+pub(crate) struct RecipeGroup {
+ /// The actual list of recipes explicitly created in this file.
+ pub recipes: Recipes,
+
+ /// Provides fast lookup from a name to an encoding recipe.
+ name_to_recipe: HashMap<String, EncodingRecipeNumber>,
+}
+
+impl RecipeGroup {
+ fn new() -> Self {
+ Self {
+ recipes: Recipes::new(),
+ name_to_recipe: HashMap::new(),
+ }
+ }
+
+ fn push(&mut self, builder: EncodingRecipeBuilder) {
+ assert!(
+ self.name_to_recipe.get(&builder.name).is_none(),
+ format!("riscv recipe '{}' created twice", builder.name)
+ );
+ let name = builder.name.clone();
+ let number = self.recipes.push(builder.build());
+ self.name_to_recipe.insert(name, number);
+ }
+
+ pub fn by_name(&self, name: &str) -> EncodingRecipeNumber {
+ *self
+ .name_to_recipe
+ .get(name)
+ .unwrap_or_else(|| panic!("unknown riscv recipe name {}", name))
+ }
+
+ pub fn collect(self) -> Recipes {
+ self.recipes
+ }
+}
+
+pub(crate) fn define(shared_defs: &SharedDefinitions, regs: &IsaRegs) -> RecipeGroup {
+ let formats = &shared_defs.formats;
+
+ // Register classes shorthands.
+ let gpr = regs.class_by_name("GPR");
+
+ // Definitions.
+ let mut recipes = RecipeGroup::new();
+
+ // R-type 32-bit instructions: these are mostly binary arithmetic instructions.
+ // The encbits are `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`.
+ recipes.push(
+ EncodingRecipeBuilder::new("R", &formats.binary, 4)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .emit("put_r(bits, in_reg0, in_reg1, out_reg0, sink);"),
+ );
+
+ // R-type with an immediate shift amount instead of rs2.
+ recipes.push(
+ EncodingRecipeBuilder::new("Rshamt", &formats.binary_imm64, 4)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .emit("put_rshamt(bits, in_reg0, imm.into(), out_reg0, sink);"),
+ );
+
+ // R-type encoding of an integer comparison.
+ recipes.push(
+ EncodingRecipeBuilder::new("Ricmp", &formats.int_compare, 4)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .emit("put_r(bits, in_reg0, in_reg1, out_reg0, sink);"),
+ );
+
+ recipes.push(
+ EncodingRecipeBuilder::new("Ii", &formats.binary_imm64, 4)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.binary_imm64,
+ "imm",
+ 12,
+ 0,
+ ))
+ .emit("put_i(bits, in_reg0, imm.into(), out_reg0, sink);"),
+ );
+
+ // I-type instruction with a hardcoded %x0 rs1.
+ recipes.push(
+ EncodingRecipeBuilder::new("Iz", &formats.unary_imm, 4)
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &formats.unary_imm,
+ "imm",
+ 12,
+ 0,
+ ))
+ .emit("put_i(bits, 0, imm.into(), out_reg0, sink);"),
+ );
+
+ // I-type encoding of an integer comparison.
+ recipes.push(
+ EncodingRecipeBuilder::new("Iicmp", &formats.int_compare_imm, 4)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &formats.int_compare_imm,
+ "imm",
+ 12,
+ 0,
+ ))
+ .emit("put_i(bits, in_reg0, imm.into(), out_reg0, sink);"),
+ );
+
+ // I-type encoding for `jalr` as a return instruction. We won't use the immediate offset. The
+ // variable return values are not encoded.
+ recipes.push(
+ EncodingRecipeBuilder::new("Iret", &formats.multiary, 4).emit(
+ r#"
+ // Return instructions are always a jalr to %x1.
+ // The return address is provided as a special-purpose link argument.
+ put_i(
+ bits,
+ 1, // rs1 = %x1
+ 0, // no offset.
+ 0, // rd = %x0: no address written.
+ sink,
+ );
+ "#,
+ ),
+ );
+
+ // I-type encoding for `jalr` as a call_indirect.
+ recipes.push(
+ EncodingRecipeBuilder::new("Icall", &formats.call_indirect, 4)
+ .operands_in(vec![gpr])
+ .emit(
+ r#"
+ // call_indirect instructions are jalr with rd=%x1.
+ put_i(
+ bits,
+ in_reg0,
+ 0, // no offset.
+ 1, // rd = %x1: link register.
+ sink,
+ );
+ "#,
+ ),
+ );
+
+ // Copy of a GPR is implemented as addi x, 0.
+ recipes.push(
+ EncodingRecipeBuilder::new("Icopy", &formats.unary, 4)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .emit("put_i(bits, in_reg0, 0, out_reg0, sink);"),
+ );
+
+ // Same for a GPR regmove.
+ recipes.push(
+ EncodingRecipeBuilder::new("Irmov", &formats.reg_move, 4)
+ .operands_in(vec![gpr])
+ .emit("put_i(bits, src, 0, dst, sink);"),
+ );
+
+ // Same for copy-to-SSA -- GPR regmove.
+ recipes.push(
+ EncodingRecipeBuilder::new("copytossa", &formats.copy_to_ssa, 4)
+ // No operands_in to mention, because a source register is specified directly.
+ .operands_out(vec![gpr])
+ .emit("put_i(bits, src, 0, out_reg0, sink);"),
+ );
+
+ // U-type instructions have a 20-bit immediate that targets bits 12-31.
+ recipes.push(
+ EncodingRecipeBuilder::new("U", &formats.unary_imm, 4)
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &formats.unary_imm,
+ "imm",
+ 32,
+ 12,
+ ))
+ .emit("put_u(bits, imm.into(), out_reg0, sink);"),
+ );
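// Editor's note (not part of the upstream diff): the predicate above accepts
// immediates that fit in a signed 32-bit value with the low 12 bits clear,
// e.g. 0x12345000 (materializable by a single lui), while something like
// 0x12345 has its low 12 bits set and has to be handled by other recipes or
// by legalization.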
+
+ // UJ-type unconditional branch instructions.
+ recipes.push(
+ EncodingRecipeBuilder::new("UJ", &formats.jump, 4)
+ .branch_range((0, 21))
+ .emit(
+ r#"
+ let dest = i64::from(func.offsets[destination]);
+ let disp = dest - i64::from(sink.offset());
+ put_uj(bits, disp, 0, sink);
+ "#,
+ ),
+ );
+
+ recipes.push(EncodingRecipeBuilder::new("UJcall", &formats.call, 4).emit(
+ r#"
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::RiscvCall,
+ &func.dfg.ext_funcs[func_ref].name,
+ 0);
+ // rd=%x1 is the standard link register.
+ put_uj(bits, 0, 1, sink);
+ "#,
+ ));
+
+ // SB-type branch instructions.
+ recipes.push(
+ EncodingRecipeBuilder::new("SB", &formats.branch_icmp, 4)
+ .operands_in(vec![gpr, gpr])
+ .branch_range((0, 13))
+ .emit(
+ r#"
+ let dest = i64::from(func.offsets[destination]);
+ let disp = dest - i64::from(sink.offset());
+ put_sb(bits, disp, in_reg0, in_reg1, sink);
+ "#,
+ ),
+ );
+
+ // SB-type branch instruction with rs2 fixed to zero.
+ recipes.push(
+ EncodingRecipeBuilder::new("SBzero", &formats.branch, 4)
+ .operands_in(vec![gpr])
+ .branch_range((0, 13))
+ .emit(
+ r#"
+ let dest = i64::from(func.offsets[destination]);
+ let disp = dest - i64::from(sink.offset());
+ put_sb(bits, disp, in_reg0, 0, sink);
+ "#,
+ ),
+ );
+
+ // Spill of a GPR.
+ recipes.push(
+ EncodingRecipeBuilder::new("GPsp", &formats.unary, 4)
+ .operands_in(vec![gpr])
+ .operands_out(vec![Stack::new(gpr)])
+ .emit("unimplemented!();"),
+ );
+
+ // Fill of a GPR.
+ recipes.push(
+ EncodingRecipeBuilder::new("GPfi", &formats.unary, 4)
+ .operands_in(vec![Stack::new(gpr)])
+ .operands_out(vec![gpr])
+ .emit("unimplemented!();"),
+ );
+
+ // Stack-slot to same stack-slot copy, which is guaranteed to turn into a no-op.
+ recipes.push(
+ EncodingRecipeBuilder::new("stacknull", &formats.unary, 0)
+ .operands_in(vec![Stack::new(gpr)])
+ .operands_out(vec![Stack::new(gpr)])
+ .emit(""),
+ );
+
+ // No-op fills, created by late-stage redundant-fill removal.
+ recipes.push(
+ EncodingRecipeBuilder::new("fillnull", &formats.unary, 0)
+ .operands_in(vec![Stack::new(gpr)])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(""),
+ );
+
+ recipes
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs
new file mode 100644
index 0000000000..9ee12656c0
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs
@@ -0,0 +1,2726 @@
+#![allow(non_snake_case)]
+
+use cranelift_codegen_shared::condcodes::IntCC;
+use std::collections::HashMap;
+
+use crate::cdsl::encodings::{Encoding, EncodingBuilder};
+use crate::cdsl::instructions::{
+ vector, Bindable, Immediate, InstSpec, Instruction, InstructionGroup, InstructionPredicate,
+ InstructionPredicateNode, InstructionPredicateRegistry,
+};
+use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
+use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
+use crate::cdsl::types::{LaneType, ValueType};
+use crate::shared::types::Bool::{B1, B16, B32, B64, B8};
+use crate::shared::types::Float::{F32, F64};
+use crate::shared::types::Int::{I16, I32, I64, I8};
+use crate::shared::types::Reference::{R32, R64};
+use crate::shared::Definitions as SharedDefinitions;
+
+use crate::isa::x86::opcodes::*;
+
+use super::recipes::{RecipeGroup, Template};
+use crate::cdsl::instructions::BindParameter::Any;
+
+pub(crate) struct PerCpuModeEncodings {
+ pub enc32: Vec<Encoding>,
+ pub enc64: Vec<Encoding>,
+ pub recipes: Recipes,
+ recipes_by_name: HashMap<String, EncodingRecipeNumber>,
+ pub inst_pred_reg: InstructionPredicateRegistry,
+}
+
+impl PerCpuModeEncodings {
+ fn new() -> Self {
+ Self {
+ enc32: Vec::new(),
+ enc64: Vec::new(),
+ recipes: Recipes::new(),
+ recipes_by_name: HashMap::new(),
+ inst_pred_reg: InstructionPredicateRegistry::new(),
+ }
+ }
+
+ fn add_recipe(&mut self, recipe: EncodingRecipe) -> EncodingRecipeNumber {
+ if let Some(found_index) = self.recipes_by_name.get(&recipe.name) {
+ assert!(
+ self.recipes[*found_index] == recipe,
+ format!(
+ "trying to insert different recipes with a same name ({})",
+ recipe.name
+ )
+ );
+ *found_index
+ } else {
+ let recipe_name = recipe.name.clone();
+ let index = self.recipes.push(recipe);
+ self.recipes_by_name.insert(recipe_name, index);
+ index
+ }
+ }
+
+ fn make_encoding<T>(
+ &mut self,
+ inst: InstSpec,
+ template: Template,
+ builder_closure: T,
+ ) -> Encoding
+ where
+ T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+ {
+ let (recipe, bits) = template.build();
+ let recipe_number = self.add_recipe(recipe);
+ let builder = EncodingBuilder::new(inst, recipe_number, bits);
+ builder_closure(builder).build(&self.recipes, &mut self.inst_pred_reg)
+ }
+
+ fn enc32_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
+ where
+ T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+ {
+ let encoding = self.make_encoding(inst.into(), template, builder_closure);
+ self.enc32.push(encoding);
+ }
+ fn enc32(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ self.enc32_func(inst, template, |x| x);
+ }
+ fn enc32_isap(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ isap: SettingPredicateNumber,
+ ) {
+ self.enc32_func(inst, template, |encoding| encoding.isa_predicate(isap));
+ }
+ fn enc32_instp(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ self.enc32_func(inst, template, |encoding| encoding.inst_predicate(instp));
+ }
+ fn enc32_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
+ let recipe_number = self.add_recipe(recipe.clone());
+ let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
+ let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
+ self.enc32.push(encoding);
+ }
+
+ fn enc64_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T)
+ where
+ T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+ {
+ let encoding = self.make_encoding(inst.into(), template, builder_closure);
+ self.enc64.push(encoding);
+ }
+ fn enc64(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ self.enc64_func(inst, template, |x| x);
+ }
+ fn enc64_isap(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ isap: SettingPredicateNumber,
+ ) {
+ self.enc64_func(inst, template, |encoding| encoding.isa_predicate(isap));
+ }
+ fn enc64_instp(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ self.enc64_func(inst, template, |encoding| encoding.inst_predicate(instp));
+ }
+ fn enc64_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) {
+ let recipe_number = self.add_recipe(recipe.clone());
+ let builder = EncodingBuilder::new(inst.into(), recipe_number, bits);
+ let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg);
+ self.enc64.push(encoding);
+ }
+
+ /// Adds I32/I64 encodings as appropriate for a typed instruction.
+ /// The REX prefix is always inferred at runtime.
+ ///
+ /// Add encodings for `inst.i32` to X86_32.
+ /// Add encodings for `inst.i32` to X86_64 with optional, inferred REX.
+ /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
+ fn enc_i32_i64(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ let inst: InstSpec = inst.into();
+
+ // I32 on x86: no REX prefix.
+ self.enc32(inst.bind(I32), template.infer_rex());
+
+ // I32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
+ self.enc64(inst.bind(I32), template.infer_rex());
+
+ // I64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
+ self.enc64(inst.bind(I64), template.rex().w());
+ }
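    // Editor's sketch (not part of the upstream diff): a call such as
    //
    //     e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE));
    //
    // used further below is therefore shorthand for three additions, roughly:
    //
    //     e.enc32(copy.bind(I32), template.infer_rex());
    //     e.enc64(copy.bind(I32), template.infer_rex());
    //     e.enc64(copy.bind(I64), template.rex().w());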
+
+ /// Adds I32/I64 encodings as appropriate for a typed instruction.
+ /// All variants of REX prefix are explicitly emitted, not inferred.
+ ///
+ /// Add encodings for `inst.i32` to X86_32.
+ /// Add encodings for `inst.i32` to X86_64 with and without REX.
+ /// Add encodings for `inst.i64` to X86_64 with and without REX.
+ fn enc_i32_i64_explicit_rex(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ let inst: InstSpec = inst.into();
+ self.enc32(inst.bind(I32), template.nonrex());
+
+ // REX-less encoding must come after REX encoding so we don't use it by default.
+ // Otherwise reg-alloc would never use r8 and up.
+ self.enc64(inst.bind(I32), template.rex());
+ self.enc64(inst.bind(I32), template.nonrex());
+ self.enc64(inst.bind(I64), template.rex().w());
+ }
+
+ /// Adds B32/B64 encodings as appropriate for a typed instruction.
+ /// The REX prefix is always inferred at runtime.
+ ///
+ /// Adds encoding for `inst.b32` to X86_32.
+ /// Adds encoding for `inst.b32` to X86_64 with optional, inferred REX.
+ /// Adds encoding for `inst.b64` to X86_64 with a REX.W prefix.
+ fn enc_b32_b64(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ let inst: InstSpec = inst.into();
+
+ // B32 on x86: no REX prefix.
+ self.enc32(inst.bind(B32), template.infer_rex());
+
+ // B32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers.
+ self.enc64(inst.bind(B32), template.infer_rex());
+
+ // B64 on x86_64: REX.W set; REX.RXB determined at runtime from registers.
+ self.enc64(inst.bind(B64), template.rex().w());
+ }
+
+ /// Add encodings for `inst.i32` to X86_32.
+ /// Add encodings for `inst.i32` to X86_64 with a REX prefix.
+ /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
+ fn enc_i32_i64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ let inst: InstSpec = inst.into();
+ self.enc32(inst.bind(I32), template.nonrex());
+ self.enc64(inst.bind(I32), template.rex());
+ self.enc64(inst.bind(I64), template.rex().w());
+ }
+
+ /// Add encodings for `inst.i32` to X86_32.
+ /// Add encodings for `inst.i32` to X86_64 with and without REX.
+ /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
+ fn enc_i32_i64_instp(
+ &mut self,
+ inst: &Instruction,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ self.enc32_func(inst.bind(I32), template.nonrex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+
+ // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
+ // reg-alloc would never use r8 and up.
+ self.enc64_func(inst.bind(I32), template.rex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+ self.enc64_func(inst.bind(I32), template.nonrex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+ self.enc64_func(inst.bind(I64), template.rex().w(), |builder| {
+ builder.inst_predicate(instp)
+ });
+ }
+
+ /// Add encodings for `inst.r32` to X86_32.
+ /// Add encodings for `inst.r32` to X86_64 with and without REX.
+ /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
+ fn enc_r32_r64_instp(
+ &mut self,
+ inst: &Instruction,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ self.enc32_func(inst.bind(R32), template.nonrex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+
+ // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise
+ // reg-alloc would never use r8 and up.
+ self.enc64_func(inst.bind(R32), template.rex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+ self.enc64_func(inst.bind(R32), template.nonrex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+ self.enc64_func(inst.bind(R64), template.rex().w(), |builder| {
+ builder.inst_predicate(instp)
+ });
+ }
+
+ /// Add encodings for `inst.r32` to X86_32.
+ /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix.
+ fn enc_r32_r64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) {
+ let inst: InstSpec = inst.into();
+ self.enc32(inst.bind(R32), template.nonrex());
+ self.enc64(inst.bind(R64), template.rex().w());
+ }
+
+ fn enc_r32_r64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
+ self.enc32(inst.clone().bind(R32).bind(Any), template.clone());
+
+ // REX-less encoding must come after REX encoding so we don't use it by
+ // default. Otherwise reg-alloc would never use r8 and up.
+ self.enc64(inst.clone().bind(R32).bind(Any), template.clone().rex());
+ self.enc64(inst.clone().bind(R32).bind(Any), template.clone());
+
+ if w_bit {
+ self.enc64(inst.clone().bind(R64).bind(Any), template.rex().w());
+ } else {
+ self.enc64(inst.clone().bind(R64).bind(Any), template.clone().rex());
+ self.enc64(inst.clone().bind(R64).bind(Any), template);
+ }
+ }
+
+ /// Add encodings for `inst` to X86_64 with and without a REX prefix.
+ fn enc_x86_64(&mut self, inst: impl Into<InstSpec> + Clone, template: Template) {
+ // See above comment about the ordering of rex vs non-rex encodings.
+ self.enc64(inst.clone(), template.rex());
+ self.enc64(inst, template);
+ }
+
+ /// Add encodings for `inst` to X86_64 with and without a REX prefix.
+ fn enc_x86_64_instp(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ // See above comment about the ordering of rex vs non-rex encodings.
+ self.enc64_func(inst.clone(), template.rex(), |builder| {
+ builder.inst_predicate(instp.clone())
+ });
+ self.enc64_func(inst, template, |builder| builder.inst_predicate(instp));
+ }
+ fn enc_x86_64_isap(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ isap: SettingPredicateNumber,
+ ) {
+ // See above comment about the ordering of rex vs non-rex encodings.
+ self.enc64_isap(inst.clone(), template.rex(), isap);
+ self.enc64_isap(inst, template, isap);
+ }
+
+ /// Add all three encodings for `inst`:
+ /// - X86_32
+ /// - X86_64 with and without the REX prefix.
+ fn enc_both(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
+ self.enc32(inst.clone(), template.clone());
+ self.enc_x86_64(inst, template);
+ }
+ fn enc_both_isap(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ isap: SettingPredicateNumber,
+ ) {
+ self.enc32_isap(inst.clone(), template.clone(), isap);
+ self.enc_x86_64_isap(inst, template, isap);
+ }
+ fn enc_both_instp(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ instp: InstructionPredicateNode,
+ ) {
+ self.enc32_instp(inst.clone(), template.clone(), instp.clone());
+ self.enc_x86_64_instp(inst, template, instp);
+ }
+
+ /// Add two encodings for `inst`:
+ /// - X86_32, no REX prefix, since this is not valid in 32-bit mode.
+ /// - X86_64, dynamically infer the REX prefix.
+ fn enc_both_inferred(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
+ self.enc32(inst.clone(), template.clone());
+ self.enc64(inst, template.infer_rex());
+ }
+ fn enc_both_inferred_maybe_isap(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ isap: Option<SettingPredicateNumber>,
+ ) {
+ self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
+ self.enc64_maybe_isap(inst, template.infer_rex(), isap);
+ }
+
+ /// Add two encodings for `inst`:
+ /// - X86_32
+ /// - X86_64 with the REX prefix.
+ fn enc_both_rex_only(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) {
+ self.enc32(inst.clone(), template.clone());
+ self.enc64(inst, template.rex());
+ }
+
+ /// Add encodings for `inst.i32` to X86_32.
+ /// Add encodings for `inst.i32` to X86_64 with and without REX.
+ /// Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit`
+ /// argument to determine whether or not to set the REX.W bit.
+ fn enc_i32_i64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) {
+ self.enc32(inst.clone().bind(I32).bind(Any), template.clone());
+
+ // REX-less encoding must come after REX encoding so we don't use it by
+ // default. Otherwise reg-alloc would never use r8 and up.
+ self.enc64(inst.clone().bind(I32).bind(Any), template.clone().rex());
+ self.enc64(inst.clone().bind(I32).bind(Any), template.clone());
+
+ if w_bit {
+ self.enc64(inst.clone().bind(I64).bind(Any), template.rex().w());
+ } else {
+ self.enc64(inst.clone().bind(I64).bind(Any), template.clone().rex());
+ self.enc64(inst.clone().bind(I64).bind(Any), template);
+ }
+ }
+
+ /// Add the same encoding/recipe pairing to both X86_32 and X86_64.
+ fn enc_32_64_rec(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ recipe: &EncodingRecipe,
+ bits: u16,
+ ) {
+ self.enc32_rec(inst.clone(), recipe, bits);
+ self.enc64_rec(inst, recipe, bits);
+ }
+
+ /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened.
+ fn enc_32_64_func<T>(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ builder_closure: T,
+ ) where
+ T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+ {
+ let encoding = self.make_encoding(inst.into(), template, builder_closure);
+ self.enc32.push(encoding.clone());
+ self.enc64.push(encoding);
+ }
+
+ /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
+ /// binding) has already happened.
+ fn enc_32_64_maybe_isap(
+ &mut self,
+ inst: impl Clone + Into<InstSpec>,
+ template: Template,
+ isap: Option<SettingPredicateNumber>,
+ ) {
+ self.enc32_maybe_isap(inst.clone(), template.clone(), isap);
+ self.enc64_maybe_isap(inst, template, isap);
+ }
+
+ fn enc32_maybe_isap(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ isap: Option<SettingPredicateNumber>,
+ ) {
+ match isap {
+ None => self.enc32(inst, template),
+ Some(isap) => self.enc32_isap(inst, template, isap),
+ }
+ }
+
+ fn enc64_maybe_isap(
+ &mut self,
+ inst: impl Into<InstSpec>,
+ template: Template,
+ isap: Option<SettingPredicateNumber>,
+ ) {
+ match isap {
+ None => self.enc64(inst, template),
+ Some(isap) => self.enc64_isap(inst, template, isap),
+ }
+ }
+}
+
+// Definitions.
+
+#[inline(never)]
+fn define_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let bconst = shared.by_name("bconst");
+ let bint = shared.by_name("bint");
+ let copy = shared.by_name("copy");
+ let copy_special = shared.by_name("copy_special");
+ let copy_to_ssa = shared.by_name("copy_to_ssa");
+ let get_pinned_reg = shared.by_name("get_pinned_reg");
+ let iconst = shared.by_name("iconst");
+ let ireduce = shared.by_name("ireduce");
+ let regmove = shared.by_name("regmove");
+ let sextend = shared.by_name("sextend");
+ let set_pinned_reg = shared.by_name("set_pinned_reg");
+ let uextend = shared.by_name("uextend");
+ let dummy_sarg_t = shared.by_name("dummy_sarg_t");
+
+ // Shorthands for recipes.
+ let rec_copysp = r.template("copysp");
+ let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
+ let rec_get_pinned_reg = r.recipe("get_pinned_reg");
+ let rec_null = r.recipe("null");
+ let rec_pu_id = r.template("pu_id");
+ let rec_pu_id_bool = r.template("pu_id_bool");
+ let rec_pu_iq = r.template("pu_iq");
+ let rec_rmov = r.template("rmov");
+ let rec_set_pinned_reg = r.template("set_pinned_reg");
+ let rec_u_id = r.template("u_id");
+ let rec_u_id_z = r.template("u_id_z");
+ let rec_umr = r.template("umr");
+ let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
+ let rec_urm_noflags = r.template("urm_noflags");
+ let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
+ let rec_dummy_sarg_t = r.recipe("dummy_sarg_t");
+
+ // The pinned register holds a value that is entirely user-controlled, so `get_pinned_reg` generates no code at all!
+ e.enc64_rec(get_pinned_reg.bind(I64), rec_get_pinned_reg, 0);
+ e.enc_x86_64(
+ set_pinned_reg.bind(I64),
+ rec_set_pinned_reg.opcodes(&MOV_STORE).rex().w(),
+ );
+
+ e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE));
+ e.enc_r32_r64_rex_only(copy, rec_umr.opcodes(&MOV_STORE));
+ e.enc_both(copy.bind(B1), rec_umr.opcodes(&MOV_STORE));
+ e.enc_both(copy.bind(I8), rec_umr.opcodes(&MOV_STORE));
+ e.enc_both(copy.bind(I16), rec_umr.opcodes(&MOV_STORE));
+
+ // TODO For x86-64, only define REX forms for now, since we can't describe the
+ // special regunit immediate operands with the current constraint language.
+ for &ty in &[I8, I16, I32] {
+ e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
+ e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
+ }
+ for &ty in &[B8, B16, B32] {
+ e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE));
+ e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex());
+ }
+ e.enc64(regmove.bind(I64), rec_rmov.opcodes(&MOV_STORE).rex().w());
+ e.enc_both(regmove.bind(B1), rec_rmov.opcodes(&MOV_STORE));
+ e.enc_both(regmove.bind(I8), rec_rmov.opcodes(&MOV_STORE));
+ e.enc32(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE));
+ e.enc64(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE).rex());
+ e.enc64(regmove.bind(R64), rec_rmov.opcodes(&MOV_STORE).rex().w());
+
+ // Immediate constants.
+ e.enc32(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));
+
+ e.enc64(iconst.bind(I32), rec_pu_id.rex().opcodes(&MOV_IMM));
+ e.enc64(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM));
+
+ // The 32-bit immediate movl also zero-extends to 64 bits.
+ let is_unsigned_int32 =
+ InstructionPredicate::new_is_unsigned_int(&*formats.unary_imm, "imm", 32, 0);
+
+ e.enc64_func(
+ iconst.bind(I64),
+ rec_pu_id.opcodes(&MOV_IMM).rex(),
+ |encoding| encoding.inst_predicate(is_unsigned_int32.clone()),
+ );
+ e.enc64_func(iconst.bind(I64), rec_pu_id.opcodes(&MOV_IMM), |encoding| {
+ encoding.inst_predicate(is_unsigned_int32)
+ });
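// Editor's note (not part of the upstream diff): as an example, an
// `iconst.i64` of 0x8000_0000 satisfies `is_unsigned_int32` and can use the
// plain 32-bit movl above (the CPU zero-extends into the upper half), while a
// negative constant such as -1 would go through the sign-extended 32-bit form
// below, and a constant like 0x1_2345_6789_0000 needs the full 8-byte
// immediate with REX.W further down.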
+
+ // Sign-extended 32-bit immediate.
+ e.enc64(
+ iconst.bind(I64),
+ rec_u_id.rex().opcodes(&MOV_IMM_SIGNEXTEND).rrr(0).w(),
+ );
+
+ // Finally, the MOV_IMM opcode takes an 8-byte immediate with a REX.W prefix.
+ e.enc64(iconst.bind(I64), rec_pu_iq.opcodes(&MOV_IMM).rex().w());
+
+ // Bool constants (uses MOV)
+ for &ty in &[B1, B8, B16, B32] {
+ e.enc_both(bconst.bind(ty), rec_pu_id_bool.opcodes(&MOV_IMM));
+ }
+ e.enc64(bconst.bind(B64), rec_pu_id_bool.opcodes(&MOV_IMM).rex());
+
+ let is_zero_int = InstructionPredicate::new_is_zero_int(&formats.unary_imm, "imm");
+ e.enc_both_instp(
+ iconst.bind(I8),
+ rec_u_id_z.opcodes(&XORB),
+ is_zero_int.clone(),
+ );
+
+ // You might expect i16 encodings to have a 0x66 prefix on the opcode, indicating that the
+ // operation is on 16-bit operands (e.g. "xor %ax, %ax"). Cranelift currently does not
+ // know that it can drop the 0x66 prefix and clear the upper half of a 32-bit register in these
+ // scenarios, so we explicitly select a wider but permissible opcode.
+ //
+ // This effectively formalizes the i16->i32 widening that Cranelift performs when there isn't
+ // an appropriate i16 encoding available.
+ e.enc_both_instp(
+ iconst.bind(I16),
+ rec_u_id_z.opcodes(&XOR),
+ is_zero_int.clone(),
+ );
+ e.enc_both_instp(
+ iconst.bind(I32),
+ rec_u_id_z.opcodes(&XOR),
+ is_zero_int.clone(),
+ );
+ e.enc_x86_64_instp(iconst.bind(I64), rec_u_id_z.opcodes(&XOR), is_zero_int);
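// Editor's note (not part of the upstream diff): concretely, `iconst.i16 0`
// and `iconst.i32 0` are both emitted here as a 32-bit `xor r32, r32` rather
// than a 0x66-prefixed 16-bit xor; clearing the full 32-bit register is
// harmless and the encoding is shorter.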
+
+ // Numerical conversions.
+
+ // Reducing an integer is a no-op.
+ e.enc32_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
+ e.enc32_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
+ e.enc32_rec(ireduce.bind(I16).bind(I32), rec_null, 0);
+
+ e.enc64_rec(ireduce.bind(I8).bind(I16), rec_null, 0);
+ e.enc64_rec(ireduce.bind(I8).bind(I32), rec_null, 0);
+ e.enc64_rec(ireduce.bind(I16).bind(I32), rec_null, 0);
+ e.enc64_rec(ireduce.bind(I8).bind(I64), rec_null, 0);
+ e.enc64_rec(ireduce.bind(I16).bind(I64), rec_null, 0);
+ e.enc64_rec(ireduce.bind(I32).bind(I64), rec_null, 0);
+
+ // TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
+ // instructions for %al/%ax/%eax to %ax/%eax/%rax.
+
+ // movsbl
+ e.enc32(
+ sextend.bind(I32).bind(I8),
+ rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
+ );
+ e.enc64(
+ sextend.bind(I32).bind(I8),
+ rec_urm_noflags.opcodes(&MOVSX_BYTE).rex(),
+ );
+ e.enc64(
+ sextend.bind(I32).bind(I8),
+ rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE),
+ );
+
+ // movswl
+ e.enc32(
+ sextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVSX_WORD),
+ );
+ e.enc64(
+ sextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVSX_WORD).rex(),
+ );
+ e.enc64(
+ sextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVSX_WORD),
+ );
+
+ // movsbq
+ e.enc64(
+ sextend.bind(I64).bind(I8),
+ rec_urm_noflags.opcodes(&MOVSX_BYTE).rex().w(),
+ );
+
+ // movswq
+ e.enc64(
+ sextend.bind(I64).bind(I16),
+ rec_urm_noflags.opcodes(&MOVSX_WORD).rex().w(),
+ );
+
+ // movslq
+ e.enc64(
+ sextend.bind(I64).bind(I32),
+ rec_urm_noflags.opcodes(&MOVSXD).rex().w(),
+ );
+
+ // movzbl
+ e.enc32(
+ uextend.bind(I32).bind(I8),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+ e.enc64(
+ uextend.bind(I32).bind(I8),
+ rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
+ );
+ e.enc64(
+ uextend.bind(I32).bind(I8),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+
+ // movzwl
+ e.enc32(
+ uextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVZX_WORD),
+ );
+ e.enc64(
+ uextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
+ );
+ e.enc64(
+ uextend.bind(I32).bind(I16),
+ rec_urm_noflags.opcodes(&MOVZX_WORD),
+ );
+
+ // movzbq, encoded as movzbl because it's equivalent and shorter.
+ e.enc64(
+ uextend.bind(I64).bind(I8),
+ rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
+ );
+ e.enc64(
+ uextend.bind(I64).bind(I8),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+
+ // movzwq, encoded as movzwl because it's equivalent and shorter.
+ e.enc64(
+ uextend.bind(I64).bind(I16),
+ rec_urm_noflags.opcodes(&MOVZX_WORD).rex(),
+ );
+ e.enc64(
+ uextend.bind(I64).bind(I16),
+ rec_urm_noflags.opcodes(&MOVZX_WORD),
+ );
+
+ // A 32-bit register copy clears the high 32 bits.
+ e.enc64(
+ uextend.bind(I64).bind(I32),
+ rec_umr.opcodes(&MOV_STORE).rex(),
+ );
+ e.enc64(uextend.bind(I64).bind(I32), rec_umr.opcodes(&MOV_STORE));
+
+ // Convert bool to int.
+ //
+ // This assumes that b1 is represented as an 8-bit low register with the value 0
+ // or 1.
+ //
+ // Encode movzbq as movzbl, because it's equivalent and shorter.
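+ // (A movzbl-style zero-extension, e.g. "movzbl %al, %eax", copies the 0-or-1 byte into the
+ // full destination; on x86-64 the 32-bit form also clears the upper half of the register.)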
+ for &to in &[I8, I16, I32, I64] {
+ for &from in &[B1, B8] {
+ e.enc64(
+ bint.bind(to).bind(from),
+ rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
+ );
+ e.enc64(
+ bint.bind(to).bind(from),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+ if to != I64 {
+ e.enc32(
+ bint.bind(to).bind(from),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+ }
+ }
+ }
+ for (to, from) in &[(I16, B16), (I32, B32), (I64, B64)] {
+ e.enc_both(
+ bint.bind(*to).bind(*from),
+ rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
+ );
+ }
+
+ // Copy Special
+ // For x86-64, only define REX forms for now, since we can't describe the
+ // special regunit immediate operands with the current constraint language.
+ e.enc64(copy_special, rec_copysp.opcodes(&MOV_STORE).rex().w());
+ e.enc32(copy_special, rec_copysp.opcodes(&MOV_STORE));
+
+ // Copy to SSA. These have to be done with special _rex_only encoders, because the standard
+ // machinery for deciding whether a REX.{RXB} prefix is needed doesn't take into account
+ // the source register, which is specified directly in the instruction.
+ e.enc_i32_i64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
+ e.enc_r32_r64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
+ e.enc_both_rex_only(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
+ e.enc_both_rex_only(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(&MOV_STORE));
+ e.enc_both_rex_only(
+ copy_to_ssa.bind(I16),
+ rec_umr_reg_to_ssa.opcodes(&MOV_STORE),
+ );
+ e.enc_both_rex_only(
+ copy_to_ssa.bind(F64),
+ rec_furm_reg_to_ssa.opcodes(&MOVSD_LOAD),
+ );
+ e.enc_both_rex_only(
+ copy_to_ssa.bind(F32),
+ rec_furm_reg_to_ssa.opcodes(&MOVSS_LOAD),
+ );
+
+ e.enc_32_64_rec(dummy_sarg_t, rec_dummy_sarg_t, 0);
+}
+
+#[inline(never)]
+fn define_memory(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let adjust_sp_down = shared.by_name("adjust_sp_down");
+ let adjust_sp_down_imm = shared.by_name("adjust_sp_down_imm");
+ let adjust_sp_up_imm = shared.by_name("adjust_sp_up_imm");
+ let copy_nop = shared.by_name("copy_nop");
+ let fill = shared.by_name("fill");
+ let fill_nop = shared.by_name("fill_nop");
+ let istore16 = shared.by_name("istore16");
+ let istore16_complex = shared.by_name("istore16_complex");
+ let istore32 = shared.by_name("istore32");
+ let istore32_complex = shared.by_name("istore32_complex");
+ let istore8 = shared.by_name("istore8");
+ let istore8_complex = shared.by_name("istore8_complex");
+ let load = shared.by_name("load");
+ let load_complex = shared.by_name("load_complex");
+ let regfill = shared.by_name("regfill");
+ let regspill = shared.by_name("regspill");
+ let sload16 = shared.by_name("sload16");
+ let sload16_complex = shared.by_name("sload16_complex");
+ let sload32 = shared.by_name("sload32");
+ let sload32_complex = shared.by_name("sload32_complex");
+ let sload8 = shared.by_name("sload8");
+ let sload8_complex = shared.by_name("sload8_complex");
+ let spill = shared.by_name("spill");
+ let store = shared.by_name("store");
+ let store_complex = shared.by_name("store_complex");
+ let uload16 = shared.by_name("uload16");
+ let uload16_complex = shared.by_name("uload16_complex");
+ let uload32 = shared.by_name("uload32");
+ let uload32_complex = shared.by_name("uload32_complex");
+ let uload8 = shared.by_name("uload8");
+ let uload8_complex = shared.by_name("uload8_complex");
+ let x86_pop = x86.by_name("x86_pop");
+ let x86_push = x86.by_name("x86_push");
+
+ // Shorthands for recipes.
+ let rec_adjustsp = r.template("adjustsp");
+ let rec_adjustsp_ib = r.template("adjustsp_ib");
+ let rec_adjustsp_id = r.template("adjustsp_id");
+ let rec_ffillnull = r.recipe("ffillnull");
+ let rec_fillnull = r.recipe("fillnull");
+ let rec_fillSib32 = r.template("fillSib32");
+ let rec_ld = r.template("ld");
+ let rec_ldDisp32 = r.template("ldDisp32");
+ let rec_ldDisp8 = r.template("ldDisp8");
+ let rec_ldWithIndex = r.template("ldWithIndex");
+ let rec_ldWithIndexDisp32 = r.template("ldWithIndexDisp32");
+ let rec_ldWithIndexDisp8 = r.template("ldWithIndexDisp8");
+ let rec_popq = r.template("popq");
+ let rec_pushq = r.template("pushq");
+ let rec_regfill32 = r.template("regfill32");
+ let rec_regspill32 = r.template("regspill32");
+ let rec_spillSib32 = r.template("spillSib32");
+ let rec_st = r.template("st");
+ let rec_stacknull = r.recipe("stacknull");
+ let rec_stDisp32 = r.template("stDisp32");
+ let rec_stDisp32_abcd = r.template("stDisp32_abcd");
+ let rec_stDisp8 = r.template("stDisp8");
+ let rec_stDisp8_abcd = r.template("stDisp8_abcd");
+ let rec_stWithIndex = r.template("stWithIndex");
+ let rec_stWithIndexDisp32 = r.template("stWithIndexDisp32");
+ let rec_stWithIndexDisp32_abcd = r.template("stWithIndexDisp32_abcd");
+ let rec_stWithIndexDisp8 = r.template("stWithIndexDisp8");
+ let rec_stWithIndexDisp8_abcd = r.template("stWithIndexDisp8_abcd");
+ let rec_stWithIndex_abcd = r.template("stWithIndex_abcd");
+ let rec_st_abcd = r.template("st_abcd");
+
+ // Loads and stores.
+ let is_load_complex_length_two =
+ InstructionPredicate::new_length_equals(&*formats.load_complex, 2);
+
+ for recipe in &[rec_ldWithIndex, rec_ldWithIndexDisp8, rec_ldWithIndexDisp32] {
+ e.enc_i32_i64_instp(
+ load_complex,
+ recipe.opcodes(&MOV_LOAD),
+ is_load_complex_length_two.clone(),
+ );
+ e.enc_r32_r64_instp(
+ load_complex,
+ recipe.opcodes(&MOV_LOAD),
+ is_load_complex_length_two.clone(),
+ );
+ e.enc_x86_64_instp(
+ uload32_complex,
+ recipe.opcodes(&MOV_LOAD),
+ is_load_complex_length_two.clone(),
+ );
+
+ e.enc64_instp(
+ sload32_complex,
+ recipe.opcodes(&MOVSXD).rex().w(),
+ is_load_complex_length_two.clone(),
+ );
+
+ e.enc_i32_i64_instp(
+ uload16_complex,
+ recipe.opcodes(&MOVZX_WORD),
+ is_load_complex_length_two.clone(),
+ );
+ e.enc_i32_i64_instp(
+ sload16_complex,
+ recipe.opcodes(&MOVSX_WORD),
+ is_load_complex_length_two.clone(),
+ );
+
+ e.enc_i32_i64_instp(
+ uload8_complex,
+ recipe.opcodes(&MOVZX_BYTE),
+ is_load_complex_length_two.clone(),
+ );
+
+ e.enc_i32_i64_instp(
+ sload8_complex,
+ recipe.opcodes(&MOVSX_BYTE),
+ is_load_complex_length_two.clone(),
+ );
+ }
+
+ let is_store_complex_length_three =
+ InstructionPredicate::new_length_equals(&*formats.store_complex, 3);
+
+ for recipe in &[rec_stWithIndex, rec_stWithIndexDisp8, rec_stWithIndexDisp32] {
+ e.enc_i32_i64_instp(
+ store_complex,
+ recipe.opcodes(&MOV_STORE),
+ is_store_complex_length_three.clone(),
+ );
+ e.enc_r32_r64_instp(
+ store_complex,
+ recipe.opcodes(&MOV_STORE),
+ is_store_complex_length_three.clone(),
+ );
+ e.enc_x86_64_instp(
+ istore32_complex,
+ recipe.opcodes(&MOV_STORE),
+ is_store_complex_length_three.clone(),
+ );
+ e.enc_both_instp(
+ istore16_complex.bind(I32),
+ recipe.opcodes(&MOV_STORE_16),
+ is_store_complex_length_three.clone(),
+ );
+ e.enc_x86_64_instp(
+ istore16_complex.bind(I64),
+ recipe.opcodes(&MOV_STORE_16),
+ is_store_complex_length_three.clone(),
+ );
+ }
+
+ for recipe in &[
+ rec_stWithIndex_abcd,
+ rec_stWithIndexDisp8_abcd,
+ rec_stWithIndexDisp32_abcd,
+ ] {
+ e.enc_both_instp(
+ istore8_complex.bind(I32),
+ recipe.opcodes(&MOV_BYTE_STORE),
+ is_store_complex_length_three.clone(),
+ );
+ e.enc_x86_64_instp(
+ istore8_complex.bind(I64),
+ recipe.opcodes(&MOV_BYTE_STORE),
+ is_store_complex_length_three.clone(),
+ );
+ }
+
+ for recipe in &[rec_st, rec_stDisp8, rec_stDisp32] {
+ e.enc_i32_i64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
+ e.enc_r32_r64_ld_st(store, true, recipe.opcodes(&MOV_STORE));
+ e.enc_x86_64(istore32.bind(I64).bind(Any), recipe.opcodes(&MOV_STORE));
+ e.enc_i32_i64_ld_st(istore16, false, recipe.opcodes(&MOV_STORE_16));
+ }
+
+ // Byte stores are more complicated because the registers they can address
+ // depend on the presence of a REX prefix. The st*_abcd recipes fall back to
+ // the corresponding st* recipes when a REX prefix is applied.
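+ // (Without a REX prefix, only %al, %cl, %dl and %bl, plus the high-byte registers, can be
+ // addressed as byte operands; with a REX prefix the low byte of any GPR becomes addressable.)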
+
+ for recipe in &[rec_st_abcd, rec_stDisp8_abcd, rec_stDisp32_abcd] {
+ e.enc_both(istore8.bind(I32).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
+ e.enc_x86_64(istore8.bind(I64).bind(Any), recipe.opcodes(&MOV_BYTE_STORE));
+ }
+
+ e.enc_i32_i64_explicit_rex(spill, rec_spillSib32.opcodes(&MOV_STORE));
+ e.enc_i32_i64_explicit_rex(regspill, rec_regspill32.opcodes(&MOV_STORE));
+ e.enc_r32_r64_rex_only(spill, rec_spillSib32.opcodes(&MOV_STORE));
+ e.enc_r32_r64_rex_only(regspill, rec_regspill32.opcodes(&MOV_STORE));
+
+ // Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
+ // constraining the permitted registers.
+ // See MIN_SPILL_SLOT_SIZE which makes this safe.
+
+ e.enc_both(spill.bind(B1), rec_spillSib32.opcodes(&MOV_STORE));
+ e.enc_both(regspill.bind(B1), rec_regspill32.opcodes(&MOV_STORE));
+ for &ty in &[I8, I16] {
+ e.enc_both(spill.bind(ty), rec_spillSib32.opcodes(&MOV_STORE));
+ e.enc_both(regspill.bind(ty), rec_regspill32.opcodes(&MOV_STORE));
+ }
+
+ for recipe in &[rec_ld, rec_ldDisp8, rec_ldDisp32] {
+ e.enc_i32_i64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
+ e.enc_r32_r64_ld_st(load, true, recipe.opcodes(&MOV_LOAD));
+ e.enc_x86_64(uload32.bind(I64), recipe.opcodes(&MOV_LOAD));
+ e.enc64(sload32.bind(I64), recipe.opcodes(&MOVSXD).rex().w());
+ e.enc_i32_i64_ld_st(uload16, true, recipe.opcodes(&MOVZX_WORD));
+ e.enc_i32_i64_ld_st(sload16, true, recipe.opcodes(&MOVSX_WORD));
+ e.enc_i32_i64_ld_st(uload8, true, recipe.opcodes(&MOVZX_BYTE));
+ e.enc_i32_i64_ld_st(sload8, true, recipe.opcodes(&MOVSX_BYTE));
+ }
+
+ e.enc_i32_i64_explicit_rex(fill, rec_fillSib32.opcodes(&MOV_LOAD));
+ e.enc_i32_i64_explicit_rex(regfill, rec_regfill32.opcodes(&MOV_LOAD));
+ e.enc_r32_r64_rex_only(fill, rec_fillSib32.opcodes(&MOV_LOAD));
+ e.enc_r32_r64_rex_only(regfill, rec_regfill32.opcodes(&MOV_LOAD));
+
+ // No-op fills, created by late-stage redundant-fill removal.
+ for &ty in &[I64, I32, I16, I8] {
+ e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
+ e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
+ }
+ e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0);
+ e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0);
+ for &ty in &[F64, F32] {
+ e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0);
+ e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0);
+ }
+ for &ty in &[R64, R32] {
+ e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0);
+ e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0);
+ }
+
+ // Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
+
+ e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(&MOV_LOAD));
+ e.enc_both(regfill.bind(B1), rec_regfill32.opcodes(&MOV_LOAD));
+ for &ty in &[I8, I16] {
+ e.enc_both(fill.bind(ty), rec_fillSib32.opcodes(&MOV_LOAD));
+ e.enc_both(regfill.bind(ty), rec_regfill32.opcodes(&MOV_LOAD));
+ }
+
+ // Push and Pop.
+ e.enc32(x86_push.bind(I32), rec_pushq.opcodes(&PUSH_REG));
+ e.enc_x86_64(x86_push.bind(I64), rec_pushq.opcodes(&PUSH_REG));
+
+ e.enc32(x86_pop.bind(I32), rec_popq.opcodes(&POP_REG));
+ e.enc_x86_64(x86_pop.bind(I64), rec_popq.opcodes(&POP_REG));
+
+ // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn
+ // into a no-op.
+ // The same encoding is generated for both the 64- and 32-bit architectures.
+ for &ty in &[I64, I32, I16, I8] {
+ e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
+ e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
+ }
+ for &ty in &[F64, F32] {
+ e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0);
+ e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0);
+ }
+
+ // Adjust SP down by a dynamic value (or up, with a negative operand).
+ e.enc32(adjust_sp_down.bind(I32), rec_adjustsp.opcodes(&SUB));
+ e.enc64(
+ adjust_sp_down.bind(I64),
+ rec_adjustsp.opcodes(&SUB).rex().w(),
+ );
+
+ // Adjust SP up by an immediate (or down, with a negative immediate).
+ e.enc32(adjust_sp_up_imm, rec_adjustsp_ib.opcodes(&CMP_IMM8));
+ e.enc32(adjust_sp_up_imm, rec_adjustsp_id.opcodes(&CMP_IMM));
+ e.enc64(
+ adjust_sp_up_imm,
+ rec_adjustsp_ib.opcodes(&CMP_IMM8).rex().w(),
+ );
+ e.enc64(
+ adjust_sp_up_imm,
+ rec_adjustsp_id.opcodes(&CMP_IMM).rex().w(),
+ );
+
+ // Adjust SP down by an immediate (or up, with a negative immediate).
+ e.enc32(
+ adjust_sp_down_imm,
+ rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5),
+ );
+ e.enc32(adjust_sp_down_imm, rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5));
+ e.enc64(
+ adjust_sp_down_imm,
+ rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5).rex().w(),
+ );
+ e.enc64(
+ adjust_sp_down_imm,
+ rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5).rex().w(),
+ );
+}
+
+#[inline(never)]
+fn define_fpu_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
+ let shared = &shared_defs.instructions;
+
+ // Shorthands for instructions.
+ let bitcast = shared.by_name("bitcast");
+ let copy = shared.by_name("copy");
+ let regmove = shared.by_name("regmove");
+
+ // Shorthands for recipes.
+ let rec_frmov = r.template("frmov");
+ let rec_frurm = r.template("frurm");
+ let rec_furm = r.template("furm");
+ let rec_rfumr = r.template("rfumr");
+
+ // Floating-point moves.
+ // movd
+ e.enc_both(
+ bitcast.bind(F32).bind(I32),
+ rec_frurm.opcodes(&MOVD_LOAD_XMM),
+ );
+ e.enc_both(
+ bitcast.bind(I32).bind(F32),
+ rec_rfumr.opcodes(&MOVD_STORE_XMM),
+ );
+
+ // movq
+ e.enc64(
+ bitcast.bind(F64).bind(I64),
+ rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
+ );
+ e.enc64(
+ bitcast.bind(I64).bind(F64),
+ rec_rfumr.opcodes(&MOVD_STORE_XMM).rex().w(),
+ );
+
+ // movaps
+ e.enc_both(copy.bind(F32), rec_furm.opcodes(&MOVAPS_LOAD));
+ e.enc_both(copy.bind(F64), rec_furm.opcodes(&MOVAPS_LOAD));
+
+ // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
+ // immediate operands with the current constraint language.
+ e.enc32(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD));
+ e.enc64(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD).rex());
+
+ // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit
+ // immediate operands with the current constraint language.
+ e.enc32(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD));
+ e.enc64(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD).rex());
+}
+
+#[inline(never)]
+fn define_fpu_memory(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+
+ // Shorthands for instructions.
+ let fill = shared.by_name("fill");
+ let load = shared.by_name("load");
+ let load_complex = shared.by_name("load_complex");
+ let regfill = shared.by_name("regfill");
+ let regspill = shared.by_name("regspill");
+ let spill = shared.by_name("spill");
+ let store = shared.by_name("store");
+ let store_complex = shared.by_name("store_complex");
+
+ // Shorthands for recipes.
+ let rec_ffillSib32 = r.template("ffillSib32");
+ let rec_fld = r.template("fld");
+ let rec_fldDisp32 = r.template("fldDisp32");
+ let rec_fldDisp8 = r.template("fldDisp8");
+ let rec_fldWithIndex = r.template("fldWithIndex");
+ let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
+ let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
+ let rec_fregfill32 = r.template("fregfill32");
+ let rec_fregspill32 = r.template("fregspill32");
+ let rec_fspillSib32 = r.template("fspillSib32");
+ let rec_fst = r.template("fst");
+ let rec_fstDisp32 = r.template("fstDisp32");
+ let rec_fstDisp8 = r.template("fstDisp8");
+ let rec_fstWithIndex = r.template("fstWithIndex");
+ let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
+ let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
+
+ // Float loads and stores.
+ e.enc_both(load.bind(F32).bind(Any), rec_fld.opcodes(&MOVSS_LOAD));
+ e.enc_both(load.bind(F32).bind(Any), rec_fldDisp8.opcodes(&MOVSS_LOAD));
+ e.enc_both(load.bind(F32).bind(Any), rec_fldDisp32.opcodes(&MOVSS_LOAD));
+
+ e.enc_both(
+ load_complex.bind(F32),
+ rec_fldWithIndex.opcodes(&MOVSS_LOAD),
+ );
+ e.enc_both(
+ load_complex.bind(F32),
+ rec_fldWithIndexDisp8.opcodes(&MOVSS_LOAD),
+ );
+ e.enc_both(
+ load_complex.bind(F32),
+ rec_fldWithIndexDisp32.opcodes(&MOVSS_LOAD),
+ );
+
+ e.enc_both(load.bind(F64).bind(Any), rec_fld.opcodes(&MOVSD_LOAD));
+ e.enc_both(load.bind(F64).bind(Any), rec_fldDisp8.opcodes(&MOVSD_LOAD));
+ e.enc_both(load.bind(F64).bind(Any), rec_fldDisp32.opcodes(&MOVSD_LOAD));
+
+ e.enc_both(
+ load_complex.bind(F64),
+ rec_fldWithIndex.opcodes(&MOVSD_LOAD),
+ );
+ e.enc_both(
+ load_complex.bind(F64),
+ rec_fldWithIndexDisp8.opcodes(&MOVSD_LOAD),
+ );
+ e.enc_both(
+ load_complex.bind(F64),
+ rec_fldWithIndexDisp32.opcodes(&MOVSD_LOAD),
+ );
+
+ e.enc_both(store.bind(F32).bind(Any), rec_fst.opcodes(&MOVSS_STORE));
+ e.enc_both(
+ store.bind(F32).bind(Any),
+ rec_fstDisp8.opcodes(&MOVSS_STORE),
+ );
+ e.enc_both(
+ store.bind(F32).bind(Any),
+ rec_fstDisp32.opcodes(&MOVSS_STORE),
+ );
+
+ e.enc_both(
+ store_complex.bind(F32),
+ rec_fstWithIndex.opcodes(&MOVSS_STORE),
+ );
+ e.enc_both(
+ store_complex.bind(F32),
+ rec_fstWithIndexDisp8.opcodes(&MOVSS_STORE),
+ );
+ e.enc_both(
+ store_complex.bind(F32),
+ rec_fstWithIndexDisp32.opcodes(&MOVSS_STORE),
+ );
+
+ e.enc_both(store.bind(F64).bind(Any), rec_fst.opcodes(&MOVSD_STORE));
+ e.enc_both(
+ store.bind(F64).bind(Any),
+ rec_fstDisp8.opcodes(&MOVSD_STORE),
+ );
+ e.enc_both(
+ store.bind(F64).bind(Any),
+ rec_fstDisp32.opcodes(&MOVSD_STORE),
+ );
+
+ e.enc_both(
+ store_complex.bind(F64),
+ rec_fstWithIndex.opcodes(&MOVSD_STORE),
+ );
+ e.enc_both(
+ store_complex.bind(F64),
+ rec_fstWithIndexDisp8.opcodes(&MOVSD_STORE),
+ );
+ e.enc_both(
+ store_complex.bind(F64),
+ rec_fstWithIndexDisp32.opcodes(&MOVSD_STORE),
+ );
+
+ e.enc_both(fill.bind(F32), rec_ffillSib32.opcodes(&MOVSS_LOAD));
+ e.enc_both(regfill.bind(F32), rec_fregfill32.opcodes(&MOVSS_LOAD));
+ e.enc_both(fill.bind(F64), rec_ffillSib32.opcodes(&MOVSD_LOAD));
+ e.enc_both(regfill.bind(F64), rec_fregfill32.opcodes(&MOVSD_LOAD));
+
+ e.enc_both(spill.bind(F32), rec_fspillSib32.opcodes(&MOVSS_STORE));
+ e.enc_both(regspill.bind(F32), rec_fregspill32.opcodes(&MOVSS_STORE));
+ e.enc_both(spill.bind(F64), rec_fspillSib32.opcodes(&MOVSD_STORE));
+ e.enc_both(regspill.bind(F64), rec_fregspill32.opcodes(&MOVSD_STORE));
+}
+
+#[inline(never)]
+fn define_fpu_ops(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let ceil = shared.by_name("ceil");
+ let f32const = shared.by_name("f32const");
+ let f64const = shared.by_name("f64const");
+ let fadd = shared.by_name("fadd");
+ let fcmp = shared.by_name("fcmp");
+ let fcvt_from_sint = shared.by_name("fcvt_from_sint");
+ let fdemote = shared.by_name("fdemote");
+ let fdiv = shared.by_name("fdiv");
+ let ffcmp = shared.by_name("ffcmp");
+ let floor = shared.by_name("floor");
+ let fmul = shared.by_name("fmul");
+ let fpromote = shared.by_name("fpromote");
+ let fsub = shared.by_name("fsub");
+ let nearest = shared.by_name("nearest");
+ let sqrt = shared.by_name("sqrt");
+ let trunc = shared.by_name("trunc");
+ let x86_cvtt2si = x86.by_name("x86_cvtt2si");
+ let x86_fmax = x86.by_name("x86_fmax");
+ let x86_fmin = x86.by_name("x86_fmin");
+
+ // Shorthands for recipes.
+ let rec_f32imm_z = r.template("f32imm_z");
+ let rec_f64imm_z = r.template("f64imm_z");
+ let rec_fa = r.template("fa");
+ let rec_fcmp = r.template("fcmp");
+ let rec_fcscc = r.template("fcscc");
+ let rec_frurm = r.template("frurm");
+ let rec_furm = r.template("furm");
+ let rec_furmi_rnd = r.template("furmi_rnd");
+ let rec_rfurm = r.template("rfurm");
+
+ // Predicates shorthands.
+ let use_sse41 = settings.predicate_by_name("use_sse41");
+
+ // Floating-point constants equal to 0.0 can be encoded using either `xorps` or `xorpd`, for
+ // 32-bit and 64-bit floats respectively.
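+ // (e.g. "xorps %xmm0, %xmm0" materializes +0.0 in one short instruction, avoiding a
+ // constant-pool load.)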
+ let is_zero_32_bit_float =
+ InstructionPredicate::new_is_zero_32bit_float(&*formats.unary_ieee32, "imm");
+ e.enc32_instp(
+ f32const,
+ rec_f32imm_z.opcodes(&XORPS),
+ is_zero_32_bit_float.clone(),
+ );
+
+ let is_zero_64_bit_float =
+ InstructionPredicate::new_is_zero_64bit_float(&*formats.unary_ieee64, "imm");
+ e.enc32_instp(
+ f64const,
+ rec_f64imm_z.opcodes(&XORPD),
+ is_zero_64_bit_float.clone(),
+ );
+
+ e.enc_x86_64_instp(f32const, rec_f32imm_z.opcodes(&XORPS), is_zero_32_bit_float);
+ e.enc_x86_64_instp(f64const, rec_f64imm_z.opcodes(&XORPD), is_zero_64_bit_float);
+
+ // cvtsi2ss
+ e.enc_i32_i64(fcvt_from_sint.bind(F32), rec_frurm.opcodes(&CVTSI2SS));
+
+ // cvtsi2sd
+ e.enc_i32_i64(fcvt_from_sint.bind(F64), rec_frurm.opcodes(&CVTSI2SD));
+
+ // cvtss2sd
+ e.enc_both(fpromote.bind(F64).bind(F32), rec_furm.opcodes(&CVTSS2SD));
+
+ // cvtsd2ss
+ e.enc_both(fdemote.bind(F32).bind(F64), rec_furm.opcodes(&CVTSD2SS));
+
+ // cvttss2si
+ e.enc_both(
+ x86_cvtt2si.bind(I32).bind(F32),
+ rec_rfurm.opcodes(&CVTTSS2SI),
+ );
+ e.enc64(
+ x86_cvtt2si.bind(I64).bind(F32),
+ rec_rfurm.opcodes(&CVTTSS2SI).rex().w(),
+ );
+
+ // cvttsd2si
+ e.enc_both(
+ x86_cvtt2si.bind(I32).bind(F64),
+ rec_rfurm.opcodes(&CVTTSD2SI),
+ );
+ e.enc64(
+ x86_cvtt2si.bind(I64).bind(F64),
+ rec_rfurm.opcodes(&CVTTSD2SI).rex().w(),
+ );
+
+ // Exact square roots.
+ e.enc_both(sqrt.bind(F32), rec_furm.opcodes(&SQRTSS));
+ e.enc_both(sqrt.bind(F64), rec_furm.opcodes(&SQRTSD));
+
+ // Rounding. The recipe looks at the opcode to pick an immediate.
+ for inst in &[nearest, floor, ceil, trunc] {
+ e.enc_both_isap(inst.bind(F32), rec_furmi_rnd.opcodes(&ROUNDSS), use_sse41);
+ e.enc_both_isap(inst.bind(F64), rec_furmi_rnd.opcodes(&ROUNDSD), use_sse41);
+ }
+
+ // Binary arithmetic ops.
+ e.enc_both(fadd.bind(F32), rec_fa.opcodes(&ADDSS));
+ e.enc_both(fadd.bind(F64), rec_fa.opcodes(&ADDSD));
+
+ e.enc_both(fsub.bind(F32), rec_fa.opcodes(&SUBSS));
+ e.enc_both(fsub.bind(F64), rec_fa.opcodes(&SUBSD));
+
+ e.enc_both(fmul.bind(F32), rec_fa.opcodes(&MULSS));
+ e.enc_both(fmul.bind(F64), rec_fa.opcodes(&MULSD));
+
+ e.enc_both(fdiv.bind(F32), rec_fa.opcodes(&DIVSS));
+ e.enc_both(fdiv.bind(F64), rec_fa.opcodes(&DIVSD));
+
+ e.enc_both(x86_fmin.bind(F32), rec_fa.opcodes(&MINSS));
+ e.enc_both(x86_fmin.bind(F64), rec_fa.opcodes(&MINSD));
+
+ e.enc_both(x86_fmax.bind(F32), rec_fa.opcodes(&MAXSS));
+ e.enc_both(x86_fmax.bind(F64), rec_fa.opcodes(&MAXSD));
+
+ // Comparisons.
+ //
+ // This only covers the condition codes in `supported_floatccs`; the rest are
+ // handled by legalization patterns.
+ e.enc_both(fcmp.bind(F32), rec_fcscc.opcodes(&UCOMISS));
+ e.enc_both(fcmp.bind(F64), rec_fcscc.opcodes(&UCOMISD));
+ e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(&UCOMISS));
+ e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(&UCOMISD));
+}
+
+#[inline(never)]
+fn define_alu(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+
+ // Shorthands for instructions.
+ let clz = shared.by_name("clz");
+ let ctz = shared.by_name("ctz");
+ let icmp = shared.by_name("icmp");
+ let icmp_imm = shared.by_name("icmp_imm");
+ let ifcmp = shared.by_name("ifcmp");
+ let ifcmp_imm = shared.by_name("ifcmp_imm");
+ let ifcmp_sp = shared.by_name("ifcmp_sp");
+ let ishl = shared.by_name("ishl");
+ let ishl_imm = shared.by_name("ishl_imm");
+ let popcnt = shared.by_name("popcnt");
+ let rotl = shared.by_name("rotl");
+ let rotl_imm = shared.by_name("rotl_imm");
+ let rotr = shared.by_name("rotr");
+ let rotr_imm = shared.by_name("rotr_imm");
+ let selectif = shared.by_name("selectif");
+ let selectif_spectre_guard = shared.by_name("selectif_spectre_guard");
+ let sshr = shared.by_name("sshr");
+ let sshr_imm = shared.by_name("sshr_imm");
+ let trueff = shared.by_name("trueff");
+ let trueif = shared.by_name("trueif");
+ let ushr = shared.by_name("ushr");
+ let ushr_imm = shared.by_name("ushr_imm");
+ let x86_bsf = x86.by_name("x86_bsf");
+ let x86_bsr = x86.by_name("x86_bsr");
+
+ // Shorthands for recipes.
+ let rec_bsf_and_bsr = r.template("bsf_and_bsr");
+ let rec_cmov = r.template("cmov");
+ let rec_icscc = r.template("icscc");
+ let rec_icscc_ib = r.template("icscc_ib");
+ let rec_icscc_id = r.template("icscc_id");
+ let rec_rcmp = r.template("rcmp");
+ let rec_rcmp_ib = r.template("rcmp_ib");
+ let rec_rcmp_id = r.template("rcmp_id");
+ let rec_rcmp_sp = r.template("rcmp_sp");
+ let rec_rc = r.template("rc");
+ let rec_setf_abcd = r.template("setf_abcd");
+ let rec_seti_abcd = r.template("seti_abcd");
+ let rec_urm = r.template("urm");
+
+ // Predicates shorthands.
+ let use_popcnt = settings.predicate_by_name("use_popcnt");
+ let use_lzcnt = settings.predicate_by_name("use_lzcnt");
+ let use_bmi1 = settings.predicate_by_name("use_bmi1");
+
+ let band = shared.by_name("band");
+ let band_imm = shared.by_name("band_imm");
+ let band_not = shared.by_name("band_not");
+ let bnot = shared.by_name("bnot");
+ let bor = shared.by_name("bor");
+ let bor_imm = shared.by_name("bor_imm");
+ let bxor = shared.by_name("bxor");
+ let bxor_imm = shared.by_name("bxor_imm");
+ let iadd = shared.by_name("iadd");
+ let iadd_ifcarry = shared.by_name("iadd_ifcarry");
+ let iadd_ifcin = shared.by_name("iadd_ifcin");
+ let iadd_ifcout = shared.by_name("iadd_ifcout");
+ let iadd_imm = shared.by_name("iadd_imm");
+ let imul = shared.by_name("imul");
+ let isub = shared.by_name("isub");
+ let isub_ifbin = shared.by_name("isub_ifbin");
+ let isub_ifborrow = shared.by_name("isub_ifborrow");
+ let isub_ifbout = shared.by_name("isub_ifbout");
+ let x86_sdivmodx = x86.by_name("x86_sdivmodx");
+ let x86_smulx = x86.by_name("x86_smulx");
+ let x86_udivmodx = x86.by_name("x86_udivmodx");
+ let x86_umulx = x86.by_name("x86_umulx");
+
+ let rec_div = r.template("div");
+ let rec_fa = r.template("fa");
+ let rec_fax = r.template("fax");
+ let rec_mulx = r.template("mulx");
+ let rec_r_ib = r.template("r_ib");
+ let rec_r_id = r.template("r_id");
+ let rec_rin = r.template("rin");
+ let rec_rio = r.template("rio");
+ let rec_rout = r.template("rout");
+ let rec_rr = r.template("rr");
+ let rec_rrx = r.template("rrx");
+ let rec_ur = r.template("ur");
+
+ e.enc_i32_i64(iadd, rec_rr.opcodes(&ADD));
+ e.enc_i32_i64(iadd_ifcout, rec_rout.opcodes(&ADD));
+ e.enc_i32_i64(iadd_ifcin, rec_rin.opcodes(&ADC));
+ e.enc_i32_i64(iadd_ifcarry, rec_rio.opcodes(&ADC));
+ e.enc_i32_i64(iadd_imm, rec_r_ib.opcodes(&ADD_IMM8_SIGN_EXTEND).rrr(0));
+ e.enc_i32_i64(iadd_imm, rec_r_id.opcodes(&ADD_IMM).rrr(0));
+
+ e.enc_i32_i64(isub, rec_rr.opcodes(&SUB));
+ e.enc_i32_i64(isub_ifbout, rec_rout.opcodes(&SUB));
+ e.enc_i32_i64(isub_ifbin, rec_rin.opcodes(&SBB));
+ e.enc_i32_i64(isub_ifborrow, rec_rio.opcodes(&SBB));
+
+ e.enc_i32_i64(band, rec_rr.opcodes(&AND));
+ e.enc_b32_b64(band, rec_rr.opcodes(&AND));
+
+ // TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as band_imm.i32. Can
+ // even use the single-byte immediate for 0xffff_ffXX masks.
+
+ e.enc_i32_i64(band_imm, rec_r_ib.opcodes(&AND_IMM8_SIGN_EXTEND).rrr(4));
+ e.enc_i32_i64(band_imm, rec_r_id.opcodes(&AND_IMM).rrr(4));
+
+ e.enc_i32_i64(bor, rec_rr.opcodes(&OR));
+ e.enc_b32_b64(bor, rec_rr.opcodes(&OR));
+ e.enc_i32_i64(bor_imm, rec_r_ib.opcodes(&OR_IMM8_SIGN_EXTEND).rrr(1));
+ e.enc_i32_i64(bor_imm, rec_r_id.opcodes(&OR_IMM).rrr(1));
+
+ e.enc_i32_i64(bxor, rec_rr.opcodes(&XOR));
+ e.enc_b32_b64(bxor, rec_rr.opcodes(&XOR));
+ e.enc_i32_i64(bxor_imm, rec_r_ib.opcodes(&XOR_IMM8_SIGN_EXTEND).rrr(6));
+ e.enc_i32_i64(bxor_imm, rec_r_id.opcodes(&XOR_IMM).rrr(6));
+
+ // x86 has a bitwise not instruction NOT.
+ e.enc_i32_i64(bnot, rec_ur.opcodes(&NOT).rrr(2));
+ e.enc_b32_b64(bnot, rec_ur.opcodes(&NOT).rrr(2));
+ e.enc_both(bnot.bind(B1), rec_ur.opcodes(&NOT).rrr(2));
+
+ // Also add `b1` encodings for the logic instructions.
+ // TODO: Should this be done with 8-bit instructions? It would improve partial register
+ // dependencies.
+ e.enc_both(band.bind(B1), rec_rr.opcodes(&AND));
+ e.enc_both(bor.bind(B1), rec_rr.opcodes(&OR));
+ e.enc_both(bxor.bind(B1), rec_rr.opcodes(&XOR));
+
+ e.enc_i32_i64(imul, rec_rrx.opcodes(&IMUL));
+ e.enc_i32_i64(x86_sdivmodx, rec_div.opcodes(&IDIV).rrr(7));
+ e.enc_i32_i64(x86_udivmodx, rec_div.opcodes(&DIV).rrr(6));
+
+ e.enc_i32_i64(x86_smulx, rec_mulx.opcodes(&IMUL_RDX_RAX).rrr(5));
+ e.enc_i32_i64(x86_umulx, rec_mulx.opcodes(&MUL).rrr(4));
+
+ // Binary bitwise ops.
+ //
+ // The F64 version is intentionally encoded using the single-precision opcode:
+ // the operation is identical and the encoding is one byte shorter.
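+ // (ANDPS is 0F 54 while ANDPD is 66 0F 54; likewise for ORPS/ORPD and XORPS/XORPD, so
+ // omitting the 0x66 prefix saves a byte per instruction.)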
+ e.enc_both(band.bind(F32), rec_fa.opcodes(&ANDPS));
+ e.enc_both(band.bind(F64), rec_fa.opcodes(&ANDPS));
+
+ e.enc_both(bor.bind(F32), rec_fa.opcodes(&ORPS));
+ e.enc_both(bor.bind(F64), rec_fa.opcodes(&ORPS));
+
+ e.enc_both(bxor.bind(F32), rec_fa.opcodes(&XORPS));
+ e.enc_both(bxor.bind(F64), rec_fa.opcodes(&XORPS));
+
+ // The `andnps(x,y)` instruction computes `~x&y`, while `band_not(x,y)` is `x&~y`.
+ e.enc_both(band_not.bind(F32), rec_fax.opcodes(&ANDNPS));
+ e.enc_both(band_not.bind(F64), rec_fax.opcodes(&ANDNPS));
+
+ // Shifts and rotates.
+ // Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit
+ // and 16-bit shifts would need explicit masking.
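+ // (For instance, "shl %cl, %eax" only uses the low 5 bits of %cl, and the REX.W form uses
+ // the low 6 bits, so the hardware masking already matches the i32/i64 semantics.)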
+
+ for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
+ // Cannot use enc_i32_i64 for this pattern because these instructions also need
+ // their shift-amount type bound explicitly.
+ e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
+ e.enc32(
+ inst.bind(I32).bind(I16),
+ rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
+ );
+ e.enc32(
+ inst.bind(I32).bind(I32),
+ rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
+ );
+ e.enc64(
+ inst.bind(I64).bind(Any),
+ rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex().w(),
+ );
+ e.enc64(
+ inst.bind(I32).bind(Any),
+ rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex(),
+ );
+ e.enc64(
+ inst.bind(I32).bind(Any),
+ rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
+ );
+ }
+
+ e.enc_i32_i64(rotl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(0));
+ e.enc_i32_i64(rotr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(1));
+ e.enc_i32_i64(ishl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(4));
+ e.enc_i32_i64(ushr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(5));
+ e.enc_i32_i64(sshr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(7));
+
+ // Population count.
+ e.enc32_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);
+ e.enc64_isap(
+ popcnt.bind(I64),
+ rec_urm.opcodes(&POPCNT).rex().w(),
+ use_popcnt,
+ );
+ e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT).rex(), use_popcnt);
+ e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt);
+
+ // Count leading zero bits.
+ e.enc32_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);
+ e.enc64_isap(clz.bind(I64), rec_urm.opcodes(&LZCNT).rex().w(), use_lzcnt);
+ e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT).rex(), use_lzcnt);
+ e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt);
+
+ // Count trailing zero bits.
+ e.enc32_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
+ e.enc64_isap(ctz.bind(I64), rec_urm.opcodes(&TZCNT).rex().w(), use_bmi1);
+ e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT).rex(), use_bmi1);
+ e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
+
+ // Bit scan forward and reverse.
+ e.enc_i32_i64(x86_bsf, rec_bsf_and_bsr.opcodes(&BIT_SCAN_FORWARD));
+ e.enc_i32_i64(x86_bsr, rec_bsf_and_bsr.opcodes(&BIT_SCAN_REVERSE));
+
+ // Comparisons
+ e.enc_i32_i64(icmp, rec_icscc.opcodes(&CMP_REG));
+ e.enc_i32_i64(icmp_imm, rec_icscc_ib.opcodes(&CMP_IMM8).rrr(7));
+ e.enc_i32_i64(icmp_imm, rec_icscc_id.opcodes(&CMP_IMM).rrr(7));
+ e.enc_i32_i64(ifcmp, rec_rcmp.opcodes(&CMP_REG));
+ e.enc_i32_i64(ifcmp_imm, rec_rcmp_ib.opcodes(&CMP_IMM8).rrr(7));
+ e.enc_i32_i64(ifcmp_imm, rec_rcmp_id.opcodes(&CMP_IMM).rrr(7));
+ // TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).
+
+ e.enc32(ifcmp_sp.bind(I32), rec_rcmp_sp.opcodes(&CMP_REG));
+ e.enc64(ifcmp_sp.bind(I64), rec_rcmp_sp.opcodes(&CMP_REG).rex().w());
+
+ // Convert flags to bool.
+ // This encodes `b1` as an 8-bit low register with the value 0 or 1.
+ e.enc_both(trueif, rec_seti_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
+ e.enc_both(trueff, rec_setf_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
+
+ // Conditional move (a.k.a. integer select).
+ e.enc_i32_i64(selectif, rec_cmov.opcodes(&CMOV_OVERFLOW));
+ // A Spectre-guard integer select is exactly the same as a selectif, but
+ // is not associated with any other legalization rules and is not
+ // recognized by any optimizations, so it must arrive here unmodified
+ // and in its original place.
+ e.enc_i32_i64(selectif_spectre_guard, rec_cmov.opcodes(&CMOV_OVERFLOW));
+}
+
+#[inline(never)]
+#[allow(clippy::cognitive_complexity)]
+fn define_simd(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let avg_round = shared.by_name("avg_round");
+ let bitcast = shared.by_name("bitcast");
+ let bor = shared.by_name("bor");
+ let bxor = shared.by_name("bxor");
+ let copy = shared.by_name("copy");
+ let copy_nop = shared.by_name("copy_nop");
+ let copy_to_ssa = shared.by_name("copy_to_ssa");
+ let fadd = shared.by_name("fadd");
+ let fcmp = shared.by_name("fcmp");
+ let fcvt_from_sint = shared.by_name("fcvt_from_sint");
+ let fdiv = shared.by_name("fdiv");
+ let fill = shared.by_name("fill");
+ let fill_nop = shared.by_name("fill_nop");
+ let fmul = shared.by_name("fmul");
+ let fsub = shared.by_name("fsub");
+ let iabs = shared.by_name("iabs");
+ let iadd = shared.by_name("iadd");
+ let icmp = shared.by_name("icmp");
+ let imul = shared.by_name("imul");
+ let ishl_imm = shared.by_name("ishl_imm");
+ let load = shared.by_name("load");
+ let load_complex = shared.by_name("load_complex");
+ let raw_bitcast = shared.by_name("raw_bitcast");
+ let regfill = shared.by_name("regfill");
+ let regmove = shared.by_name("regmove");
+ let regspill = shared.by_name("regspill");
+ let sadd_sat = shared.by_name("sadd_sat");
+ let scalar_to_vector = shared.by_name("scalar_to_vector");
+ let sload8x8 = shared.by_name("sload8x8");
+ let sload8x8_complex = shared.by_name("sload8x8_complex");
+ let sload16x4 = shared.by_name("sload16x4");
+ let sload16x4_complex = shared.by_name("sload16x4_complex");
+ let sload32x2 = shared.by_name("sload32x2");
+ let sload32x2_complex = shared.by_name("sload32x2_complex");
+ let spill = shared.by_name("spill");
+ let sqrt = shared.by_name("sqrt");
+ let sshr_imm = shared.by_name("sshr_imm");
+ let ssub_sat = shared.by_name("ssub_sat");
+ let store = shared.by_name("store");
+ let store_complex = shared.by_name("store_complex");
+ let swiden_low = shared.by_name("swiden_low");
+ let uadd_sat = shared.by_name("uadd_sat");
+ let uload8x8 = shared.by_name("uload8x8");
+ let uload8x8_complex = shared.by_name("uload8x8_complex");
+ let uload16x4 = shared.by_name("uload16x4");
+ let uload16x4_complex = shared.by_name("uload16x4_complex");
+ let uload32x2 = shared.by_name("uload32x2");
+ let uload32x2_complex = shared.by_name("uload32x2_complex");
+ let snarrow = shared.by_name("snarrow");
+ let unarrow = shared.by_name("unarrow");
+ let uwiden_low = shared.by_name("uwiden_low");
+ let ushr_imm = shared.by_name("ushr_imm");
+ let usub_sat = shared.by_name("usub_sat");
+ let vconst = shared.by_name("vconst");
+ let vselect = shared.by_name("vselect");
+ let x86_cvtt2si = x86.by_name("x86_cvtt2si");
+ let x86_insertps = x86.by_name("x86_insertps");
+ let x86_fmax = x86.by_name("x86_fmax");
+ let x86_fmin = x86.by_name("x86_fmin");
+ let x86_movlhps = x86.by_name("x86_movlhps");
+ let x86_movsd = x86.by_name("x86_movsd");
+ let x86_pblendw = x86.by_name("x86_pblendw");
+ let x86_pextr = x86.by_name("x86_pextr");
+ let x86_pinsr = x86.by_name("x86_pinsr");
+ let x86_pmaxs = x86.by_name("x86_pmaxs");
+ let x86_pmaxu = x86.by_name("x86_pmaxu");
+ let x86_pmins = x86.by_name("x86_pmins");
+ let x86_pminu = x86.by_name("x86_pminu");
+ let x86_pmullq = x86.by_name("x86_pmullq");
+ let x86_pmuludq = x86.by_name("x86_pmuludq");
+ let x86_palignr = x86.by_name("x86_palignr");
+ let x86_pshufb = x86.by_name("x86_pshufb");
+ let x86_pshufd = x86.by_name("x86_pshufd");
+ let x86_psll = x86.by_name("x86_psll");
+ let x86_psra = x86.by_name("x86_psra");
+ let x86_psrl = x86.by_name("x86_psrl");
+ let x86_ptest = x86.by_name("x86_ptest");
+ let x86_punpckh = x86.by_name("x86_punpckh");
+ let x86_punpckl = x86.by_name("x86_punpckl");
+ let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");
+
+ // Shorthands for recipes.
+ let rec_blend = r.template("blend");
+ let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
+ let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
+ let rec_f_ib = r.template("f_ib");
+ let rec_fa = r.template("fa");
+ let rec_fa_ib = r.template("fa_ib");
+ let rec_fax = r.template("fax");
+ let rec_fcmp = r.template("fcmp");
+ let rec_ffillSib32 = r.template("ffillSib32");
+ let rec_ffillnull = r.recipe("ffillnull");
+ let rec_fld = r.template("fld");
+ let rec_fldDisp32 = r.template("fldDisp32");
+ let rec_fldDisp8 = r.template("fldDisp8");
+ let rec_fldWithIndex = r.template("fldWithIndex");
+ let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32");
+ let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8");
+ let rec_fregfill32 = r.template("fregfill32");
+ let rec_fregspill32 = r.template("fregspill32");
+ let rec_frmov = r.template("frmov");
+ let rec_frurm = r.template("frurm");
+ let rec_fspillSib32 = r.template("fspillSib32");
+ let rec_fst = r.template("fst");
+ let rec_fstDisp32 = r.template("fstDisp32");
+ let rec_fstDisp8 = r.template("fstDisp8");
+ let rec_fstWithIndex = r.template("fstWithIndex");
+ let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32");
+ let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8");
+ let rec_furm = r.template("furm");
+ let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa");
+ let rec_icscc_fpr = r.template("icscc_fpr");
+ let rec_null_fpr = r.recipe("null_fpr");
+ let rec_pfcmp = r.template("pfcmp");
+ let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr");
+ let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr");
+ let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
+ let rec_stacknull = r.recipe("stacknull");
+ let rec_vconst = r.template("vconst");
+ let rec_vconst_optimized = r.template("vconst_optimized");
+
+ // Predicates shorthands.
+ settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
+ settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
+ let use_ssse3_simd = settings.predicate_by_name("use_ssse3_simd");
+ let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
+ let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
+ let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
+ let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");
+
+ // SIMD vector size: eventually multiple vector sizes may be supported but for now only
+ // SSE-sized vectors are available.
+ let sse_vector_size: u64 = 128;
+
+ // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
+ // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
+ // value across the register.
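+ // (Roughly: a splat becomes scalar_to_vector followed by a broadcast; for 32-bit lanes,
+ // for example, x86_pshufd with an all-zero control byte replicates lane 0 into every lane.)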
+
+ let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
+
+ // PSHUFB, 8-bit shuffle using two XMM registers.
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let instruction = x86_pshufb.bind(vector(ty, sse_vector_size));
+ let template = rec_fa.opcodes(&PSHUFB);
+ e.enc_both_inferred_maybe_isap(instruction.clone(), template.clone(), Some(use_ssse3_simd));
+ }
+
+ // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate.
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+ let instruction = x86_pshufd.bind(vector(ty, sse_vector_size));
+ let template = rec_r_ib_unsigned_fpr.opcodes(&PSHUFD);
+ e.enc_both_inferred(instruction, template);
+ }
+
+ // SIMD vselect: the controlling value of vselect is a boolean vector, so each lane should be
+ // either all ones or all zeroes. This makes it possible to always use the 8-bit PBLENDVB;
+ // for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
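+ // (PBLENDVB picks each byte based on the top bit of the corresponding mask byte, which agrees
+ // with per-lane selection exactly when every lane is all ones or all zeroes.)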
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let opcode = match ty.lane_bits() {
+ 32 => &BLENDVPS,
+ 64 => &BLENDVPD,
+ _ => &PBLENDVB,
+ };
+ let instruction = vselect.bind(vector(ty, sse_vector_size));
+ let template = rec_blend.opcodes(opcode);
+ e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+ }
+
+ // PBLENDW, select lanes using a u8 immediate.
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
+ let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
+ let template = rec_fa_ib.opcodes(&PBLENDW);
+ e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+ }
+
+ // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
+ // to the Intel manual: "When the destination operand is an XMM register, the source operand is
+ // written to the low doubleword of the register and the register is zero-extended to 128 bits."
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let instruction = scalar_to_vector.bind(vector(ty, sse_vector_size));
+ if ty.is_float() {
+ // No need to move floats--they already live in XMM registers.
+ e.enc_32_64_rec(instruction, rec_null_fpr, 0);
+ } else {
+ let template = rec_frurm.opcodes(&MOVD_LOAD_XMM);
+ if ty.lane_bits() < 64 {
+ e.enc_both_inferred(instruction, template);
+ } else {
+ // No 32-bit encodings for 64-bit widths.
+ assert_eq!(ty.lane_bits(), 64);
+ e.enc64(instruction, template.rex().w());
+ }
+ }
+ }
+
+ // SIMD insertlane
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let (opcode, isap): (&[_], _) = match ty.lane_bits() {
+ 8 => (&PINSRB, Some(use_sse41_simd)),
+ 16 => (&PINSRW, None),
+ 32 | 64 => (&PINSR, Some(use_sse41_simd)),
+ _ => panic!("invalid size for SIMD insertlane"),
+ };
+
+ let instruction = x86_pinsr.bind(vector(ty, sse_vector_size));
+ let template = rec_r_ib_unsigned_r.opcodes(opcode);
+ if ty.lane_bits() < 64 {
+ e.enc_both_inferred_maybe_isap(instruction, template, isap);
+ } else {
+ // It turns out the 64-bit widths have REX.W encodings and are only available on
+ // x86_64.
+ e.enc64_maybe_isap(instruction, template.rex().w(), isap);
+ }
+ }
+
+ // For legalizing insertlane with floats, INSERTPS from SSE4.1.
+ {
+ let instruction = x86_insertps.bind(vector(F32, sse_vector_size));
+ let template = rec_fa_ib.opcodes(&INSERTPS);
+ e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+ }
+
+ // For legalizing insertlane with floats, MOVSD from SSE2.
+ {
+ let instruction = x86_movsd.bind(vector(F64, sse_vector_size));
+ let template = rec_fa.opcodes(&MOVSD_LOAD);
+ e.enc_both_inferred(instruction, template); // from SSE2
+ }
+
+ // For legalizing insertlane with floats, MOVLHPS from SSE.
+ {
+ let instruction = x86_movlhps.bind(vector(F64, sse_vector_size));
+ let template = rec_fa.opcodes(&MOVLHPS);
+ e.enc_both_inferred(instruction, template); // from SSE
+ }
+
+ // SIMD extractlane
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let opcode = match ty.lane_bits() {
+ 8 => &PEXTRB,
+ 16 => &PEXTRW,
+ 32 | 64 => &PEXTR,
+ _ => panic!("invalid size for SIMD extractlane"),
+ };
+
+ let instruction = x86_pextr.bind(vector(ty, sse_vector_size));
+ let template = rec_r_ib_unsigned_gpr.opcodes(opcode);
+ if ty.lane_bits() < 64 {
+ e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+ } else {
+ // It turns out the 64-bit widths have REX.W encodings and are only available on
+ // x86_64.
+ e.enc64_maybe_isap(instruction, template.rex().w(), Some(use_sse41_simd));
+ }
+ }
+
+ // SIMD packing/unpacking
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let (high, low) = match ty.lane_bits() {
+ 8 => (&PUNPCKHBW, &PUNPCKLBW),
+ 16 => (&PUNPCKHWD, &PUNPCKLWD),
+ 32 => (&PUNPCKHDQ, &PUNPCKLDQ),
+ 64 => (&PUNPCKHQDQ, &PUNPCKLQDQ),
+ _ => panic!("invalid size for SIMD packing/unpacking"),
+ };
+
+ e.enc_both_inferred(
+ x86_punpckh.bind(vector(ty, sse_vector_size)),
+ rec_fa.opcodes(high),
+ );
+ e.enc_both_inferred(
+ x86_punpckl.bind(vector(ty, sse_vector_size)),
+ rec_fa.opcodes(low),
+ );
+ }
+
+ // SIMD narrow/widen
+ for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
+ let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
+ }
+ for (ty, opcodes, isap) in &[
+ (I16, &PACKUSWB[..], None),
+ (I32, &PACKUSDW[..], Some(use_sse41_simd)),
+ ] {
+ let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
+ }
+ for (ty, swiden_opcode, uwiden_opcode) in &[
+ (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
+ (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
+ ] {
+ let isap = Some(use_sse41_simd);
+ let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
+ let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
+ }
+ for ty in &[I8, I16, I32, I64] {
+ e.enc_both_inferred_maybe_isap(
+ x86_palignr.bind(vector(*ty, sse_vector_size)),
+ rec_fa_ib.opcodes(&PALIGNR[..]),
+ Some(use_ssse3_simd),
+ );
+ }
+
+ // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
+ for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
+ for to_type in
+ ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
+ {
+ let instruction = raw_bitcast
+ .bind(vector(to_type, sse_vector_size))
+ .bind(vector(from_type, sse_vector_size));
+ e.enc_32_64_rec(instruction, rec_null_fpr, 0);
+ }
+ }
+
+ // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an
+ // XMM register.
+ for float_type in &[F32, F64] {
+ for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) {
+ e.enc_32_64_rec(
+ raw_bitcast
+ .bind(vector(lane_type, sse_vector_size))
+ .bind(*float_type),
+ rec_null_fpr,
+ 0,
+ );
+ e.enc_32_64_rec(
+ raw_bitcast
+ .bind(*float_type)
+ .bind(vector(lane_type, sse_vector_size)),
+ rec_null_fpr,
+ 0,
+ );
+ }
+ }
+
+ // SIMD conversions
+ {
+ let fcvt_from_sint_32 = fcvt_from_sint
+ .bind(vector(F32, sse_vector_size))
+ .bind(vector(I32, sse_vector_size));
+ e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));
+
+ e.enc_32_64_maybe_isap(
+ x86_vcvtudq2ps,
+ rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
+ Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
+ );
+
+ e.enc_both_inferred(
+ x86_cvtt2si
+ .bind(vector(I32, sse_vector_size))
+ .bind(vector(F32, sse_vector_size)),
+ rec_furm.opcodes(&CVTTPS2DQ),
+ );
+ }
+
+ // SIMD vconst for special cases (all zeroes, all ones).
+ // This must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
+ // encoding first.
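+ // (An all-zeroes constant becomes "pxor xmm, xmm" and an all-ones constant becomes
+ // "pcmpeqb xmm, xmm", so neither needs to load from the constant pool.)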
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let instruction = vconst.bind(vector(ty, sse_vector_size));
+
+ let is_zero_128bit =
+ InstructionPredicate::new_is_all_zeroes(&*formats.unary_const, "constant_handle");
+ let template = rec_vconst_optimized.opcodes(&PXOR).infer_rex();
+ e.enc_32_64_func(instruction.clone(), template, |builder| {
+ builder.inst_predicate(is_zero_128bit)
+ });
+
+ let is_ones_128bit =
+ InstructionPredicate::new_is_all_ones(&*formats.unary_const, "constant_handle");
+ let template = rec_vconst_optimized.opcodes(&PCMPEQB).infer_rex();
+ e.enc_32_64_func(instruction, template, |builder| {
+ builder.inst_predicate(is_ones_128bit)
+ });
+ }
+
+ // SIMD vconst using MOVUPS
+ // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
+ // to guarantee that the constants are aligned when emitted and there is currently no mechanism
+ // for that; alternately, constants could be loaded into XMM registers using a sequence like:
+ // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored
+ // in memory) but some performance measurements are needed.
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let instruction = vconst.bind(vector(ty, sse_vector_size));
+ let template = rec_vconst.opcodes(&MOVUPS_LOAD);
+ e.enc_both_inferred(instruction, template); // from SSE
+ }
+
+ // SIMD register movement: store, load, spill, fill, regmove, etc. All of these use encodings of
+ // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have
+ // alignment or type-specific encodings, see https://github.com/bytecodealliance/wasmtime/issues/1124).
+ // Also, it would be ideal to infer REX prefixes for all of these instructions but for the
+ // time being only instructions with common recipes have `infer_rex()` support.
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ // Store
+ let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any);
+ e.enc_both_inferred(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE));
+ e.enc_both_inferred(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE));
+ e.enc_both_inferred(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE));
+
+ // Store complex
+ let bound_store_complex = store_complex.bind(vector(ty, sse_vector_size));
+ e.enc_both(
+ bound_store_complex.clone(),
+ rec_fstWithIndex.opcodes(&MOVUPS_STORE),
+ );
+ e.enc_both(
+ bound_store_complex.clone(),
+ rec_fstWithIndexDisp8.opcodes(&MOVUPS_STORE),
+ );
+ e.enc_both(
+ bound_store_complex,
+ rec_fstWithIndexDisp32.opcodes(&MOVUPS_STORE),
+ );
+
+ // Load
+ let bound_load = load.bind(vector(ty, sse_vector_size)).bind(Any);
+ e.enc_both_inferred(bound_load.clone(), rec_fld.opcodes(&MOVUPS_LOAD));
+ e.enc_both_inferred(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD));
+ e.enc_both_inferred(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD));
+
+ // Load complex
+ let bound_load_complex = load_complex.bind(vector(ty, sse_vector_size));
+ e.enc_both(
+ bound_load_complex.clone(),
+ rec_fldWithIndex.opcodes(&MOVUPS_LOAD),
+ );
+ e.enc_both(
+ bound_load_complex.clone(),
+ rec_fldWithIndexDisp8.opcodes(&MOVUPS_LOAD),
+ );
+ e.enc_both(
+ bound_load_complex,
+ rec_fldWithIndexDisp32.opcodes(&MOVUPS_LOAD),
+ );
+
+ // Spill
+ let bound_spill = spill.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE));
+ let bound_regspill = regspill.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE));
+
+ // Fill
+ let bound_fill = fill.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD));
+ let bound_regfill = regfill.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD));
+ let bound_fill_nop = fill_nop.bind(vector(ty, sse_vector_size));
+ e.enc_32_64_rec(bound_fill_nop, rec_ffillnull, 0);
+
+ // Regmove
+ let bound_regmove = regmove.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD));
+
+ // Copy
+ let bound_copy = copy.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD));
+ let bound_copy_to_ssa = copy_to_ssa.bind(vector(ty, sse_vector_size));
+ e.enc_both(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD));
+ let bound_copy_nop = copy_nop.bind(vector(ty, sse_vector_size));
+ e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0);
+ }
+
+ // SIMD load extend
+ for (inst, opcodes) in &[
+ (uload8x8, &PMOVZXBW),
+ (uload16x4, &PMOVZXWD),
+ (uload32x2, &PMOVZXDQ),
+ (sload8x8, &PMOVSXBW),
+ (sload16x4, &PMOVSXWD),
+ (sload32x2, &PMOVSXDQ),
+ ] {
+ let isap = Some(use_sse41_simd);
+ for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
+ let inst = *inst;
+ let template = recipe.opcodes(*opcodes);
+ e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
+ e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap);
+ }
+ }
+
+ // SIMD load extend (complex addressing)
+ let is_load_complex_length_two =
+ InstructionPredicate::new_length_equals(&*formats.load_complex, 2);
+ for (inst, opcodes) in &[
+ (uload8x8_complex, &PMOVZXBW),
+ (uload16x4_complex, &PMOVZXWD),
+ (uload32x2_complex, &PMOVZXDQ),
+ (sload8x8_complex, &PMOVSXBW),
+ (sload16x4_complex, &PMOVSXWD),
+ (sload32x2_complex, &PMOVSXDQ),
+ ] {
+ for recipe in &[
+ rec_fldWithIndex,
+ rec_fldWithIndexDisp8,
+ rec_fldWithIndexDisp32,
+ ] {
+ let template = recipe.opcodes(*opcodes);
+ let predicate = |encoding: EncodingBuilder| {
+ encoding
+ .isa_predicate(use_sse41_simd)
+ .inst_predicate(is_load_complex_length_two.clone())
+ };
+ e.enc32_func(inst.clone(), template.clone(), predicate);
+ // No infer_rex calculator for these recipes; place REX version first as in enc_x86_64.
+ e.enc64_func(inst.clone(), template.rex(), predicate);
+ e.enc64_func(inst.clone(), template, predicate);
+ }
+ }
+
+ // SIMD integer addition
+ for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
+ let iadd = iadd.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(iadd, rec_fa.opcodes(*opcodes));
+ }
+
+ // SIMD integer saturating addition
+ e.enc_both_inferred(
+ sadd_sat.bind(vector(I8, sse_vector_size)),
+ rec_fa.opcodes(&PADDSB),
+ );
+ e.enc_both_inferred(
+ sadd_sat.bind(vector(I16, sse_vector_size)),
+ rec_fa.opcodes(&PADDSW),
+ );
+ e.enc_both_inferred(
+ uadd_sat.bind(vector(I8, sse_vector_size)),
+ rec_fa.opcodes(&PADDUSB),
+ );
+ e.enc_both_inferred(
+ uadd_sat.bind(vector(I16, sse_vector_size)),
+ rec_fa.opcodes(&PADDUSW),
+ );
+
+ // SIMD integer subtraction
+ let isub = shared.by_name("isub");
+ for (ty, opcodes) in &[(I8, &PSUBB), (I16, &PSUBW), (I32, &PSUBD), (I64, &PSUBQ)] {
+ let isub = isub.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(isub, rec_fa.opcodes(*opcodes));
+ }
+
+ // SIMD integer saturating subtraction
+ e.enc_both_inferred(
+ ssub_sat.bind(vector(I8, sse_vector_size)),
+ rec_fa.opcodes(&PSUBSB),
+ );
+ e.enc_both_inferred(
+ ssub_sat.bind(vector(I16, sse_vector_size)),
+ rec_fa.opcodes(&PSUBSW),
+ );
+ e.enc_both_inferred(
+ usub_sat.bind(vector(I8, sse_vector_size)),
+ rec_fa.opcodes(&PSUBUSB),
+ );
+ e.enc_both_inferred(
+ usub_sat.bind(vector(I16, sse_vector_size)),
+ rec_fa.opcodes(&PSUBUSW),
+ );
+
+ // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
+ // or I64x2, and these are (at the time of writing) not necessary for WASM SIMD.
+ for (ty, opcodes, isap) in &[
+ (I16, &PMULLW[..], None),
+ (I32, &PMULLD[..], Some(use_sse41_simd)),
+ ] {
+ let imul = imul.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
+ }
+
+ // SIMD multiplication with lane expansion.
+ e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
+
+ // SIMD integer multiplication for I64x2 using AVX512.
+ {
+ e.enc_32_64_maybe_isap(
+ x86_pmullq,
+ rec_evex_reg_vvvv_rm_128.opcodes(&VPMULLQ).w(),
+ Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL
+ );
+ }
+
+ // SIMD integer average with rounding.
+ for (ty, opcodes) in &[(I8, &PAVGB[..]), (I16, &PAVGW[..])] {
+ let avgr = avg_round.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(avgr, rec_fa.opcodes(opcodes));
+ }
+
+ // SIMD integer absolute value.
+ for (ty, opcodes) in &[(I8, &PABSB[..]), (I16, &PABSW[..]), (I32, &PABSD)] {
+ let iabs = iabs.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(iabs, rec_furm.opcodes(opcodes), Some(use_ssse3_simd));
+ }
+
+ // SIMD logical operations
+ let band = shared.by_name("band");
+ let band_not = shared.by_name("band_not");
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ // and
+ let band = band.bind(vector(ty, sse_vector_size));
+ e.enc_both_inferred(band, rec_fa.opcodes(&PAND));
+
+ // and not (note flipped recipe operands to match band_not order)
+ let band_not = band_not.bind(vector(ty, sse_vector_size));
+ e.enc_both_inferred(band_not, rec_fax.opcodes(&PANDN));
+
+ // or
+ let bor = bor.bind(vector(ty, sse_vector_size));
+ e.enc_both_inferred(bor, rec_fa.opcodes(&POR));
+
+ // xor
+ let bxor = bxor.bind(vector(ty, sse_vector_size));
+ e.enc_both_inferred(bxor, rec_fa.opcodes(&PXOR));
+
+ // ptest
+ let x86_ptest = x86_ptest.bind(vector(ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd));
+ }
+
+ // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
+ // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
+ // I128x1 but restrictions on the type builder prevent this; the general idea here is that
+ // the upper bits are all zeroed and do not form parts of any separate lane. See
+ // https://github.com/bytecodealliance/wasmtime/issues/1140.
+ e.enc_both_inferred(
+ bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
+ rec_frurm.opcodes(&MOVD_LOAD_XMM),
+ );
+ e.enc64(
+ bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
+ rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
+ );
+
+ // SIMD shift left
+ for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
+ let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(x86_psll, rec_fa.opcodes(*opcodes));
+ }
+
+ // SIMD shift right (logical)
+ for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] {
+ let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(x86_psrl, rec_fa.opcodes(*opcodes));
+ }
+
+ // SIMD shift right (arithmetic)
+ for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] {
+ let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(x86_psra, rec_fa.opcodes(*opcodes));
+ }
+
+ // SIMD immediate shift
+ for (ty, opcodes) in &[(I16, &PS_W_IMM), (I32, &PS_D_IMM), (I64, &PS_Q_IMM)] {
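+ // The immediate shift forms share one opcode per lane width; the ModRM reg field (rrr)
+ // selects the operation: /6 is shift left, /2 is logical shift right, /4 is arithmetic
+ // shift right.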
+ let ishl_imm = ishl_imm.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6));
+
+ let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2));
+
+ // One exception: PSRAQ does not exist for 64x2 in SSE2; it requires a higher CPU feature set.
+ if *ty != I64 {
+ let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4));
+ }
+ }
+
+ // SIMD integer comparisons
+ {
+ use IntCC::*;
+ for (ty, cc, opcodes, isa_predicate) in &[
+ (I8, Equal, &PCMPEQB[..], None),
+ (I16, Equal, &PCMPEQW[..], None),
+ (I32, Equal, &PCMPEQD[..], None),
+ (I64, Equal, &PCMPEQQ[..], Some(use_sse41_simd)),
+ (I8, SignedGreaterThan, &PCMPGTB[..], None),
+ (I16, SignedGreaterThan, &PCMPGTW[..], None),
+ (I32, SignedGreaterThan, &PCMPGTD[..], None),
+ (I64, SignedGreaterThan, &PCMPGTQ, Some(use_sse42_simd)),
+ ] {
+ let instruction = icmp
+ .bind(Immediate::IntCC(*cc))
+ .bind(vector(*ty, sse_vector_size));
+ let template = rec_icscc_fpr.opcodes(opcodes);
+ e.enc_both_inferred_maybe_isap(instruction, template, *isa_predicate);
+ }
+ }
+
+ // SIMD min/max
+ for (ty, inst, opcodes, isa_predicate) in &[
+ (I8, x86_pmaxs, &PMAXSB[..], Some(use_sse41_simd)),
+ (I16, x86_pmaxs, &PMAXSW[..], None),
+ (I32, x86_pmaxs, &PMAXSD[..], Some(use_sse41_simd)),
+ (I8, x86_pmaxu, &PMAXUB[..], None),
+ (I16, x86_pmaxu, &PMAXUW[..], Some(use_sse41_simd)),
+ (I32, x86_pmaxu, &PMAXUD[..], Some(use_sse41_simd)),
+ (I8, x86_pmins, &PMINSB[..], Some(use_sse41_simd)),
+ (I16, x86_pmins, &PMINSW[..], None),
+ (I32, x86_pmins, &PMINSD[..], Some(use_sse41_simd)),
+ (I8, x86_pminu, &PMINUB[..], None),
+ (I16, x86_pminu, &PMINUW[..], Some(use_sse41_simd)),
+ (I32, x86_pminu, &PMINUD[..], Some(use_sse41_simd)),
+ ] {
+ let inst = inst.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate);
+ }
+
+ // SIMD float comparisons
+ e.enc_both_inferred(
+ fcmp.bind(vector(F32, sse_vector_size)),
+ rec_pfcmp.opcodes(&CMPPS),
+ );
+ e.enc_both_inferred(
+ fcmp.bind(vector(F64, sse_vector_size)),
+ rec_pfcmp.opcodes(&CMPPD),
+ );
+
+ // SIMD float arithmetic
+ for (ty, inst, opcodes) in &[
+ (F32, fadd, &ADDPS[..]),
+ (F64, fadd, &ADDPD[..]),
+ (F32, fsub, &SUBPS[..]),
+ (F64, fsub, &SUBPD[..]),
+ (F32, fmul, &MULPS[..]),
+ (F64, fmul, &MULPD[..]),
+ (F32, fdiv, &DIVPS[..]),
+ (F64, fdiv, &DIVPD[..]),
+ (F32, x86_fmin, &MINPS[..]),
+ (F64, x86_fmin, &MINPD[..]),
+ (F32, x86_fmax, &MAXPS[..]),
+ (F64, x86_fmax, &MAXPD[..]),
+ ] {
+ let inst = inst.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
+ }
+ for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] {
+ let inst = inst.bind(vector(*ty, sse_vector_size));
+ e.enc_both_inferred(inst, rec_furm.opcodes(opcodes));
+ }
+}
+
+#[inline(never)]
+fn define_entity_ref(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let const_addr = shared.by_name("const_addr");
+ let func_addr = shared.by_name("func_addr");
+ let stack_addr = shared.by_name("stack_addr");
+ let symbol_value = shared.by_name("symbol_value");
+
+ // Shorthands for recipes.
+ let rec_allones_fnaddr4 = r.template("allones_fnaddr4");
+ let rec_allones_fnaddr8 = r.template("allones_fnaddr8");
+ let rec_fnaddr4 = r.template("fnaddr4");
+ let rec_fnaddr8 = r.template("fnaddr8");
+ let rec_const_addr = r.template("const_addr");
+ let rec_got_fnaddr8 = r.template("got_fnaddr8");
+ let rec_got_gvaddr8 = r.template("got_gvaddr8");
+ let rec_gvaddr4 = r.template("gvaddr4");
+ let rec_gvaddr8 = r.template("gvaddr8");
+ let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
+ let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
+ let rec_spaddr_id = r.template("spaddr_id");
+
+ // Predicates shorthands.
+ let all_ones_funcaddrs_and_not_is_pic =
+ settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic");
+ let is_pic = settings.predicate_by_name("is_pic");
+ let not_all_ones_funcaddrs_and_not_is_pic =
+ settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic");
+ let not_is_pic = settings.predicate_by_name("not_is_pic");
+
+ // Function addresses.
+
+ // Non-PIC, default (not all-ones) funcaddresses.
+ e.enc32_isap(
+ func_addr.bind(I32),
+ rec_fnaddr4.opcodes(&MOV_IMM),
+ not_all_ones_funcaddrs_and_not_is_pic,
+ );
+ e.enc64_isap(
+ func_addr.bind(I64),
+ rec_fnaddr8.opcodes(&MOV_IMM).rex().w(),
+ not_all_ones_funcaddrs_and_not_is_pic,
+ );
+
+ // Non-PIC, all-ones funcaddresses.
+ e.enc32_isap(
+ func_addr.bind(I32),
+ rec_allones_fnaddr4.opcodes(&MOV_IMM),
+ all_ones_funcaddrs_and_not_is_pic,
+ );
+ e.enc64_isap(
+ func_addr.bind(I64),
+ rec_allones_fnaddr8.opcodes(&MOV_IMM).rex().w(),
+ all_ones_funcaddrs_and_not_is_pic,
+ );
+
+ // 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's pc-relative field.
+ let is_colocated_func =
+ InstructionPredicate::new_is_colocated_func(&*formats.func_addr, "func_ref");
+ e.enc64_instp(
+ func_addr.bind(I64),
+ rec_pcrel_fnaddr8.opcodes(&LEA).rex().w(),
+ is_colocated_func,
+ );
+
+ // 64-bit, non-colocated, PIC.
+ e.enc64_isap(
+ func_addr.bind(I64),
+ rec_got_fnaddr8.opcodes(&MOV_LOAD).rex().w(),
+ is_pic,
+ );
+
+ // Global addresses.
+
+ // Non-PIC.
+ e.enc32_isap(
+ symbol_value.bind(I32),
+ rec_gvaddr4.opcodes(&MOV_IMM),
+ not_is_pic,
+ );
+ e.enc64_isap(
+ symbol_value.bind(I64),
+ rec_gvaddr8.opcodes(&MOV_IMM).rex().w(),
+ not_is_pic,
+ );
+
+ // PIC, colocated.
+ e.enc64_func(
+ symbol_value.bind(I64),
+ rec_pcrel_gvaddr8.opcodes(&LEA).rex().w(),
+ |encoding| {
+ encoding
+ .isa_predicate(is_pic)
+ .inst_predicate(InstructionPredicate::new_is_colocated_data(formats))
+ },
+ );
+
+ // PIC, non-colocated.
+ e.enc64_isap(
+ symbol_value.bind(I64),
+ rec_got_gvaddr8.opcodes(&MOV_LOAD).rex().w(),
+ is_pic,
+ );
+
+ // Stack addresses.
+ //
+ // TODO: Add encoding rules for stack_load and stack_store, so that they
+ // don't get legalized to stack_addr + load/store.
+ e.enc64(stack_addr.bind(I64), rec_spaddr_id.opcodes(&LEA).rex().w());
+ e.enc32(stack_addr.bind(I32), rec_spaddr_id.opcodes(&LEA));
+
+ // Constant addresses (PIC).
+ e.enc64(const_addr.bind(I64), rec_const_addr.opcodes(&LEA).rex().w());
+ e.enc32(const_addr.bind(I32), rec_const_addr.opcodes(&LEA));
+}
+
+/// Control flow opcodes.
+#[inline(never)]
+fn define_control_flow(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let brff = shared.by_name("brff");
+ let brif = shared.by_name("brif");
+ let brnz = shared.by_name("brnz");
+ let brz = shared.by_name("brz");
+ let call = shared.by_name("call");
+ let call_indirect = shared.by_name("call_indirect");
+ let debugtrap = shared.by_name("debugtrap");
+ let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
+ let jump = shared.by_name("jump");
+ let jump_table_base = shared.by_name("jump_table_base");
+ let jump_table_entry = shared.by_name("jump_table_entry");
+ let return_ = shared.by_name("return");
+ let trap = shared.by_name("trap");
+ let trapff = shared.by_name("trapff");
+ let trapif = shared.by_name("trapif");
+ let resumable_trap = shared.by_name("resumable_trap");
+
+ // Shorthands for recipes.
+ let rec_brfb = r.template("brfb");
+ let rec_brfd = r.template("brfd");
+ let rec_brib = r.template("brib");
+ let rec_brid = r.template("brid");
+ let rec_call_id = r.template("call_id");
+ let rec_call_plt_id = r.template("call_plt_id");
+ let rec_call_r = r.template("call_r");
+ let rec_debugtrap = r.recipe("debugtrap");
+ let rec_indirect_jmp = r.template("indirect_jmp");
+ let rec_jmpb = r.template("jmpb");
+ let rec_jmpd = r.template("jmpd");
+ let rec_jt_base = r.template("jt_base");
+ let rec_jt_entry = r.template("jt_entry");
+ let rec_ret = r.template("ret");
+ let rec_t8jccb_abcd = r.template("t8jccb_abcd");
+ let rec_t8jccd_abcd = r.template("t8jccd_abcd");
+ let rec_t8jccd_long = r.template("t8jccd_long");
+ let rec_tjccb = r.template("tjccb");
+ let rec_tjccd = r.template("tjccd");
+ let rec_trap = r.template("trap");
+ let rec_trapif = r.recipe("trapif");
+ let rec_trapff = r.recipe("trapff");
+
+ // Predicates shorthands.
+ let is_pic = settings.predicate_by_name("is_pic");
+
+ // Call/return
+
+ // 32-bit, both PIC and non-PIC.
+ e.enc32(call, rec_call_id.opcodes(&CALL_RELATIVE));
+
+ // 64-bit, colocated, both PIC and non-PIC. Use the call instruction's pc-relative field.
+ let is_colocated_func = InstructionPredicate::new_is_colocated_func(&*formats.call, "func_ref");
+ e.enc64_instp(call, rec_call_id.opcodes(&CALL_RELATIVE), is_colocated_func);
+
+ // 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version, since non-PIC
+ // is currently using the large model, which requires calls be lowered to
+ // func_addr+call_indirect.
+ e.enc64_isap(call, rec_call_plt_id.opcodes(&CALL_RELATIVE), is_pic);
+
+ e.enc32(
+ call_indirect.bind(I32),
+ rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
+ );
+ e.enc64(
+ call_indirect.bind(I64),
+ rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2).rex(),
+ );
+ e.enc64(
+ call_indirect.bind(I64),
+ rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2),
+ );
+
+ e.enc32(return_, rec_ret.opcodes(&RET_NEAR));
+ e.enc64(return_, rec_ret.opcodes(&RET_NEAR));
+
+ // Branches.
+ e.enc32(jump, rec_jmpb.opcodes(&JUMP_SHORT));
+ e.enc64(jump, rec_jmpb.opcodes(&JUMP_SHORT));
+ e.enc32(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
+ e.enc64(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE));
+
+ e.enc_both(brif, rec_brib.opcodes(&JUMP_SHORT_IF_OVERFLOW));
+ e.enc_both(brif, rec_brid.opcodes(&JUMP_NEAR_IF_OVERFLOW));
+
+ // Not all float condition codes are legal, see `supported_floatccs`.
+ e.enc_both(brff, rec_brfb.opcodes(&JUMP_SHORT_IF_OVERFLOW));
+ e.enc_both(brff, rec_brfd.opcodes(&JUMP_NEAR_IF_OVERFLOW));
+
+ // Note that the tjccd opcode will be prefixed with 0x0f.
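+ // (The TEST_BYTE_REG/TEST_REG opcode constants are presumably 0x84/0x85; with the 0x0f
+ // prefix they encode the near JE/JNE rel32 jumps rather than TEST instructions.)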
+ e.enc_i32_i64_explicit_rex(brz, rec_tjccb.opcodes(&JUMP_SHORT_IF_EQUAL));
+ e.enc_i32_i64_explicit_rex(brz, rec_tjccd.opcodes(&TEST_BYTE_REG));
+ e.enc_i32_i64_explicit_rex(brnz, rec_tjccb.opcodes(&JUMP_SHORT_IF_NOT_EQUAL));
+ e.enc_i32_i64_explicit_rex(brnz, rec_tjccd.opcodes(&TEST_REG));
+
+ // Branch on a b1 value in a register only looks at the low 8 bits. See also
+ // bint encodings below.
+ //
+ // Start with the worst-case encoding for X86_32 only. The register allocator
+ // can't handle a branch with an ABCD-constrained operand.
+ e.enc32(brz.bind(B1), rec_t8jccd_long.opcodes(&TEST_BYTE_REG));
+ e.enc32(brnz.bind(B1), rec_t8jccd_long.opcodes(&TEST_REG));
+
+ e.enc_both(brz.bind(B1), rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_EQUAL));
+ e.enc_both(brz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_BYTE_REG));
+ e.enc_both(
+ brnz.bind(B1),
+ rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_NOT_EQUAL),
+ );
+ e.enc_both(brnz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_REG));
+
+ // Jump tables.
+ e.enc64(
+ jump_table_entry.bind(I64),
+ rec_jt_entry.opcodes(&MOVSXD).rex().w(),
+ );
+ e.enc32(jump_table_entry.bind(I32), rec_jt_entry.opcodes(&MOV_LOAD));
+
+ e.enc64(
+ jump_table_base.bind(I64),
+ rec_jt_base.opcodes(&LEA).rex().w(),
+ );
+ e.enc32(jump_table_base.bind(I32), rec_jt_base.opcodes(&LEA));
+
+ e.enc_x86_64(
+ indirect_jump_table_br.bind(I64),
+ rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
+ );
+ e.enc32(
+ indirect_jump_table_br.bind(I32),
+ rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4),
+ );
+
+ // Trap as ud2
+ e.enc32(trap, rec_trap.opcodes(&UNDEFINED2));
+ e.enc64(trap, rec_trap.opcodes(&UNDEFINED2));
+ e.enc32(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
+ e.enc64(resumable_trap, rec_trap.opcodes(&UNDEFINED2));
+
+ // Debug trap as int3
+ e.enc32_rec(debugtrap, rec_debugtrap, 0);
+ e.enc64_rec(debugtrap, rec_debugtrap, 0);
+
+ e.enc32_rec(trapif, rec_trapif, 0);
+ e.enc64_rec(trapif, rec_trapif, 0);
+ e.enc32_rec(trapff, rec_trapff, 0);
+ e.enc64_rec(trapff, rec_trapff, 0);
+}
+
+/// Reference type instructions.
+#[inline(never)]
+fn define_reftypes(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) {
+ let shared = &shared_defs.instructions;
+
+ let is_null = shared.by_name("is_null");
+ let is_invalid = shared.by_name("is_invalid");
+ let null = shared.by_name("null");
+ let safepoint = shared.by_name("safepoint");
+
+ let rec_is_zero = r.template("is_zero");
+ let rec_is_invalid = r.template("is_invalid");
+ let rec_pu_id_ref = r.template("pu_id_ref");
+ let rec_safepoint = r.recipe("safepoint");
+
+ // Null references implemented as iconst 0.
+ e.enc32(null.bind(R32), rec_pu_id_ref.opcodes(&MOV_IMM));
+
+ e.enc64(null.bind(R64), rec_pu_id_ref.rex().opcodes(&MOV_IMM));
+ e.enc64(null.bind(R64), rec_pu_id_ref.opcodes(&MOV_IMM));
+
+ // is_null, implemented by testing whether the value is 0.
+ e.enc_r32_r64_rex_only(is_null, rec_is_zero.opcodes(&TEST_REG));
+
+ // is_invalid, implemented by testing whether the value is -1.
+ e.enc_r32_r64_rex_only(is_invalid, rec_is_invalid.opcodes(&CMP_IMM8).rrr(7));
+
+ // safepoint instruction calls sink, no actual encoding.
+ e.enc32_rec(safepoint, rec_safepoint, 0);
+ e.enc64_rec(safepoint, rec_safepoint, 0);
+}
+
+#[allow(clippy::cognitive_complexity)]
+pub(crate) fn define(
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) -> PerCpuModeEncodings {
+ // Definitions.
+ let mut e = PerCpuModeEncodings::new();
+
+ define_moves(&mut e, shared_defs, r);
+ define_memory(&mut e, shared_defs, x86, r);
+ define_fpu_moves(&mut e, shared_defs, r);
+ define_fpu_memory(&mut e, shared_defs, r);
+ define_fpu_ops(&mut e, shared_defs, settings, x86, r);
+ define_alu(&mut e, shared_defs, settings, x86, r);
+ define_simd(&mut e, shared_defs, settings, x86, r);
+ define_entity_ref(&mut e, shared_defs, settings, r);
+ define_control_flow(&mut e, shared_defs, settings, r);
+ define_reftypes(&mut e, shared_defs, r);
+
+ let x86_elf_tls_get_addr = x86.by_name("x86_elf_tls_get_addr");
+ let x86_macho_tls_get_addr = x86.by_name("x86_macho_tls_get_addr");
+
+ let rec_elf_tls_get_addr = r.recipe("elf_tls_get_addr");
+ let rec_macho_tls_get_addr = r.recipe("macho_tls_get_addr");
+
+ e.enc64_rec(x86_elf_tls_get_addr, rec_elf_tls_get_addr, 0);
+ e.enc64_rec(x86_macho_tls_get_addr, rec_macho_tls_get_addr, 0);
+
+ e
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs
new file mode 100644
index 0000000000..7acd2e2c50
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs
@@ -0,0 +1,723 @@
+#![allow(non_snake_case)]
+
+use crate::cdsl::instructions::{
+ AllInstructions, InstructionBuilder as Inst, InstructionGroup, InstructionGroupBuilder,
+};
+use crate::cdsl::operands::Operand;
+use crate::cdsl::types::ValueType;
+use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar};
+use crate::shared::entities::EntityRefs;
+use crate::shared::formats::Formats;
+use crate::shared::immediates::Immediates;
+use crate::shared::types;
+
+#[allow(clippy::many_single_char_names)]
+pub(crate) fn define(
+ mut all_instructions: &mut AllInstructions,
+ formats: &Formats,
+ immediates: &Immediates,
+ entities: &EntityRefs,
+) -> InstructionGroup {
+ let mut ig = InstructionGroupBuilder::new(&mut all_instructions);
+
+ let iflags: &TypeVar = &ValueType::Special(types::Flag::IFlags.into()).into();
+
+ let iWord = &TypeVar::new(
+ "iWord",
+ "A scalar integer machine word",
+ TypeSetBuilder::new().ints(32..64).build(),
+ );
+ let nlo = &Operand::new("nlo", iWord).with_doc("Low part of numerator");
+ let nhi = &Operand::new("nhi", iWord).with_doc("High part of numerator");
+ let d = &Operand::new("d", iWord).with_doc("Denominator");
+ let q = &Operand::new("q", iWord).with_doc("Quotient");
+ let r = &Operand::new("r", iWord).with_doc("Remainder");
+
+ ig.push(
+ Inst::new(
+ "x86_udivmodx",
+ r#"
+ Extended unsigned division.
+
+ Concatenate the bits in `nhi` and `nlo` to form the numerator.
+ Interpret the bits as an unsigned number and divide by the unsigned
+ denominator `d`. Trap when `d` is zero or if the quotient is larger
+ than the range of the output.
+
+ Return both quotient and remainder.
+ "#,
+ &formats.ternary,
+ )
+ .operands_in(vec![nlo, nhi, d])
+ .operands_out(vec![q, r])
+ .can_trap(true),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_sdivmodx",
+ r#"
+ Extended signed division.
+
+ Concatenate the bits in `nhi` and `nlo` to form the numerator.
+ Interpret the bits as a signed number and divide by the signed
+ denominator `d`. Trap when `d` is zero or if the quotient is outside
+ the range of the output.
+
+ Return both quotient and remainder.
+ "#,
+ &formats.ternary,
+ )
+ .operands_in(vec![nlo, nhi, d])
+ .operands_out(vec![q, r])
+ .can_trap(true),
+ );
+
+ let argL = &Operand::new("argL", iWord);
+ let argR = &Operand::new("argR", iWord);
+ let resLo = &Operand::new("resLo", iWord);
+ let resHi = &Operand::new("resHi", iWord);
+
+ ig.push(
+ Inst::new(
+ "x86_umulx",
+ r#"
+ Unsigned integer multiplication, producing a double-length result.
+
+ Polymorphic over all scalar integer types, but does not support vector
+ types.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![argL, argR])
+ .operands_out(vec![resLo, resHi]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_smulx",
+ r#"
+ Signed integer multiplication, producing a double-length result.
+
+ Polymorphic over all scalar integer types, but does not support vector
+ types.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![argL, argR])
+ .operands_out(vec![resLo, resHi]),
+ );
+
+ let Float = &TypeVar::new(
+ "Float",
+ "A scalar or vector floating point number",
+ TypeSetBuilder::new()
+ .floats(Interval::All)
+ .simd_lanes(Interval::All)
+ .build(),
+ );
+ let IntTo = &TypeVar::new(
+ "IntTo",
+ "An integer type with the same number of lanes",
+ TypeSetBuilder::new()
+ .ints(32..64)
+ .simd_lanes(Interval::All)
+ .build(),
+ );
+ let x = &Operand::new("x", Float);
+ let a = &Operand::new("a", IntTo);
+
+ ig.push(
+ Inst::new(
+ "x86_cvtt2si",
+ r#"
+ Convert with truncation floating point to signed integer.
+
+ The source floating point operand is converted to a signed integer by
+ rounding towards zero. If the result can't be represented in the output
+ type, returns the smallest signed value the output type can represent.
+
+ This instruction does not trap.
+ "#,
+ &formats.unary,
+ )
+ .operands_in(vec![x])
+ .operands_out(vec![a]),
+ );
+
+ let f32x4 = &TypeVar::new(
+ "f32x4",
+ "A floating point number",
+ TypeSetBuilder::new()
+ .floats(32..32)
+ .simd_lanes(4..4)
+ .build(),
+ );
+ let i32x4 = &TypeVar::new(
+ "i32x4",
+ "An integer type with the same number of lanes",
+ TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(),
+ );
+ let x = &Operand::new("x", i32x4);
+ let a = &Operand::new("a", f32x4);
+
+ ig.push(
+ Inst::new(
+ "x86_vcvtudq2ps",
+ r#"
+ Convert unsigned integer to floating point.
+
+ Convert packed doubleword unsigned integers to packed single-precision floating-point
+ values. This instruction does not trap.
+ "#,
+ &formats.unary,
+ )
+ .operands_in(vec![x])
+ .operands_out(vec![a]),
+ );
+
+ let x = &Operand::new("x", Float);
+ let a = &Operand::new("a", Float);
+ let y = &Operand::new("y", Float);
+
+ ig.push(
+ Inst::new(
+ "x86_fmin",
+ r#"
+ Floating point minimum with x86 semantics.
+
+ This is equivalent to the C ternary operator `x < y ? x : y` which
+ differs from `fmin` when either operand is NaN or when comparing
+ +0.0 to -0.0.
+
+ When the two operands don't compare as LT, `y` is returned unchanged,
+ even if it is a signalling NaN.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_fmax",
+ r#"
+ Floating point maximum with x86 semantics.
+
+ This is equivalent to the C ternary operator `x > y ? x : y` which
+ differs from `fmax` when either operand is NaN or when comparing
+ +0.0 to -0.0.
+
+ When the two operands don't compare as GT, `y` is returned unchanged,
+ even if it is a signalling NaN.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let x = &Operand::new("x", iWord);
+
+ ig.push(
+ Inst::new(
+ "x86_push",
+ r#"
+ Pushes a value onto the stack.
+
+ Decrements the stack pointer and stores the specified value on to the top.
+
+ This is polymorphic in i32 and i64. However, it is only implemented for i64
+ in 64-bit mode, and only for i32 in 32-bit mode.
+ "#,
+ &formats.unary,
+ )
+ .operands_in(vec![x])
+ .other_side_effects(true)
+ .can_store(true),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pop",
+ r#"
+ Pops a value from the stack.
+
+ Loads a value from the top of the stack and then increments the stack
+ pointer.
+
+ This is polymorphic in i32 and i64. However, it is only implemented for i64
+ in 64-bit mode, and only for i32 in 32-bit mode.
+ "#,
+ &formats.nullary,
+ )
+ .operands_out(vec![x])
+ .other_side_effects(true)
+ .can_load(true),
+ );
+
+ let y = &Operand::new("y", iWord);
+ let rflags = &Operand::new("rflags", iflags);
+
+ ig.push(
+ Inst::new(
+ "x86_bsr",
+ r#"
+ Bit Scan Reverse -- returns the bit-index of the most significant 1
+ in the word. Result is undefined if the argument is zero. However, it
+ sets the Z flag depending on the argument, so it is at least easy to
+ detect and handle that case.
+
+ This is polymorphic in i32 and i64. It is implemented for both i64 and
+ i32 in 64-bit mode, and only for i32 in 32-bit mode.
+ "#,
+ &formats.unary,
+ )
+ .operands_in(vec![x])
+ .operands_out(vec![y, rflags]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_bsf",
+ r#"
+ Bit Scan Forwards -- returns the bit-index of the least significant 1
+ in the word. It is otherwise identical to 'bsr', just above.
+ "#,
+ &formats.unary,
+ )
+ .operands_in(vec![x])
+ .operands_out(vec![y, rflags]),
+ );
+
+ let uimm8 = &immediates.uimm8;
+ let TxN = &TypeVar::new(
+ "TxN",
+ "A SIMD vector type",
+ TypeSetBuilder::new()
+ .ints(Interval::All)
+ .floats(Interval::All)
+ .bools(Interval::All)
+ .simd_lanes(Interval::All)
+ .includes_scalars(false)
+ .build(),
+ );
+ let a = &Operand::new("a", TxN).with_doc("A vector value (i.e. held in an XMM register)");
+ let b = &Operand::new("b", TxN).with_doc("A vector value (i.e. held in an XMM register)");
+ let i = &Operand::new("i", uimm8).with_doc("An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details");
+
+ ig.push(
+ Inst::new(
+ "x86_pshufd",
+ r#"
+ Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended
+ register and re-orders the data according to the passed immediate byte.
+ "#,
+ &formats.binary_imm8,
+ )
+ .operands_in(vec![a, i]) // TODO allow copying from memory here (need more permissive type than TxN)
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pshufb",
+ r#"
+ Packed Shuffle Bytes -- re-orders data in an extended register using a shuffle
+ mask from either memory or another extended register
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![a, b]) // TODO allow re-ordering from memory here (need more permissive type than TxN)
+ .operands_out(vec![a]),
+ );
+
+ let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b");
+ ig.push(
+ Inst::new(
+ "x86_pblendw",
+ r#"
+ Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a
+ lane in ``b``: if the bit is set, the lane is copied into ``a``.
+ "#,
+ &formats.ternary_imm8,
+ )
+ .operands_in(vec![a, b, mask])
+ .operands_out(vec![a]),
+ );
+
+ let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index");
+ let x = &Operand::new("x", TxN);
+ let a = &Operand::new("a", &TxN.lane_of());
+
+ ig.push(
+ Inst::new(
+ "x86_pextr",
+ r#"
+ Extract lane ``Idx`` from ``x``.
+ The lane index, ``Idx``, is an immediate value, not an SSA value. It
+ must indicate a valid lane index for the type of ``x``.
+ "#,
+ &formats.binary_imm8,
+ )
+ .operands_in(vec![x, Idx])
+ .operands_out(vec![a]),
+ );
+
+ let IBxN = &TypeVar::new(
+ "IBxN",
+ "A SIMD vector type containing only booleans and integers",
+ TypeSetBuilder::new()
+ .ints(Interval::All)
+ .bools(Interval::All)
+ .simd_lanes(Interval::All)
+ .includes_scalars(false)
+ .build(),
+ );
+ let x = &Operand::new("x", IBxN);
+ let y = &Operand::new("y", &IBxN.lane_of()).with_doc("New lane value");
+ let a = &Operand::new("a", IBxN);
+
+ ig.push(
+ Inst::new(
+ "x86_pinsr",
+ r#"
+ Insert ``y`` into ``x`` at lane ``Idx``.
+ The lane index, ``Idx``, is an immediate value, not an SSA value. It
+ must indicate a valid lane index for the type of ``x``.
+ "#,
+ &formats.ternary_imm8,
+ )
+ .operands_in(vec![x, y, Idx])
+ .operands_out(vec![a]),
+ );
+
+ let FxN = &TypeVar::new(
+ "FxN",
+ "A SIMD vector type containing floats",
+ TypeSetBuilder::new()
+ .floats(Interval::All)
+ .simd_lanes(Interval::All)
+ .includes_scalars(false)
+ .build(),
+ );
+ let x = &Operand::new("x", FxN);
+ let y = &Operand::new("y", &FxN.lane_of()).with_doc("New lane value");
+ let a = &Operand::new("a", FxN);
+
+ ig.push(
+ Inst::new(
+ "x86_insertps",
+ r#"
+ Insert a lane of ``y`` into ``x``, using ``Idx`` to encode both which lane the value is
+ extracted from and which it is inserted to. This is similar to x86_pinsr but inserts
+ floats, which are already stored in an XMM register.
+ "#,
+ &formats.ternary_imm8,
+ )
+ .operands_in(vec![x, y, Idx])
+ .operands_out(vec![a]),
+ );
+
+ let x = &Operand::new("x", TxN);
+ let y = &Operand::new("y", TxN);
+ let a = &Operand::new("a", TxN);
+
+ ig.push(
+ Inst::new(
+ "x86_punpckh",
+ r#"
+ Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
+ i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
+ would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane
+ ordering).
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_punpckl",
+ r#"
+ Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
+ i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
+ would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane
+ ordering).
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let x = &Operand::new("x", FxN);
+ let y = &Operand::new("y", FxN);
+ let a = &Operand::new("a", FxN);
+
+ ig.push(
+ Inst::new(
+ "x86_movsd",
+ r#"
+ Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x``
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_movlhps",
+ r#"
+ Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x``
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let IxN = &TypeVar::new(
+ "IxN",
+ "A SIMD vector type containing integers",
+ TypeSetBuilder::new()
+ .ints(Interval::All)
+ .simd_lanes(Interval::All)
+ .includes_scalars(false)
+ .build(),
+ );
+ let I128 = &TypeVar::new(
+ "I128",
+ "A SIMD vector type containing one large integer (due to Cranelift type constraints, \
+ this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \
+ upper lane is concatenated with the lower lane to form the integer)",
+ TypeSetBuilder::new()
+ .ints(64..64)
+ .simd_lanes(2..2)
+ .includes_scalars(false)
+ .build(),
+ );
+
+ let x = &Operand::new("x", IxN).with_doc("Vector value to shift");
+ let y = &Operand::new("y", I128).with_doc("Number of bits to shift");
+ let a = &Operand::new("a", IxN);
+
+ ig.push(
+ Inst::new(
+ "x86_psll",
+ r#"
+ Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
+ ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
+ family of instructions.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_psrl",
+ r#"
+ Shift Packed Data Right Logical -- This implements the behavior of the shared instruction
+ ``ushr`` but alters the shift operand to live in an XMM register as expected by the PSRL*
+ family of instructions.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_psra",
+ r#"
+ Shift Packed Data Right Arithmetic -- This implements the behavior of the shared
+ instruction ``sshr`` but alters the shift operand to live in an XMM register as expected by
+ the PSRA* family of instructions.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let I64x2 = &TypeVar::new(
+ "I64x2",
+ "A SIMD vector type containing two 64-bit integers",
+ TypeSetBuilder::new()
+ .ints(64..64)
+ .simd_lanes(2..2)
+ .includes_scalars(false)
+ .build(),
+ );
+
+ let x = &Operand::new("x", I64x2);
+ let y = &Operand::new("y", I64x2);
+ let a = &Operand::new("a", I64x2);
+ ig.push(
+ Inst::new(
+ "x86_pmullq",
+ r#"
+ Multiply Packed Integers -- Multiply two 64x2 integers and receive a 64x2 result with
+ lane-wise wrapping if the result overflows. This instruction is necessary to add distinct
+ encodings for CPUs with newer vector features.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pmuludq",
+ r#"
+ Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2
+ unsigned integers and receive a 64x2 result. This instruction avoids the need for handling
+ overflow as in `x86_pmullq`.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let x = &Operand::new("x", TxN);
+ let y = &Operand::new("y", TxN);
+ let f = &Operand::new("f", iflags);
+ ig.push(
+ Inst::new(
+ "x86_ptest",
+ r#"
+ Logical Compare -- PTEST sets the ZF flag if all bits are 0 in the result of the bitwise
+ AND of the first source operand (first operand) and the second source operand (second
+ operand). PTEST sets the CF flag if all bits are 0 in the result of the bitwise AND of
+ the second source operand (second operand) and the logical NOT of the destination
+ operand (first operand).
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![f]),
+ );
+
+ let x = &Operand::new("x", IxN);
+ let y = &Operand::new("y", IxN);
+ let a = &Operand::new("a", IxN);
+ ig.push(
+ Inst::new(
+ "x86_pmaxs",
+ r#"
+ Maximum of Packed Signed Integers -- Compare signed integers in the first and second
+ operand and return the maximum values.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pmaxu",
+ r#"
+ Maximum of Packed Unsigned Integers -- Compare unsigned integers in the first and second
+ operand and return the maximum values.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pmins",
+ r#"
+ Minimum of Packed Signed Integers -- Compare signed integers in the first and second
+ operand and return the minimum values.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ ig.push(
+ Inst::new(
+ "x86_pminu",
+ r#"
+ Minimum of Packed Unsigned Integers -- Compare unsigned integers in the first and second
+ operand and return the minimum values.
+ "#,
+ &formats.binary,
+ )
+ .operands_in(vec![x, y])
+ .operands_out(vec![a]),
+ );
+
+ let c = &Operand::new("c", uimm8)
+ .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
+ ig.push(
+ Inst::new(
+ "x86_palignr",
+ r#"
+ Concatenate destination and source operands, extracting a byte-aligned result shifted to
+ the right by `c`.
+ "#,
+ &formats.ternary_imm8,
+ )
+ .operands_in(vec![x, y, c])
+ .operands_out(vec![a]),
+ );
+
+ let i64_t = &TypeVar::new(
+ "i64_t",
+ "A scalar 64bit integer",
+ TypeSetBuilder::new().ints(64..64).build(),
+ );
+
+ let GV = &Operand::new("GV", &entities.global_value);
+ let addr = &Operand::new("addr", i64_t);
+
+ ig.push(
+ Inst::new(
+ "x86_elf_tls_get_addr",
+ r#"
+ Elf tls get addr -- This implements the GD TLS model for ELF. The clobber output should
+ not be used.
+ "#,
+ &formats.unary_global_value,
+ )
+ // This is a bit overly broad to mark as clobbering *all* the registers, because it should
+ // only preserve caller-saved registers. There's no way to indicate this to register
+ // allocation yet, though, so mark as clobbering all registers instead.
+ .clobbers_all_regs(true)
+ .operands_in(vec![GV])
+ .operands_out(vec![addr]),
+ );
+ ig.push(
+ Inst::new(
+ "x86_macho_tls_get_addr",
+ r#"
+ Mach-O tls get addr -- This implements TLS access for Mach-O. The clobber output should
+ not be used.
+ "#,
+ &formats.unary_global_value,
+ )
+ // See above comment for x86_elf_tls_get_addr.
+ .clobbers_all_regs(true)
+ .operands_in(vec![GV])
+ .operands_out(vec![addr]),
+ );
+
+ ig.build()
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs
new file mode 100644
index 0000000000..681b3104d5
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs
@@ -0,0 +1,829 @@
+use crate::cdsl::ast::{constant, var, ExprBuilder, Literal};
+use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
+use crate::cdsl::types::{LaneType, ValueType};
+use crate::cdsl::xform::TransformGroupBuilder;
+use crate::shared::types::Float::{F32, F64};
+use crate::shared::types::Int::{I16, I32, I64, I8};
+use crate::shared::Definitions as SharedDefinitions;
+
+#[allow(clippy::many_single_char_names)]
+pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
+ let mut expand = TransformGroupBuilder::new(
+ "x86_expand",
+ r#"
+ Legalize instructions by expansion.
+
+ Use x86-specific instructions if needed."#,
+ )
+ .isa("x86")
+ .chain_with(shared.transform_groups.by_name("expand_flags").id);
+
+ let mut narrow = TransformGroupBuilder::new(
+ "x86_narrow",
+ r#"
+ Legalize instructions by narrowing.
+
+ Use x86-specific instructions if needed."#,
+ )
+ .isa("x86")
+ .chain_with(shared.transform_groups.by_name("narrow_flags").id);
+
+ let mut narrow_avx = TransformGroupBuilder::new(
+ "x86_narrow_avx",
+ r#"
+ Legalize instructions by narrowing with CPU feature checks.
+
+ This special case converts using x86 AVX instructions where available."#,
+ )
+ .isa("x86");
+ // We cannot chain with the x86_narrow group until this group is built, see bottom of this
+ // function for where this is chained.
+
+ let mut widen = TransformGroupBuilder::new(
+ "x86_widen",
+ r#"
+ Legalize instructions by widening.
+
+ Use x86-specific instructions if needed."#,
+ )
+ .isa("x86")
+ .chain_with(shared.transform_groups.by_name("widen").id);
+
+ // List of instructions.
+ let insts = &shared.instructions;
+ let band = insts.by_name("band");
+ let bor = insts.by_name("bor");
+ let clz = insts.by_name("clz");
+ let ctz = insts.by_name("ctz");
+ let fcmp = insts.by_name("fcmp");
+ let fcvt_from_uint = insts.by_name("fcvt_from_uint");
+ let fcvt_to_sint = insts.by_name("fcvt_to_sint");
+ let fcvt_to_uint = insts.by_name("fcvt_to_uint");
+ let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
+ let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
+ let fmax = insts.by_name("fmax");
+ let fmin = insts.by_name("fmin");
+ let iadd = insts.by_name("iadd");
+ let iconst = insts.by_name("iconst");
+ let imul = insts.by_name("imul");
+ let ineg = insts.by_name("ineg");
+ let isub = insts.by_name("isub");
+ let ishl = insts.by_name("ishl");
+ let ireduce = insts.by_name("ireduce");
+ let popcnt = insts.by_name("popcnt");
+ let sdiv = insts.by_name("sdiv");
+ let selectif = insts.by_name("selectif");
+ let smulhi = insts.by_name("smulhi");
+ let srem = insts.by_name("srem");
+ let tls_value = insts.by_name("tls_value");
+ let udiv = insts.by_name("udiv");
+ let umulhi = insts.by_name("umulhi");
+ let ushr = insts.by_name("ushr");
+ let ushr_imm = insts.by_name("ushr_imm");
+ let urem = insts.by_name("urem");
+
+ let x86_bsf = x86_instructions.by_name("x86_bsf");
+ let x86_bsr = x86_instructions.by_name("x86_bsr");
+ let x86_umulx = x86_instructions.by_name("x86_umulx");
+ let x86_smulx = x86_instructions.by_name("x86_smulx");
+
+ let imm = &shared.imm;
+
+ // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
+ // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
+ // not encodable.
+ let a = var("a");
+ let x = var("x");
+ let y = var("y");
+ let z = var("z");
+
+ for &ty in &[I8, I16, I32] {
+ let ishl_by_i64 = ishl.bind(ty).bind(I64);
+ let ireduce = ireduce.bind(I32);
+ expand.legalize(
+ def!(a = ishl_by_i64(x, y)),
+ vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
+ );
+ }
+
+ for &ty in &[I8, I16, I32] {
+ let ushr_by_i64 = ushr.bind(ty).bind(I64);
+ let ireduce = ireduce.bind(I32);
+ expand.legalize(
+ def!(a = ushr_by_i64(x, y)),
+ vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
+ );
+ }
+
+ // Division and remainder.
+ //
+ // The srem expansion requires custom code because srem INT_MIN, -1 is not
+ // allowed to trap. The other ops need to check avoid_div_traps.
+ expand.custom_legalize(sdiv, "expand_sdivrem");
+ expand.custom_legalize(srem, "expand_sdivrem");
+ expand.custom_legalize(udiv, "expand_udivrem");
+ expand.custom_legalize(urem, "expand_udivrem");
+
+ // Double length (widening) multiplication.
+ let a = var("a");
+ let x = var("x");
+ let y = var("y");
+ let a1 = var("a1");
+ let a2 = var("a2");
+ let res_lo = var("res_lo");
+ let res_hi = var("res_hi");
+
+ expand.legalize(
+ def!(res_hi = umulhi(x, y)),
+ vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
+ );
+
+ expand.legalize(
+ def!(res_hi = smulhi(x, y)),
+ vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
+ );
+
+ // Floating point condition codes.
+ //
+ // The 8 condition codes in `supported_floatccs` are directly supported by a
+ // `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
+ // patterns.
+
+ let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq");
+ let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord");
+ let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq");
+ let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne");
+ let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno");
+ let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");
+
+ // Equality needs an explicit `ord` test which checks the parity bit.
+ expand.legalize(
+ def!(a = fcmp(floatcc_eq, x, y)),
+ vec![
+ def!(a1 = fcmp(floatcc_ord, x, y)),
+ def!(a2 = fcmp(floatcc_ueq, x, y)),
+ def!(a = band(a1, a2)),
+ ],
+ );
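+ // Likewise, `ne` (unordered or not equal) is the disjunction of `uno` and `one`.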
+ expand.legalize(
+ def!(a = fcmp(floatcc_ne, x, y)),
+ vec![
+ def!(a1 = fcmp(floatcc_uno, x, y)),
+ def!(a2 = fcmp(floatcc_one, x, y)),
+ def!(a = bor(a1, a2)),
+ ],
+ );
+
+ let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt");
+ let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt");
+ let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le");
+ let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge");
+ let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt");
+ let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult");
+ let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge");
+ let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule");
+
+ // Inequalities that need to be reversed.
+ for &(cc, rev_cc) in &[
+ (floatcc_lt, floatcc_gt),
+ (floatcc_le, floatcc_ge),
+ (floatcc_ugt, floatcc_ult),
+ (floatcc_uge, floatcc_ule),
+ ] {
+ expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
+ }
+
+ // We need to modify the CFG for min/max legalization.
+ expand.custom_legalize(fmin, "expand_minmax");
+ expand.custom_legalize(fmax, "expand_minmax");
+
+ // Conversions from unsigned need special handling.
+ expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
+ // Conversions from float to int can trap and modify the control flow graph.
+ expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
+ expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
+ expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
+ expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
+
+ // Count leading and trailing zeroes, for baseline x86_64
+ let c_minus_one = var("c_minus_one");
+ let c_thirty_one = var("c_thirty_one");
+ let c_thirty_two = var("c_thirty_two");
+ let c_sixty_three = var("c_sixty_three");
+ let c_sixty_four = var("c_sixty_four");
+ let index1 = var("index1");
+ let r2flags = var("r2flags");
+ let index2 = var("index2");
+
+ let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
+ let imm64_minus_one = Literal::constant(&imm.imm64, -1);
+ let imm64_63 = Literal::constant(&imm.imm64, 63);
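+ // clz(x) is computed as (bit width - 1) - bsr(x). When x is zero, BSR leaves the index
+ // undefined but sets ZF, so `selectif` substitutes -1 and the result becomes the bit width.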
+ expand.legalize(
+ def!(a = clz.I64(x)),
+ vec![
+ def!(c_minus_one = iconst(imm64_minus_one)),
+ def!(c_sixty_three = iconst(imm64_63)),
+ def!((index1, r2flags) = x86_bsr(x)),
+ def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
+ def!(a = isub(c_sixty_three, index2)),
+ ],
+ );
+
+ let imm64_31 = Literal::constant(&imm.imm64, 31);
+ expand.legalize(
+ def!(a = clz.I32(x)),
+ vec![
+ def!(c_minus_one = iconst(imm64_minus_one)),
+ def!(c_thirty_one = iconst(imm64_31)),
+ def!((index1, r2flags) = x86_bsr(x)),
+ def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
+ def!(a = isub(c_thirty_one, index2)),
+ ],
+ );
+
+ let imm64_64 = Literal::constant(&imm.imm64, 64);
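+ // ctz(x) is bsf(x) directly; when x is zero, BSF sets ZF, so `selectif` substitutes the
+ // bit width as the result.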
+ expand.legalize(
+ def!(a = ctz.I64(x)),
+ vec![
+ def!(c_sixty_four = iconst(imm64_64)),
+ def!((index1, r2flags) = x86_bsf(x)),
+ def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)),
+ ],
+ );
+
+ let imm64_32 = Literal::constant(&imm.imm64, 32);
+ expand.legalize(
+ def!(a = ctz.I32(x)),
+ vec![
+ def!(c_thirty_two = iconst(imm64_32)),
+ def!((index1, r2flags) = x86_bsf(x)),
+ def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)),
+ ],
+ );
+
+ // Population count for baseline x86_64
+ let x = var("x");
+ let r = var("r");
+
+ let qv3 = var("qv3");
+ let qv4 = var("qv4");
+ let qv5 = var("qv5");
+ let qv6 = var("qv6");
+ let qv7 = var("qv7");
+ let qv8 = var("qv8");
+ let qv9 = var("qv9");
+ let qv10 = var("qv10");
+ let qv11 = var("qv11");
+ let qv12 = var("qv12");
+ let qv13 = var("qv13");
+ let qv14 = var("qv14");
+ let qv15 = var("qv15");
+ let qc77 = var("qc77");
+ #[allow(non_snake_case)]
+ let qc0F = var("qc0F");
+ let qc01 = var("qc01");
+
+ let imm64_1 = Literal::constant(&imm.imm64, 1);
+ let imm64_4 = Literal::constant(&imm.imm64, 4);
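+ // This expansion is a branchless SWAR popcount: the shift/mask/subtract rounds reduce the
+ // value to a per-nibble bit count (the 0x77... mask keeps the low three bits of each
+ // nibble), the 0x0F... mask folds that into per-byte counts, and multiplying by 0x01...
+ // sums every byte into the most significant byte, which the final shift by 56 extracts.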
+ expand.legalize(
+ def!(r = popcnt.I64(x)),
+ vec![
+ def!(qv3 = ushr_imm(x, imm64_1)),
+ def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777_7777_7777))),
+ def!(qv4 = band(qv3, qc77)),
+ def!(qv5 = isub(x, qv4)),
+ def!(qv6 = ushr_imm(qv4, imm64_1)),
+ def!(qv7 = band(qv6, qc77)),
+ def!(qv8 = isub(qv5, qv7)),
+ def!(qv9 = ushr_imm(qv7, imm64_1)),
+ def!(qv10 = band(qv9, qc77)),
+ def!(qv11 = isub(qv8, qv10)),
+ def!(qv12 = ushr_imm(qv11, imm64_4)),
+ def!(qv13 = iadd(qv11, qv12)),
+ def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F_0F0F_0F0F))),
+ def!(qv14 = band(qv13, qc0F)),
+ def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101_0101_0101))),
+ def!(qv15 = imul(qv14, qc01)),
+ def!(r = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))),
+ ],
+ );
+
+ let lv3 = var("lv3");
+ let lv4 = var("lv4");
+ let lv5 = var("lv5");
+ let lv6 = var("lv6");
+ let lv7 = var("lv7");
+ let lv8 = var("lv8");
+ let lv9 = var("lv9");
+ let lv10 = var("lv10");
+ let lv11 = var("lv11");
+ let lv12 = var("lv12");
+ let lv13 = var("lv13");
+ let lv14 = var("lv14");
+ let lv15 = var("lv15");
+ let lc77 = var("lc77");
+ #[allow(non_snake_case)]
+ let lc0F = var("lc0F");
+ let lc01 = var("lc01");
+
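+ // Same SWAR scheme as the I64 popcount above, with 32-bit masks and a final shift of 24.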
+ expand.legalize(
+ def!(r = popcnt.I32(x)),
+ vec![
+ def!(lv3 = ushr_imm(x, imm64_1)),
+ def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777))),
+ def!(lv4 = band(lv3, lc77)),
+ def!(lv5 = isub(x, lv4)),
+ def!(lv6 = ushr_imm(lv4, imm64_1)),
+ def!(lv7 = band(lv6, lc77)),
+ def!(lv8 = isub(lv5, lv7)),
+ def!(lv9 = ushr_imm(lv7, imm64_1)),
+ def!(lv10 = band(lv9, lc77)),
+ def!(lv11 = isub(lv8, lv10)),
+ def!(lv12 = ushr_imm(lv11, imm64_4)),
+ def!(lv13 = iadd(lv11, lv12)),
+ def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F))),
+ def!(lv14 = band(lv13, lc0F)),
+ def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101))),
+ def!(lv15 = imul(lv14, lc01)),
+ def!(r = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))),
+ ],
+ );
+
+ expand.custom_legalize(ineg, "convert_ineg");
+ expand.custom_legalize(tls_value, "expand_tls_value");
+ widen.custom_legalize(ineg, "convert_ineg");
+
+ // To reduce compilation times, separate out large blocks of legalizations by theme.
+ define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
+
+ expand.build_and_add_to(&mut shared.transform_groups);
+ let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
+ narrow_avx
+ .chain_with(narrow_id)
+ .build_and_add_to(&mut shared.transform_groups);
+ widen.build_and_add_to(&mut shared.transform_groups);
+}
+
+fn define_simd(
+ shared: &mut SharedDefinitions,
+ x86_instructions: &InstructionGroup,
+ narrow: &mut TransformGroupBuilder,
+ narrow_avx: &mut TransformGroupBuilder,
+) {
+ let insts = &shared.instructions;
+ let band = insts.by_name("band");
+ let band_not = insts.by_name("band_not");
+ let bitcast = insts.by_name("bitcast");
+ let bitselect = insts.by_name("bitselect");
+ let bor = insts.by_name("bor");
+ let bnot = insts.by_name("bnot");
+ let bxor = insts.by_name("bxor");
+ let extractlane = insts.by_name("extractlane");
+ let fabs = insts.by_name("fabs");
+ let fcmp = insts.by_name("fcmp");
+ let fcvt_from_uint = insts.by_name("fcvt_from_uint");
+ let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
+ let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
+ let fmax = insts.by_name("fmax");
+ let fmin = insts.by_name("fmin");
+ let fneg = insts.by_name("fneg");
+ let iadd_imm = insts.by_name("iadd_imm");
+ let icmp = insts.by_name("icmp");
+ let imax = insts.by_name("imax");
+ let imin = insts.by_name("imin");
+ let imul = insts.by_name("imul");
+ let ineg = insts.by_name("ineg");
+ let insertlane = insts.by_name("insertlane");
+ let ishl = insts.by_name("ishl");
+ let ishl_imm = insts.by_name("ishl_imm");
+ let load_splat = insts.by_name("load_splat");
+ let raw_bitcast = insts.by_name("raw_bitcast");
+ let scalar_to_vector = insts.by_name("scalar_to_vector");
+ let splat = insts.by_name("splat");
+ let shuffle = insts.by_name("shuffle");
+ let sshr = insts.by_name("sshr");
+ let swizzle = insts.by_name("swizzle");
+ let trueif = insts.by_name("trueif");
+ let uadd_sat = insts.by_name("uadd_sat");
+ let umax = insts.by_name("umax");
+ let umin = insts.by_name("umin");
+ let snarrow = insts.by_name("snarrow");
+ let swiden_high = insts.by_name("swiden_high");
+ let swiden_low = insts.by_name("swiden_low");
+ let ushr_imm = insts.by_name("ushr_imm");
+ let ushr = insts.by_name("ushr");
+ let uwiden_high = insts.by_name("uwiden_high");
+ let uwiden_low = insts.by_name("uwiden_low");
+ let vconst = insts.by_name("vconst");
+ let vall_true = insts.by_name("vall_true");
+ let vany_true = insts.by_name("vany_true");
+ let vselect = insts.by_name("vselect");
+
+ let x86_palignr = x86_instructions.by_name("x86_palignr");
+ let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
+ let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
+ let x86_pmins = x86_instructions.by_name("x86_pmins");
+ let x86_pminu = x86_instructions.by_name("x86_pminu");
+ let x86_pshufb = x86_instructions.by_name("x86_pshufb");
+ let x86_pshufd = x86_instructions.by_name("x86_pshufd");
+ let x86_psra = x86_instructions.by_name("x86_psra");
+ let x86_ptest = x86_instructions.by_name("x86_ptest");
+ let x86_punpckh = x86_instructions.by_name("x86_punpckh");
+ let x86_punpckl = x86_instructions.by_name("x86_punpckl");
+
+ let imm = &shared.imm;
+
+ // Set up variables and immediates.
+ let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
+ let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
+ let uimm8_eight = Literal::constant(&imm.uimm8, 8);
+ let u128_zeroes = constant(vec![0x00; 16]);
+ let u128_ones = constant(vec![0xff; 16]);
+ let u128_seventies = constant(vec![0x70; 16]);
+ let a = var("a");
+ let b = var("b");
+ let c = var("c");
+ let d = var("d");
+ let e = var("e");
+ let f = var("f");
+ let g = var("g");
+ let h = var("h");
+ let x = var("x");
+ let y = var("y");
+ let z = var("z");
+
+ // Limit the SIMD vector size: eventually multiple vector sizes may be supported
+ // but for now only SSE-sized vectors are available.
+ let sse_vector_size: u64 = 128;
+ let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
+
+ // SIMD splat: 8-bits
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+ let splat_any8x16 = splat.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(y = splat_any8x16(x)),
+ vec![
+ // Move into the lowest 8 bits of an XMM register.
+ def!(a = scalar_to_vector(x)),
+ // Zero out a different XMM register; the shuffle mask for moving the lowest byte
+ // to all other byte lanes is 0x0.
+ def!(b = vconst(u128_zeroes)),
+ // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b).
+ def!(y = x86_pshufb(a, b)),
+ ],
+ );
+ }
+
+ // SIMD splat: 16-bits
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
+ let splat_x16x8 = splat.bind(vector(ty, sse_vector_size));
+ let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
+ .bind(vector(I32, sse_vector_size))
+ .bind(vector(ty, sse_vector_size));
+ let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
+ .bind(vector(ty, sse_vector_size))
+ .bind(vector(I32, sse_vector_size));
+ narrow.legalize(
+ def!(y = splat_x16x8(x)),
+ vec![
+ // Move into the lowest 16 bits of an XMM register.
+ def!(a = scalar_to_vector(x)),
+ // Insert the value again but in the next lowest 16 bits.
+ def!(b = insertlane(a, x, uimm8_one)),
+ // No instruction emitted; pretend this is an I32x4 so we can use PSHUFD.
+ def!(c = raw_bitcast_any16x8_to_i32x4(b)),
+ // Broadcast the bytes in the XMM register with PSHUFD.
+ def!(d = x86_pshufd(c, uimm8_zero)),
+ // No instruction emitted; pretend this is an X16x8 again.
+ def!(y = raw_bitcast_i32x4_to_any16x8(d)),
+ ],
+ );
+ }
+
+ // SIMD splat: 32-bits
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+ let splat_any32x4 = splat.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(y = splat_any32x4(x)),
+ vec![
+ // Translate to an x86 MOV to get the value in an XMM register.
+ def!(a = scalar_to_vector(x)),
+ // Broadcast the bytes in the XMM register with PSHUFD.
+ def!(y = x86_pshufd(a, uimm8_zero)),
+ ],
+ );
+ }
+
+ // SIMD splat: 64-bits
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
+ let splat_any64x2 = splat.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(y = splat_any64x2(x)),
+ vec![
+ // Move into the lowest 64 bits of an XMM register.
+ def!(a = scalar_to_vector(x)),
+ // Move into the highest 64 bits of the same XMM register.
+ def!(y = insertlane(a, x, uimm8_one)),
+ ],
+ );
+ }
+
+ // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring
+ // mask indexes greater than 15 to have the same semantics as a 0 index. For the spec discussion,
+ // see https://github.com/WebAssembly/simd/issues/93.
+ {
+ let swizzle = swizzle.bind(vector(I8, sse_vector_size));
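+ // Saturating-add 0x70 to every mask byte: in-range indexes (0..15) stay below 0x80, while
+ // any out-of-range index saturates to 0x80 or above, which PSHUFB treats as "write zero
+ // to this lane".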
+ narrow.legalize(
+ def!(a = swizzle(x, y)),
+ vec![
+ def!(b = vconst(u128_seventies)),
+ def!(c = uadd_sat(y, b)),
+ def!(a = x86_pshufb(x, c)),
+ ],
+ );
+ }
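+ // Worked example of the mask trick above: a lane index of 5 becomes 5 + 0x70 = 0x75 (top bit
+ // clear), so PSHUFB selects lane 5; any index of 16 or more saturates into 0x80..=0xff (top bit
+ // set), so PSHUFB zeroes that output lane, matching the Wasm out-of-range semantics.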
+
+ // SIMD bnot
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let bnot = bnot.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(y = bnot(x)),
+ vec![def!(a = vconst(u128_ones)), def!(y = bxor(a, x))],
+ );
+ }
+
+ // SIMD shift right (arithmetic, i16x8 and i32x4)
+ for ty in &[I16, I32] {
+ let sshr = sshr.bind(vector(*ty, sse_vector_size));
+ let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
+ narrow.legalize(
+ def!(a = sshr(x, y)),
+ vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
+ );
+ }
+ // SIMD shift right (arithmetic, i8x16)
+ {
+ let sshr = sshr.bind(vector(I8, sse_vector_size));
+ let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
+ let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
+ let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
+ narrow.legalize(
+ def!(z = sshr(x, y)),
+ vec![
+ // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
+ def!(a = iadd_imm(y, uimm8_eight)),
+ def!(b = bitcast_i64x2(a)),
+ // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
+ def!(c = x86_punpckl(x, x)),
+ def!(d = raw_bitcast_i16x8(c)),
+ def!(e = x86_psra(d, b)),
+ // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
+ def!(f = x86_punpckh(x, x)),
+ def!(g = raw_bitcast_i16x8_again(f)),
+ def!(h = x86_psra(g, b)),
+ // Re-pack the vector.
+ def!(z = snarrow(e, h)),
+ ],
+ );
+ }
+ // SIMD shift right (arithmetic, i64x2)
+ {
+ let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
+ let sshr_scalar_lane0 = sshr.bind(I64);
+ let sshr_scalar_lane1 = sshr.bind(I64);
+ narrow.legalize(
+ def!(z = sshr_vector(x, y)),
+ vec![
+ // Use scalar operations to shift the first lane.
+ def!(a = extractlane(x, uimm8_zero)),
+ def!(b = sshr_scalar_lane0(a, y)),
+ def!(c = insertlane(x, b, uimm8_zero)),
+ // Do the same for the second lane.
+ def!(d = extractlane(x, uimm8_one)),
+ def!(e = sshr_scalar_lane1(d, y)),
+ def!(z = insertlane(c, e, uimm8_one)),
+ ],
+ );
+ }
+
+ // SIMD select
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let bitselect = bitselect.bind(vector(ty, sse_vector_size)); // must bind both x/y and c
+ narrow.legalize(
+ def!(d = bitselect(c, x, y)),
+ vec![
+ def!(a = band(x, c)),
+ def!(b = band_not(y, c)),
+ def!(d = bor(a, b)),
+ ],
+ );
+ }
+
+ // SIMD vselect; replace with bitselect if BLEND* instructions are not available.
+ // This works because each lane of a boolean vector is filled entirely with zeroes or ones.
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let vselect = vselect.bind(vector(ty, sse_vector_size));
+ let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(d = vselect(c, x, y)),
+ vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
+ );
+ }
+
+ // SIMD vany_true
+ let ne = Literal::enumerator_for(&imm.intcc, "ne");
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let vany_true = vany_true.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(y = vany_true(x)),
+ vec![def!(a = x86_ptest(x, x)), def!(y = trueif(ne, a))],
+ );
+ }
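+ // Note that x86_ptest(x, x) sets ZF only when x is entirely zero, so checking the flags with
+ // `ne` answers whether any bit (and therefore any lane) of x is set.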
+
+ // SIMD vall_true
+ let eq = Literal::enumerator_for(&imm.intcc, "eq");
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+ let vall_true = vall_true.bind(vector(ty, sse_vector_size));
+ if ty.is_int() {
+ // In the common case (Wasm's integer-only all_true), we do not require a
+ // bitcast.
+ narrow.legalize(
+ def!(y = vall_true(x)),
+ vec![
+ def!(a = vconst(u128_zeroes)),
+ def!(c = icmp(eq, x, a)),
+ def!(d = x86_ptest(c, c)),
+ def!(y = trueif(eq, d)),
+ ],
+ );
+ } else {
+ // However, to support other types we must bitcast them to an integer vector to
+ // use icmp.
+ let lane_type_as_int = LaneType::int_from_bits(ty.lane_bits() as u16);
+ let raw_bitcast_to_int = raw_bitcast.bind(vector(lane_type_as_int, sse_vector_size));
+ narrow.legalize(
+ def!(y = vall_true(x)),
+ vec![
+ def!(a = vconst(u128_zeroes)),
+ def!(b = raw_bitcast_to_int(x)),
+ def!(c = icmp(eq, b, a)),
+ def!(d = x86_ptest(c, c)),
+ def!(y = trueif(eq, d)),
+ ],
+ );
+ }
+ }
+
+ // SIMD icmp ne
+ let ne = Literal::enumerator_for(&imm.intcc, "ne");
+ for ty in ValueType::all_lane_types().filter(|ty| allowed_simd_type(ty) && ty.is_int()) {
+ let icmp_ = icmp.bind(vector(ty, sse_vector_size));
+ narrow.legalize(
+ def!(c = icmp_(ne, a, b)),
+ vec![def!(x = icmp(eq, a, b)), def!(c = bnot(x))],
+ );
+ }
+
+ // SIMD icmp greater-/less-than
+ let sgt = Literal::enumerator_for(&imm.intcc, "sgt");
+ let ugt = Literal::enumerator_for(&imm.intcc, "ugt");
+ let sge = Literal::enumerator_for(&imm.intcc, "sge");
+ let uge = Literal::enumerator_for(&imm.intcc, "uge");
+ let slt = Literal::enumerator_for(&imm.intcc, "slt");
+ let ult = Literal::enumerator_for(&imm.intcc, "ult");
+ let sle = Literal::enumerator_for(&imm.intcc, "sle");
+ let ule = Literal::enumerator_for(&imm.intcc, "ule");
+ for ty in &[I8, I16, I32] {
+ // greater-than
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(c = icmp_(ugt, a, b)),
+ vec![
+ def!(x = x86_pmaxu(a, b)),
+ def!(y = icmp(eq, x, b)),
+ def!(c = bnot(y)),
+ ],
+ );
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(c = icmp_(sge, a, b)),
+ vec![def!(x = x86_pmins(a, b)), def!(c = icmp(eq, x, b))],
+ );
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(c = icmp_(uge, a, b)),
+ vec![def!(x = x86_pminu(a, b)), def!(c = icmp(eq, x, b))],
+ );
+
+ // less-than
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = icmp_(slt, a, b)), vec![def!(c = icmp(sgt, b, a))]);
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = icmp_(ult, a, b)), vec![def!(c = icmp(ugt, b, a))]);
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = icmp_(sle, a, b)), vec![def!(c = icmp(sge, b, a))]);
+ let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = icmp_(ule, a, b)), vec![def!(c = icmp(uge, b, a))]);
+ }
+
+ // SIMD integer min/max
+ for ty in &[I8, I16, I32] {
+ let imin = imin.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = imin(a, b)), vec![def!(c = x86_pmins(a, b))]);
+ let umin = umin.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = umin(a, b)), vec![def!(c = x86_pminu(a, b))]);
+ let imax = imax.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = imax(a, b)), vec![def!(c = x86_pmaxs(a, b))]);
+ let umax = umax.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = umax(a, b)), vec![def!(c = x86_pmaxu(a, b))]);
+ }
+
+ // SIMD fcmp greater-/less-than
+ let gt = Literal::enumerator_for(&imm.floatcc, "gt");
+ let lt = Literal::enumerator_for(&imm.floatcc, "lt");
+ let ge = Literal::enumerator_for(&imm.floatcc, "ge");
+ let le = Literal::enumerator_for(&imm.floatcc, "le");
+ let ugt = Literal::enumerator_for(&imm.floatcc, "ugt");
+ let ult = Literal::enumerator_for(&imm.floatcc, "ult");
+ let uge = Literal::enumerator_for(&imm.floatcc, "uge");
+ let ule = Literal::enumerator_for(&imm.floatcc, "ule");
+ for ty in &[F32, F64] {
+ let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = fcmp_(gt, a, b)), vec![def!(c = fcmp(lt, b, a))]);
+ let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = fcmp_(ge, a, b)), vec![def!(c = fcmp(le, b, a))]);
+ let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = fcmp_(ult, a, b)), vec![def!(c = fcmp(ugt, b, a))]);
+ let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(def!(c = fcmp_(ule, a, b)), vec![def!(c = fcmp(uge, b, a))]);
+ }
+
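+ // SIMD fneg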
+ for ty in &[F32, F64] {
+ let fneg = fneg.bind(vector(*ty, sse_vector_size));
+ let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
+ let uimm8_shift = Literal::constant(&imm.uimm8, lane_type_as_int.lane_bits() as i64 - 1);
+ let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
+ let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(b = fneg(a)),
+ vec![
+ def!(c = vconst(u128_ones)),
+ def!(d = ishl_imm(c, uimm8_shift)), // Create a mask of all 0s except the MSB.
+ def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
+ def!(b = bxor(a, e)), // Flip the MSB.
+ ],
+ );
+ }
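+ // For F32 lanes, for example, the mask built above is ishl_imm(0xffffffff, 31) = 0x80000000,
+ // i.e. only the sign bit, so the bxor flips exactly the sign of each lane.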
+
+ // SIMD fabs
+ for ty in &[F32, F64] {
+ let fabs = fabs.bind(vector(*ty, sse_vector_size));
+ let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
+ let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
+ let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(b = fabs(a)),
+ vec![
+ def!(c = vconst(u128_ones)),
+ def!(d = ushr_imm(c, uimm8_one)), // Create a mask of all 1s except the MSB.
+ def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
+ def!(b = band(a, e)), // Unset the MSB.
+ ],
+ );
+ }
+
+ // SIMD widen
+ for ty in &[I8, I16] {
+ let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(b = swiden_high(a)),
+ vec![
+ def!(c = x86_palignr(a, a, uimm8_eight)),
+ def!(b = swiden_low(c)),
+ ],
+ );
+ let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
+ narrow.legalize(
+ def!(b = uwiden_high(a)),
+ vec![
+ def!(c = x86_palignr(a, a, uimm8_eight)),
+ def!(b = uwiden_low(c)),
+ ],
+ );
+ }
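+ // In both rules above, x86_palignr(a, a, uimm8_eight) shifts the upper 64 bits of `a` down into
+ // the lower half, so the existing swiden_low/uwiden_low legalizations can widen the high lanes.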
+
+ narrow.custom_legalize(shuffle, "convert_shuffle");
+ narrow.custom_legalize(extractlane, "convert_extractlane");
+ narrow.custom_legalize(insertlane, "convert_insertlane");
+ narrow.custom_legalize(ineg, "convert_ineg");
+ narrow.custom_legalize(ushr, "convert_ushr");
+ narrow.custom_legalize(ishl, "convert_ishl");
+ narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
+ narrow.custom_legalize(fmin, "expand_minmax_vector");
+ narrow.custom_legalize(fmax, "expand_minmax_vector");
+ narrow.custom_legalize(load_splat, "expand_load_splat");
+
+ narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
+ narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
+ narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs
new file mode 100644
index 0000000000..a272e83900
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs
@@ -0,0 +1,88 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::types::{ReferenceType, VectorType};
+
+use crate::shared::types::Bool::B1;
+use crate::shared::types::Float::{F32, F64};
+use crate::shared::types::Int::{I16, I32, I64, I8};
+use crate::shared::types::Reference::{R32, R64};
+use crate::shared::Definitions as SharedDefinitions;
+
+mod encodings;
+mod instructions;
+mod legalize;
+mod opcodes;
+mod recipes;
+mod registers;
+pub(crate) mod settings;
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+ let settings = settings::define(&shared_defs.settings);
+ let regs = registers::define();
+
+ let inst_group = instructions::define(
+ &mut shared_defs.all_instructions,
+ &shared_defs.formats,
+ &shared_defs.imm,
+ &shared_defs.entities,
+ );
+ legalize::define(shared_defs, &inst_group);
+
+ // CPU modes for 32-bit and 64-bit operations.
+ let mut x86_64 = CpuMode::new("I64");
+ let mut x86_32 = CpuMode::new("I32");
+
+ let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
+ let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
+ let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
+ let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx");
+ let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
+
+ x86_32.legalize_monomorphic(expand_flags);
+ x86_32.legalize_default(x86_narrow);
+ x86_32.legalize_type(B1, expand_flags);
+ x86_32.legalize_type(I8, x86_widen);
+ x86_32.legalize_type(I16, x86_widen);
+ x86_32.legalize_type(I32, x86_expand);
+ x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
+ x86_32.legalize_type(F32, x86_expand);
+ x86_32.legalize_type(F64, x86_expand);
+ x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
+ x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
+ x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
+
+ x86_64.legalize_monomorphic(expand_flags);
+ x86_64.legalize_default(x86_narrow);
+ x86_64.legalize_type(B1, expand_flags);
+ x86_64.legalize_type(I8, x86_widen);
+ x86_64.legalize_type(I16, x86_widen);
+ x86_64.legalize_type(I32, x86_expand);
+ x86_64.legalize_type(I64, x86_expand);
+ x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
+ x86_64.legalize_type(F32, x86_expand);
+ x86_64.legalize_type(F64, x86_expand);
+ x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
+ x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
+ x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
+
+ let recipes = recipes::define(shared_defs, &settings, &regs);
+
+ let encodings = encodings::define(shared_defs, &settings, &inst_group, &recipes);
+ x86_32.set_encodings(encodings.enc32);
+ x86_64.set_encodings(encodings.enc64);
+ let encodings_predicates = encodings.inst_pred_reg.extract();
+
+ let recipes = encodings.recipes;
+
+ let cpu_modes = vec![x86_64, x86_32];
+
+ TargetIsa::new(
+ "x86",
+ inst_group,
+ settings,
+ regs,
+ recipes,
+ cpu_modes,
+ encodings_predicates,
+ )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs
new file mode 100644
index 0000000000..09c07c458f
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs
@@ -0,0 +1,721 @@
+//! Static, named definitions of instruction opcodes.
+
+/// Empty opcode for use as a default.
+pub static EMPTY: [u8; 0] = [];
+
+/// Add with carry flag r{16,32,64} to r/m of the same size.
+pub static ADC: [u8; 1] = [0x11];
+
+/// Add r{16,32,64} to r/m of the same size.
+pub static ADD: [u8; 1] = [0x01];
+
+/// Add imm{16,32} to r/m{16,32,64}, possibly sign-extended.
+pub static ADD_IMM: [u8; 1] = [0x81];
+
+/// Add sign-extended imm8 to r/m{16,32,64}.
+pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
+
+/// Add packed double-precision floating-point values from xmm2/mem to xmm1 and store result in
+/// xmm1 (SSE2).
+pub static ADDPD: [u8; 3] = [0x66, 0x0f, 0x58];
+
+/// Add packed single-precision floating-point values from xmm2/mem to xmm1 and store result in
+/// xmm1 (SSE).
+pub static ADDPS: [u8; 2] = [0x0f, 0x58];
+
+/// Add the low double-precision floating-point value from xmm2/mem to xmm1
+/// and store the result in xmm1.
+pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58];
+
+/// Add the low single-precision floating-point value from xmm2/mem to xmm1
+/// and store the result in xmm1.
+pub static ADDSS: [u8; 3] = [0xf3, 0x0f, 0x58];
+
+/// r/m{16,32,64} AND register of the same size (Intel docs have a typo).
+pub static AND: [u8; 1] = [0x21];
+
+/// imm{16,32} AND r/m{16,32,64}, possibly sign-extended.
+pub static AND_IMM: [u8; 1] = [0x81];
+
+/// r/m{16,32,64} AND sign-extended imm8.
+pub static AND_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
+
+/// Return the bitwise logical AND NOT of packed single-precision floating-point
+/// values in xmm1 and xmm2/mem.
+pub static ANDNPS: [u8; 2] = [0x0f, 0x55];
+
+/// Return the bitwise logical AND of packed single-precision floating-point values
+/// in xmm1 and xmm2/mem.
+pub static ANDPS: [u8; 2] = [0x0f, 0x54];
+
+/// Bit scan forward (stores index of first encountered 1 from the front).
+pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
+
+/// Bit scan reverse (stores index of first encountered 1 from the back).
+pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
+
+/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
+/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
+
+/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
+/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
+
+/// Call near, relative, displacement relative to next instruction (sign-extended).
+pub static CALL_RELATIVE: [u8; 1] = [0xe8];
+
+/// Move r/m{16,32,64} if overflow (OF=1).
+pub static CMOV_OVERFLOW: [u8; 2] = [0x0f, 0x40];
+
+/// Compare imm{16,32} with r/m{16,32,64} (sign-extended if 64).
+pub static CMP_IMM: [u8; 1] = [0x81];
+
+/// Compare imm8 with r/m{16,32,64}.
+pub static CMP_IMM8: [u8; 1] = [0x83];
+
+/// Compare r{16,32,64} with r/m of the same size.
+pub static CMP_REG: [u8; 1] = [0x39];
+
+/// Compare packed double-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of
+/// imm8 as comparison predicate (SSE2).
+pub static CMPPD: [u8; 3] = [0x66, 0x0f, 0xc2];
+
+/// Compare packed single-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of
+/// imm8 as comparison predicate (SSE).
+pub static CMPPS: [u8; 2] = [0x0f, 0xc2];
+
+/// Convert four packed signed doubleword integers from xmm2/mem to four packed single-precision
+/// floating-point values in xmm1 (SSE2).
+pub static CVTDQ2PS: [u8; 2] = [0x0f, 0x5b];
+
+/// Convert scalar double-precision floating-point value to scalar single-precision
+/// floating-point value.
+pub static CVTSD2SS: [u8; 3] = [0xf2, 0x0f, 0x5a];
+
+/// Convert doubleword integer to scalar double-precision floating-point value.
+pub static CVTSI2SD: [u8; 3] = [0xf2, 0x0f, 0x2a];
+
+/// Convert doubleword integer to scalar single-precision floating-point value.
+pub static CVTSI2SS: [u8; 3] = [0xf3, 0x0f, 0x2a];
+
+/// Convert scalar single-precision floating-point value to scalar double-precision
+/// float-point value.
+pub static CVTSS2SD: [u8; 3] = [0xf3, 0x0f, 0x5a];
+
+/// Convert four packed single-precision floating-point values from xmm2/mem to four packed signed
+/// doubleword values in xmm1 using truncation (SSE2).
+pub static CVTTPS2DQ: [u8; 3] = [0xf3, 0x0f, 0x5b];
+
+/// Convert with truncation scalar double-precision floating-point value to signed
+/// integer.
+pub static CVTTSD2SI: [u8; 3] = [0xf2, 0x0f, 0x2c];
+
+/// Convert with truncation scalar single-precision floating-point value to integer.
+pub static CVTTSS2SI: [u8; 3] = [0xf3, 0x0f, 0x2c];
+
+/// Unsigned divide for {16,32,64}-bit.
+pub static DIV: [u8; 1] = [0xf7];
+
+/// Divide packed double-precision floating-point values in xmm1 by packed double-precision
+/// floating-point values in xmm2/mem (SSE2).
+pub static DIVPD: [u8; 3] = [0x66, 0x0f, 0x5e];
+
+/// Divide packed single-precision floating-point values in xmm1 by packed single-precision
+/// floating-point values in xmm2/mem (SSE).
+pub static DIVPS: [u8; 2] = [0x0f, 0x5e];
+
+/// Divide low double-precision floating-point value in xmm1 by low double-precision
+/// floating-point value in xmm2/m64.
+pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e];
+
+/// Divide low single-precision floating-point value in xmm1 by low single-precision
+/// floating-point value in xmm2/m32.
+pub static DIVSS: [u8; 3] = [0xf3, 0x0f, 0x5e];
+
+/// Signed divide for {16,32,64}-bit.
+pub static IDIV: [u8; 1] = [0xf7];
+
+/// Signed multiply for {16,32,64}-bit, generic registers.
+pub static IMUL: [u8; 2] = [0x0f, 0xaf];
+
+/// Signed multiply for {16,32,64}-bit, storing into RDX:RAX.
+pub static IMUL_RDX_RAX: [u8; 1] = [0xf7];
+
+/// Insert scalar single-precision floating-point value.
+pub static INSERTPS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x21];
+
+/// Either:
+/// 1. Jump near, absolute indirect, RIP = 64-bit offset from register or memory.
+/// 2. Jump far, absolute indirect, address given in m16:64.
+pub static JUMP_ABSOLUTE: [u8; 1] = [0xff];
+
+/// Jump near, relative, RIP = RIP + 32-bit displacement sign extended to 64 bits.
+pub static JUMP_NEAR_RELATIVE: [u8; 1] = [0xe9];
+
+/// Jump near (rel32) if overflow (OF=1).
+pub static JUMP_NEAR_IF_OVERFLOW: [u8; 2] = [0x0f, 0x80];
+
+/// Jump short, relative, RIP = RIP + 8-bit displacement sign extended to 64 bits.
+pub static JUMP_SHORT: [u8; 1] = [0xeb];
+
+/// Jump short (rel8) if equal (ZF=1).
+pub static JUMP_SHORT_IF_EQUAL: [u8; 1] = [0x74];
+
+/// Jump short (rel8) if not equal (ZF=0).
+pub static JUMP_SHORT_IF_NOT_EQUAL: [u8; 1] = [0x75];
+
+/// Jump short (rel8) if overflow (OF=1).
+pub static JUMP_SHORT_IF_OVERFLOW: [u8; 1] = [0x70];
+
+/// Store effective address for m in register r{16,32,64}.
+pub static LEA: [u8; 1] = [0x8d];
+
+/// Count the number of leading zero bits.
+pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd];
+
+/// Return the maximum packed double-precision floating-point values between xmm1 and xmm2/m128
+/// (SSE2).
+pub static MAXPD: [u8; 3] = [0x66, 0x0f, 0x5f];
+
+/// Return the maximum packed single-precision floating-point values between xmm1 and xmm2/m128
+/// (SSE).
+pub static MAXPS: [u8; 2] = [0x0f, 0x5f];
+
+/// Return the maximum scalar double-precision floating-point value between
+/// xmm2/m64 and xmm1.
+pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f];
+
+/// Return the maximum scalar single-precision floating-point value between
+/// xmm2/m32 and xmm1.
+pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f];
+
+/// Return the minimum packed double-precision floating-point values between xmm1 and xmm2/m128
+/// (SSE2).
+pub static MINPD: [u8; 3] = [0x66, 0x0f, 0x5d];
+
+/// Return the minimum packed single-precision floating-point values between xmm1 and xmm2/m128
+/// (SSE).
+pub static MINPS: [u8; 2] = [0x0f, 0x5d];
+
+/// Return the minimum scalar double-precision floating-point value between
+/// xmm2/m64 and xmm1.
+pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d];
+
+/// Return the minimum scalar single-precision floating-point value between
+/// xmm2/m32 and xmm1.
+pub static MINSS: [u8; 3] = [0xf3, 0x0f, 0x5d];
+
+/// Move r8 to r/m8.
+pub static MOV_BYTE_STORE: [u8; 1] = [0x88];
+
+/// Move imm{16,32,64} to same-sized register.
+pub static MOV_IMM: [u8; 1] = [0xb8];
+
+/// Move imm{16,32} to r{16,32,64}, sign-extended if 64-bit target.
+pub static MOV_IMM_SIGNEXTEND: [u8; 1] = [0xc7];
+
+/// Move {r/m16, r/m32, r/m64} to same-sized register.
+pub static MOV_LOAD: [u8; 1] = [0x8b];
+
+/// Move r16 to r/m16.
+pub static MOV_STORE_16: [u8; 2] = [0x66, 0x89];
+
+/// Move {r16, r32, r64} to same-sized register or memory.
+pub static MOV_STORE: [u8; 1] = [0x89];
+
+/// Move aligned packed single-precision floating-point values from x/m to xmm (SSE).
+pub static MOVAPS_LOAD: [u8; 2] = [0x0f, 0x28];
+
+/// Move doubleword from r/m32 to xmm (SSE2). Quadword with REX prefix.
+pub static MOVD_LOAD_XMM: [u8; 3] = [0x66, 0x0f, 0x6e];
+
+/// Move doubleword from xmm to r/m32 (SSE2). Quadword with REX prefix.
+pub static MOVD_STORE_XMM: [u8; 3] = [0x66, 0x0f, 0x7e];
+
+/// Move packed single-precision floating-point values low to high (SSE).
+pub static MOVLHPS: [u8; 2] = [0x0f, 0x16];
+
+/// Move scalar double-precision floating-point value (from reg/mem to reg).
+pub static MOVSD_LOAD: [u8; 3] = [0xf2, 0x0f, 0x10];
+
+/// Move scalar double-precision floating-point value (from reg to reg/mem).
+pub static MOVSD_STORE: [u8; 3] = [0xf2, 0x0f, 0x11];
+
+/// Move scalar single-precision floating-point value (from reg to reg/mem).
+pub static MOVSS_STORE: [u8; 3] = [0xf3, 0x0f, 0x11];
+
+/// Move scalar single-precision floating-point-value (from reg/mem to reg).
+pub static MOVSS_LOAD: [u8; 3] = [0xf3, 0x0f, 0x10];
+
+/// Move byte to register with sign-extension.
+pub static MOVSX_BYTE: [u8; 2] = [0x0f, 0xbe];
+
+/// Move word to register with sign-extension.
+pub static MOVSX_WORD: [u8; 2] = [0x0f, 0xbf];
+
+/// Move doubleword to register with sign-extension.
+pub static MOVSXD: [u8; 1] = [0x63];
+
+/// Move unaligned packed single-precision floating-point from x/m to xmm (SSE).
+pub static MOVUPS_LOAD: [u8; 2] = [0x0f, 0x10];
+
+/// Move unaligned packed single-precision floating-point value from xmm to x/m (SSE).
+pub static MOVUPS_STORE: [u8; 2] = [0x0f, 0x11];
+
+/// Move byte to register with zero-extension.
+pub static MOVZX_BYTE: [u8; 2] = [0x0f, 0xb6];
+
+/// Move word to register with zero-extension.
+pub static MOVZX_WORD: [u8; 2] = [0x0f, 0xb7];
+
+/// Unsigned multiply for {16,32,64}-bit.
+pub static MUL: [u8; 1] = [0xf7];
+
+/// Multiply packed double-precision floating-point values from xmm2/mem to xmm1 and store result
+/// in xmm1 (SSE2).
+pub static MULPD: [u8; 3] = [0x66, 0x0f, 0x59];
+
+/// Multiply packed single-precision floating-point values from xmm2/mem to xmm1 and store result
+/// in xmm1 (SSE).
+pub static MULPS: [u8; 2] = [0x0f, 0x59];
+
+/// Multiply the low double-precision floating-point value in xmm2/m64 by the
+/// low double-precision floating-point value in xmm1.
+pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59];
+
+/// Multiply the low single-precision floating-point value in xmm2/m32 by the
+/// low single-precision floating-point value in xmm1.
+pub static MULSS: [u8; 3] = [0xf3, 0x0f, 0x59];
+
+/// Reverse each bit of r/m{16,32,64}.
+pub static NOT: [u8; 1] = [0xf7];
+
+/// r{16,32,64} OR register of same size.
+pub static OR: [u8; 1] = [0x09];
+
+/// imm{16,32} OR r/m{16,32,64}, possibly sign-extended.
+pub static OR_IMM: [u8; 1] = [0x81];
+
+/// r/m{16,32,64} OR sign-extended imm8.
+pub static OR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
+
+/// Return the bitwise logical OR of packed single-precision values in xmm and x/m (SSE).
+pub static ORPS: [u8; 2] = [0x0f, 0x56];
+
+/// Compute the absolute value of bytes in xmm2/m128 and store the unsigned result in xmm1 (SSSE3).
+pub static PABSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x1c];
+
+/// Compute the absolute value of 32-bit integers in xmm2/m128 and store the unsigned result in
+/// xmm1 (SSSE3).
+pub static PABSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x1e];
+
+/// Compute the absolute value of 16-bit integers in xmm2/m128 and store the unsigned result in
+/// xmm1 (SSSE3).
+pub static PABSW: [u8; 4] = [0x66, 0x0f, 0x38, 0x1d];
+
+/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte
+/// integers in xmm1 using signed saturation (SSE2).
+pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63];
+
+/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed signed
+/// word integers in xmm1 using signed saturation (SSE2).
+pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b];
+
+/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed unsigned byte
+/// integers in xmm1 using unsigned saturation (SSE2).
+pub static PACKUSWB: [u8; 3] = [0x66, 0x0f, 0x67];
+
+/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 unpacked signed
+/// word integers in xmm1 using unsigned saturation (SSE4.1).
+pub static PACKUSDW: [u8; 4] = [0x66, 0x0f, 0x38, 0x2b];
+
+/// Add packed byte integers from xmm2/m128 and xmm1 (SSE2).
+pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc];
+
+/// Add packed doubleword integers from xmm2/m128 and xmm1 (SSE2).
+pub static PADDD: [u8; 3] = [0x66, 0x0f, 0xfe];
+
+/// Add packed quadword integers from xmm2/m128 and xmm1 (SSE2).
+pub static PADDQ: [u8; 3] = [0x66, 0x0f, 0xd4];
+
+/// Add packed word integers from xmm2/m128 and xmm1 (SSE2).
+pub static PADDW: [u8; 3] = [0x66, 0x0f, 0xfd];
+
+/// Add packed signed byte integers from xmm2/m128 and xmm1, saturating the results (SSE2).
+pub static PADDSB: [u8; 3] = [0x66, 0x0f, 0xec];
+
+/// Add packed signed word integers from xmm2/m128 and xmm1, saturating the results (SSE2).
+pub static PADDSW: [u8; 3] = [0x66, 0x0f, 0xed];
+
+/// Add packed unsigned byte integers from xmm2/m128 and xmm1, saturating the results (SSE2).
+pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];
+
+/// Add packed unsigned word integers from xmm2/m128 and xmm1, saturating the results (SSE2).
+pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];
+
+/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
+/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
+pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
+
+/// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
+pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];
+
+/// Bitwise AND NOT of xmm2/m128 and xmm1 (SSE2).
+pub static PANDN: [u8; 3] = [0x66, 0x0f, 0xdf];
+
+/// Average packed unsigned byte integers from xmm2/m128 and xmm1 with rounding (SSE2).
+pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
+
+/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
+pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
+
+/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
+/// in XMM0 and store the values into xmm1 (SSE4.1).
+pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
+
+/// Select words from xmm1 and xmm2/m128 from mask specified in imm8 and store the values into xmm1
+/// (SSE4.1).
+pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e];
+
+/// Compare packed data for equal (SSE2).
+pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
+
+/// Compare packed data for equal (SSE2).
+pub static PCMPEQD: [u8; 3] = [0x66, 0x0f, 0x76];
+
+/// Compare packed data for equal (SSE4.1).
+pub static PCMPEQQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x29];
+
+/// Compare packed data for equal (SSE2).
+pub static PCMPEQW: [u8; 3] = [0x66, 0x0f, 0x75];
+
+/// Compare packed signed byte integers for greater than (SSE2).
+pub static PCMPGTB: [u8; 3] = [0x66, 0x0f, 0x64];
+
+/// Compare packed signed doubleword integers for greater than (SSE2).
+pub static PCMPGTD: [u8; 3] = [0x66, 0x0f, 0x66];
+
+/// Compare packed signed quadword integers for greater than (SSE4.2).
+pub static PCMPGTQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x37];
+
+/// Compare packed signed word integers for greater than (SSE2).
+pub static PCMPGTW: [u8; 3] = [0x66, 0x0f, 0x65];
+
+/// Extract doubleword or quadword, depending on REX.W (SSE4.1).
+pub static PEXTR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x16];
+
+/// Extract byte (SSE4.1).
+pub static PEXTRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x14];
+
+/// Extract word (SSE4.1). There is a 3-byte SSE2 variant that can also move to m/16.
+pub static PEXTRW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x15];
+
+/// Insert doubleword or quadword, depending on REX.W (SSE4.1).
+pub static PINSR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x22];
+
+/// Insert byte (SSE4.1).
+pub static PINSRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x20];
+
+/// Insert word (SSE2).
+pub static PINSRW: [u8; 3] = [0x66, 0x0f, 0xc4];
+
+/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed maximum values in
+/// xmm1 (SSE4.1).
+pub static PMAXSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x3c];
+
+/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed maximum
+/// values in xmm1 (SSE4.1).
+pub static PMAXSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3d];
+
+/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed maximum values in
+/// xmm1 (SSE2).
+pub static PMAXSW: [u8; 3] = [0x66, 0x0f, 0xee];
+
+/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed maximum values in
+/// xmm1 (SSE2).
+pub static PMAXUB: [u8; 3] = [0x66, 0x0f, 0xde];
+
+/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed maximum
+/// values in xmm1 (SSE4.1).
+pub static PMAXUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3f];
+
+/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed maximum values in
+/// xmm1 (SSE4.1).
+pub static PMAXUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3e];
+
+/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed minimum values in
+/// xmm1 (SSE4.1).
+pub static PMINSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x38];
+
+/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed minimum
+/// values in xmm1 (SSE4.1).
+pub static PMINSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x39];
+
+/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed minimum values in
+/// xmm1 (SSE2).
+pub static PMINSW: [u8; 3] = [0x66, 0x0f, 0xea];
+
+/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed minimum values in
+/// xmm1 (SSE2).
+pub static PMINUB: [u8; 3] = [0x66, 0x0f, 0xda];
+
+/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed minimum
+/// values in xmm1 (SSE4.1).
+pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b];
+
+/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed minimum values in
+/// xmm1 (SSE4.1).
+pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a];
+
+/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
+
+/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
+
+/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
+
+/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
+
+/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
+
+/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
+
+/// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
+/// the results in xmm1 (SSE2).
+pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5];
+
+/// Multiply the packed doubleword signed integers in xmm1 and xmm2/m128 and store the low 32
+/// bits of each product in xmm1 (SSE4.1).
+pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
+
+/// Multiply the packed quadword signed integers in xmm2 and xmm3/m128 and store the low 64
+/// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding.
+pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
+
+/// Multiply packed unsigned doubleword integers in xmm1 by packed unsigned doubleword integers
+/// in xmm2/m128, and store the quadword results in xmm1 (SSE2).
+pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
+
+/// Pop top of stack into r{16,32,64}; increment stack pointer.
+pub static POP_REG: [u8; 1] = [0x58];
+
+/// Count the number of bits set to 1.
+pub static POPCNT: [u8; 3] = [0xf3, 0x0f, 0xb8];
+
+/// Bitwise OR of xmm2/m128 and xmm1 (SSE2).
+pub static POR: [u8; 3] = [0x66, 0x0f, 0xeb];
+
+/// Shuffle bytes in xmm1 according to contents of xmm2/m128 (SSSE3).
+pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];
+
+/// Shuffle the doublewords in xmm2/m128 based on the encoding in imm8 and
+/// store the result in xmm1 (SSE2).
+pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
+
+/// Shift words in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
+/// digit used in the ModR/M byte (SSE2).
+pub static PS_W_IMM: [u8; 3] = [0x66, 0x0f, 0x71];
+
+/// Shift doublewords in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
+/// digit used in the ModR/M byte (SSE2).
+pub static PS_D_IMM: [u8; 3] = [0x66, 0x0f, 0x72];
+
+/// Shift quadwords in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
+/// digit used in the ModR/M byte (SSE2).
+pub static PS_Q_IMM: [u8; 3] = [0x66, 0x0f, 0x73];
+
+/// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1];
+
+/// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
+
+/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
+
+/// Shift words in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLW: [u8; 3] = [0x66, 0x0f, 0xd1];
+
+/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLD: [u8; 3] = [0x66, 0x0f, 0xd2];
+
+/// Shift quadwords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLQ: [u8; 3] = [0x66, 0x0f, 0xd3];
+
+/// Shift words in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
+pub static PSRAW: [u8; 3] = [0x66, 0x0f, 0xe1];
+
+/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
+pub static PSRAD: [u8; 3] = [0x66, 0x0f, 0xe2];
+
+/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
+pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
+
+/// Subtract packed word integers in xmm2/m128 from packed word integers in xmm1 (SSE2).
+pub static PSUBW: [u8; 3] = [0x66, 0x0f, 0xf9];
+
+/// Subtract packed doubleword integers in xmm2/m128 from packed doubleword integers in xmm1 (SSE2).
+pub static PSUBD: [u8; 3] = [0x66, 0x0f, 0xfa];
+
+/// Subtract packed quadword integers in xmm2/m128 from xmm1 (SSE2).
+pub static PSUBQ: [u8; 3] = [0x66, 0x0f, 0xfb];
+
+/// Subtract packed signed byte integers in xmm2/m128 from packed signed byte integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBSB: [u8; 3] = [0x66, 0x0f, 0xe8];
+
+/// Subtract packed signed word integers in xmm2/m128 from packed signed word integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBSW: [u8; 3] = [0x66, 0x0f, 0xe9];
+
+/// Subtract packed unsigned byte integers in xmm2/m128 from packed unsigned byte integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBUSB: [u8; 3] = [0x66, 0x0f, 0xd8];
+
+/// Subtract packed unsigned word integers in xmm2/m128 from packed unsigned word integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9];
+
+/// Set ZF if xmm2/m128 AND xmm1 result is all 0s; set CF if xmm2/m128 AND NOT xmm1 result is all
+/// 0s (SSE4.1).
+pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17];
+
+/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];
+
+/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69];
+
+/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6A];
+
+/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6D];
+
+/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60];
+
+/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61];
+
+/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62];
+
+/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6C];
+
+/// Push r{16,32,64}.
+pub static PUSH_REG: [u8; 1] = [0x50];
+
+/// Logical exclusive OR (SSE2).
+pub static PXOR: [u8; 3] = [0x66, 0x0f, 0xef];
+
+/// Near return to calling procedure.
+pub static RET_NEAR: [u8; 1] = [0xc3];
+
+/// General rotation opcode. Kind of rotation depends on encoding.
+pub static ROTATE_CL: [u8; 1] = [0xd3];
+
+/// General rotation opcode. Kind of rotation depends on encoding.
+pub static ROTATE_IMM8: [u8; 1] = [0xc1];
+
+/// Round scalar double-precision floating-point values.
+pub static ROUNDSD: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0b];
+
+/// Round scalar single-precision floating-point values.
+pub static ROUNDSS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0a];
+
+/// Subtract with borrow r{16,32,64} from r/m of the same size.
+pub static SBB: [u8; 1] = [0x19];
+
+/// Set byte if overflow (OF=1).
+pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90];
+
+/// Compute the square root of the packed double-precision floating-point values and store the
+/// result in xmm1 (SSE2).
+pub static SQRTPD: [u8; 3] = [0x66, 0x0f, 0x51];
+
+/// Compute the square root of the packed double-precision floating-point values and store the
+/// result in xmm1 (SSE).
+pub static SQRTPS: [u8; 2] = [0x0f, 0x51];
+
+/// Compute square root of scalar double-precision floating-point value.
+pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51];
+
+/// Compute square root of scalar single-precision value.
+pub static SQRTSS: [u8; 3] = [0xf3, 0x0f, 0x51];
+
+/// Subtract r{16,32,64} from r/m of same size.
+pub static SUB: [u8; 1] = [0x29];
+
+/// Subtract packed double-precision floating-point values in xmm2/mem from xmm1 and store result
+/// in xmm1 (SSE2).
+pub static SUBPD: [u8; 3] = [0x66, 0x0f, 0x5c];
+
+/// Subtract packed single-precision floating-point values in xmm2/mem from xmm1 and store result
+/// in xmm1 (SSE).
+pub static SUBPS: [u8; 2] = [0x0f, 0x5c];
+
+/// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1
+/// and store the result in xmm1.
+pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c];
+
+/// Subtract the low single-precision floating-point value in xmm2/m32 from xmm1
+/// and store the result in xmm1.
+pub static SUBSS: [u8; 3] = [0xf3, 0x0f, 0x5c];
+
+/// AND r8 with r/m8; set SF, ZF, PF according to result.
+pub static TEST_BYTE_REG: [u8; 1] = [0x84];
+
+/// AND {r16, r32, r64} with r/m of the same size; set SF, ZF, PF according to result.
+pub static TEST_REG: [u8; 1] = [0x85];
+
+/// Count the number of trailing zero bits.
+pub static TZCNT: [u8; 3] = [0xf3, 0x0f, 0xbc];
+
+/// Compare low double-precision floating-point values in xmm1 and xmm2/mem64
+/// and set the EFLAGS flags accordingly.
+pub static UCOMISD: [u8; 3] = [0x66, 0x0f, 0x2e];
+
+/// Compare low single-precision floating-point values in xmm1 and xmm2/mem32
+/// and set the EFLAGS flags accordingly.
+pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
+
+/// Raise invalid opcode instruction.
+pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];
+
+/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
+/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
+/// is controlled by MXCSR but can be overridden by EVEX.L'L in static rounding mode
+/// (AVX512VL, AVX512F).
+pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a];
+
+/// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended.
+pub static XOR_IMM: [u8; 1] = [0x81];
+
+/// r/m{16,32,64} XOR sign-extended imm8.
+pub static XOR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
+
+/// r/m{16,32,64} XOR register of the same size.
+pub static XOR: [u8; 1] = [0x31];
+
+/// r/m8 XOR r8.
+pub static XORB: [u8; 1] = [0x30];
+
+/// Bitwise logical XOR of packed double-precision floating-point values.
+pub static XORPD: [u8; 3] = [0x66, 0x0f, 0x57];
+
+/// Bitwise logical XOR of packed single-precision floating-point values.
+pub static XORPS: [u8; 2] = [0x0f, 0x57];
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs
new file mode 100644
index 0000000000..f45f8dc673
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs
@@ -0,0 +1,3445 @@
+//! Encoding recipes for x86/x86_64.
+use std::rc::Rc;
+
+use cranelift_codegen_shared::isa::x86::EncodingBits;
+
+use crate::cdsl::ast::Literal;
+use crate::cdsl::formats::InstructionFormat;
+use crate::cdsl::instructions::InstructionPredicate;
+use crate::cdsl::recipes::{
+ EncodingRecipe, EncodingRecipeBuilder, OperandConstraint, Register, Stack,
+};
+use crate::cdsl::regs::IsaRegs;
+use crate::cdsl::settings::SettingGroup;
+use crate::shared::Definitions as SharedDefinitions;
+
+use crate::isa::x86::opcodes;
+
+/// Helper data structure to create recipes and template recipes.
+/// It contains all the recipes and recipe templates that might be used by the encodings module in
+/// this same directory.
+pub(crate) struct RecipeGroup<'builder> {
+ /// Memoized registers description, to pass it to builders later.
+ regs: &'builder IsaRegs,
+
+ /// All the recipes explicitly created in this file. This is different from the final set of
+ /// recipes, which is definitive only once encodings have generated new recipes on the fly.
+ recipes: Vec<EncodingRecipe>,
+
+ /// All the recipe templates created in this file.
+ templates: Vec<Rc<Template<'builder>>>,
+}
+
+impl<'builder> RecipeGroup<'builder> {
+ fn new(regs: &'builder IsaRegs) -> Self {
+ Self {
+ regs,
+ recipes: Vec::new(),
+ templates: Vec::new(),
+ }
+ }
+ fn add_recipe(&mut self, recipe: EncodingRecipeBuilder) {
+ self.recipes.push(recipe.build());
+ }
+ fn add_template_recipe(&mut self, recipe: EncodingRecipeBuilder) -> Rc<Template<'builder>> {
+ let template = Rc::new(Template::new(recipe, self.regs));
+ self.templates.push(template.clone());
+ template
+ }
+ fn add_template_inferred(
+ &mut self,
+ recipe: EncodingRecipeBuilder,
+ infer_function: &'static str,
+ ) -> Rc<Template<'builder>> {
+ let template =
+ Rc::new(Template::new(recipe, self.regs).inferred_rex_compute_size(infer_function));
+ self.templates.push(template.clone());
+ template
+ }
+ fn add_template(&mut self, template: Template<'builder>) -> Rc<Template<'builder>> {
+ let template = Rc::new(template);
+ self.templates.push(template.clone());
+ template
+ }
+ pub fn recipe(&self, name: &str) -> &EncodingRecipe {
+ self.recipes
+ .iter()
+ .find(|recipe| recipe.name == name)
+ .unwrap_or_else(|| panic!("unknown recipe name: {}. Try template?", name))
+ }
+ pub fn template(&self, name: &str) -> &Template {
+ self.templates
+ .iter()
+ .find(|recipe| recipe.name() == name)
+ .unwrap_or_else(|| panic!("unknown template name: {}. Try recipe?", name))
+ }
+}
+
+// Opcode representation.
+//
+// Cranelift requires each recipe to have a single encoding size in bytes, and x86 opcodes are
+// variable length, so we use separate recipes for different styles of opcodes and prefixes. The
+// opcode format is indicated by the recipe name prefix.
+//
+// The match case below does not include the REX prefix which goes after the mandatory prefix.
+// VEX/XOP and EVEX prefixes are not yet supported. Encodings using any of these prefixes are
+// represented by separate recipes.
+//
+// The encoding bits are:
+//
+// 0-7: The opcode byte <op>.
+// 8-9: pp, mandatory prefix:
+// 00 none (Op*)
+// 01 66 (Mp*)
+// 10 F3 (Mp*)
+// 11 F2 (Mp*)
+// 10-11: mm, opcode map:
+// 00 <op> (Op1/Mp1)
+// 01 0F <op> (Op2/Mp2)
+// 10 0F 38 <op> (Op3/Mp3)
+// 11 0F 3A <op> (Op3/Mp3)
+// 12-14 rrr, opcode bits for the ModR/M byte for certain opcodes.
+// 15: REX.W bit (or VEX.W/E)
+//
+// There is some redundancy between bits 8-11 and the recipe names, but we have enough bits, and
+// the pp+mm format is ready for supporting VEX prefixes.
+//
+// TODO Cranelift no longer requires each recipe to have a single fixed encoding size, so this
+// could be simplified.
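+//
+// As a worked example of the layout above: MULSD is encoded as [0xf2, 0x0f, 0x59], i.e. pp=0b11
+// (F2 prefix), mm=0b01 (0F map) and op=0x59, giving the recipe name prefix "Mp2" and encoding
+// bits 0x59 | (0b11 << 8) | (0b01 << 10) = 0x759 (with rrr=0 and W=0).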
+
+/// Given a sequence of opcode bytes, compute the recipe name prefix and encoding bits.
+fn decode_opcodes(op_bytes: &[u8], rrr: u16, w: u16) -> (&'static str, u16) {
+ let enc = EncodingBits::new(op_bytes, rrr, w);
+ (enc.prefix().recipe_name_prefix(), enc.bits())
+}
+
+/// Given a snippet of Rust code (or None), replace the `PUT_OP` macro with the
+/// corresponding `put_*` function from the `binemit.rs` module.
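+/// For example, when the recipe name prefix is "Mp2", `{{PUT_OP}}(bits, rex2(r15, in_reg0), sink)`
+/// becomes `put_mp2(bits, rex2(r15, in_reg0), sink)`.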
+fn replace_put_op(code: Option<String>, prefix: &str) -> Option<String> {
+ code.map(|code| code.replace("{{PUT_OP}}", &format!("put_{}", prefix.to_lowercase())))
+}
+
+/// Replaces constraints to a REX-prefixed register class by the equivalent non-REX register class.
+fn replace_nonrex_constraints(
+ regs: &IsaRegs,
+ constraints: Vec<OperandConstraint>,
+) -> Vec<OperandConstraint> {
+ constraints
+ .into_iter()
+ .map(|constraint| match constraint {
+ OperandConstraint::RegClass(rc_index) => {
+ let new_rc_index = if rc_index == regs.class_by_name("GPR") {
+ regs.class_by_name("GPR8")
+ } else if rc_index == regs.class_by_name("FPR") {
+ regs.class_by_name("FPR8")
+ } else {
+ rc_index
+ };
+ OperandConstraint::RegClass(new_rc_index)
+ }
+ _ => constraint,
+ })
+ .collect()
+}
+
+fn replace_evex_constraints(
+ _: &IsaRegs,
+ constraints: Vec<OperandConstraint>,
+) -> Vec<OperandConstraint> {
+ constraints
+ .into_iter()
+ .map(|constraint| match constraint {
+ OperandConstraint::RegClass(rc_index) => {
+ // FIXME(#1306): this should be able to upgrade the register class to FPR32, as
+ // `replace_nonrex_constraints` does above; when FPR32 is re-added, restore the
+ // rc_index conversion to FPR32. In the meantime, this is effectively a no-op
+ // conversion--the register class stays the same.
+ OperandConstraint::RegClass(rc_index)
+ }
+ _ => constraint,
+ })
+ .collect()
+}
+
+/// Specifies how the prefix (e.g. REX) is emitted by a Recipe.
+#[derive(Copy, Clone, PartialEq)]
+pub enum RecipePrefixKind {
+ /// The REX emission behavior is not hardcoded for the Recipe
+ /// and may be overridden when using the Template.
+ Unspecified,
+
+ /// The Recipe must hardcode the non-emission of the REX prefix.
+ NeverEmitRex,
+
+ /// The Recipe must hardcode the emission of the REX prefix.
+ AlwaysEmitRex,
+
+ /// The Recipe should infer the emission of the REX.RXB bits from registers,
+ /// and the REX.W bit from the EncodingBits.
+ ///
+ /// Because such a Recipe has a non-constant instruction size, it must have
+ /// a special `compute_size` handler for the inferrable-REX case.
+ InferRex,
+
+ /// The Recipe must hardcode the emission of an EVEX prefix.
+ Evex,
+}
+
+impl Default for RecipePrefixKind {
+ fn default() -> Self {
+ Self::Unspecified
+ }
+}
+
+/// Previously called a TailRecipe in the Python meta language, this allows creating multiple
+/// variants of a single base EncodingRecipe (rex prefix, specialized w/rrr bits, different
+/// opcodes). It serves as a prototype of an EncodingRecipe, which is then used when actually
+/// creating Encodings, in encodings.rs. This is an idiosyncrasy of the x86 meta-language, and
+/// could be reconsidered later.
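+///
+/// As an illustrative sketch, a template might be specialized as
+/// `template.opcodes(&opcodes::ADD).rex().w()` before `build()` turns it into a concrete
+/// `EncodingRecipe` together with its encoding bits.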
+#[derive(Clone)]
+pub(crate) struct Template<'builder> {
+ /// Description of registers, used in the build() method.
+ regs: &'builder IsaRegs,
+
+ /// The recipe template, which is to be specialized (by copy).
+ recipe: EncodingRecipeBuilder,
+
+ /// How is the REX prefix emitted?
+ rex_kind: RecipePrefixKind,
+
+ /// Function for `compute_size()` when REX is inferrable.
+ inferred_rex_compute_size: Option<&'static str>,
+
+ /// Other recipe to use when REX-prefixed.
+ when_prefixed: Option<Rc<Template<'builder>>>,
+
+ // Parameters passed in the EncodingBits.
+ /// Value of the W bit (0 or 1), stored in the EncodingBits.
+ w_bit: u16,
+ /// Value of the RRR bits (between 0 and 0b111).
+ rrr_bits: u16,
+ /// Opcode bytes.
+ op_bytes: &'static [u8],
+}
+
+impl<'builder> Template<'builder> {
+ fn new(recipe: EncodingRecipeBuilder, regs: &'builder IsaRegs) -> Self {
+ Self {
+ regs,
+ recipe,
+ rex_kind: RecipePrefixKind::default(),
+ inferred_rex_compute_size: None,
+ when_prefixed: None,
+ w_bit: 0,
+ rrr_bits: 0,
+ op_bytes: &opcodes::EMPTY,
+ }
+ }
+
+ fn name(&self) -> &str {
+ &self.recipe.name
+ }
+ fn rex_kind(self, kind: RecipePrefixKind) -> Self {
+ Self {
+ rex_kind: kind,
+ ..self
+ }
+ }
+ fn inferred_rex_compute_size(self, function: &'static str) -> Self {
+ Self {
+ inferred_rex_compute_size: Some(function),
+ ..self
+ }
+ }
+ fn when_prefixed(self, template: Rc<Template<'builder>>) -> Self {
+ assert!(self.when_prefixed.is_none());
+ Self {
+ when_prefixed: Some(template),
+ ..self
+ }
+ }
+
+ // Copy setters.
+ pub fn opcodes(&self, op_bytes: &'static [u8]) -> Self {
+ assert!(!op_bytes.is_empty());
+ let mut copy = self.clone();
+ copy.op_bytes = op_bytes;
+ copy
+ }
+ pub fn w(&self) -> Self {
+ let mut copy = self.clone();
+ copy.w_bit = 1;
+ copy
+ }
+ pub fn rrr(&self, value: u16) -> Self {
+ assert!(value <= 0b111);
+ let mut copy = self.clone();
+ copy.rrr_bits = value;
+ copy
+ }
+ pub fn nonrex(&self) -> Self {
+ assert!(
+ self.rex_kind != RecipePrefixKind::AlwaysEmitRex,
+ "Template requires REX prefix."
+ );
+ let mut copy = self.clone();
+ copy.rex_kind = RecipePrefixKind::NeverEmitRex;
+ copy
+ }
+ pub fn rex(&self) -> Self {
+ assert!(
+ self.rex_kind != RecipePrefixKind::NeverEmitRex,
+ "Template requires no REX prefix."
+ );
+ if let Some(prefixed) = &self.when_prefixed {
+ let mut ret = prefixed.rex();
+ // Forward specialized parameters.
+ ret.op_bytes = self.op_bytes;
+ ret.w_bit = self.w_bit;
+ ret.rrr_bits = self.rrr_bits;
+ return ret;
+ }
+ let mut copy = self.clone();
+ copy.rex_kind = RecipePrefixKind::AlwaysEmitRex;
+ copy
+ }
+ pub fn infer_rex(&self) -> Self {
+ assert!(
+ self.rex_kind != RecipePrefixKind::NeverEmitRex,
+ "Template requires no REX prefix."
+ );
+ assert!(
+ self.when_prefixed.is_none(),
+ "infer_rex used with when_prefixed()."
+ );
+ let mut copy = self.clone();
+ copy.rex_kind = RecipePrefixKind::InferRex;
+ copy
+ }
+
+ pub fn build(mut self) -> (EncodingRecipe, u16) {
+ let (opcode, bits) = decode_opcodes(&self.op_bytes, self.rrr_bits, self.w_bit);
+
+ let (recipe_name, size_addendum) = match self.rex_kind {
+ RecipePrefixKind::Unspecified | RecipePrefixKind::NeverEmitRex => {
+ // Ensure the operands are limited to non-REX constraints.
+ let operands_in = self.recipe.operands_in.unwrap_or_default();
+ self.recipe.operands_in = Some(replace_nonrex_constraints(self.regs, operands_in));
+ let operands_out = self.recipe.operands_out.unwrap_or_default();
+ self.recipe.operands_out =
+ Some(replace_nonrex_constraints(self.regs, operands_out));
+
+ (opcode.into(), self.op_bytes.len() as u64)
+ }
+ RecipePrefixKind::AlwaysEmitRex => {
+ ("Rex".to_string() + opcode, self.op_bytes.len() as u64 + 1)
+ }
+ RecipePrefixKind::InferRex => {
+ assert_eq!(self.w_bit, 0, "A REX.W bit always requires a REX prefix; avoid using `infer_rex().w()` and use `rex().w()` instead.");
+ // Hook up the right function for inferred compute_size().
+ assert!(
+ self.inferred_rex_compute_size.is_some(),
+ "InferRex recipe '{}' needs an inferred_rex_compute_size function.",
+ &self.recipe.name
+ );
+ self.recipe.compute_size = self.inferred_rex_compute_size;
+
+ ("DynRex".to_string() + opcode, self.op_bytes.len() as u64)
+ }
+ RecipePrefixKind::Evex => {
+ // Allow the operands to expand limits to EVEX constraints.
+ let operands_in = self.recipe.operands_in.unwrap_or_default();
+ self.recipe.operands_in = Some(replace_evex_constraints(self.regs, operands_in));
+ let operands_out = self.recipe.operands_out.unwrap_or_default();
+ self.recipe.operands_out = Some(replace_evex_constraints(self.regs, operands_out));
+
+ ("Evex".to_string() + opcode, 4 + 1)
+ }
+ };
+
+ self.recipe.base_size += size_addendum;
+
+ // Branch ranges are relative to the end of the instruction.
+ // For InferRex, the range should be the minimum, assuming no REX.
+ if let Some(range) = self.recipe.branch_range.as_mut() {
+ range.inst_size += size_addendum;
+ }
+
+ self.recipe.emit = replace_put_op(self.recipe.emit, &recipe_name);
+ self.recipe.name = recipe_name + &self.recipe.name;
+
+ (self.recipe.build(), bits)
+ }
+}
+
+/// Returns a predicate checking that the "cond" field of the instruction contains one of the
+/// directly supported floating point condition codes.
+fn supported_floatccs_predicate(
+ supported_cc: &[Literal],
+ format: &InstructionFormat,
+) -> InstructionPredicate {
+ supported_cc
+ .iter()
+ .fold(InstructionPredicate::new(), |pred, literal| {
+ pred.or(InstructionPredicate::new_is_field_equal(
+ format,
+ "cond",
+ literal.to_rust_code(),
+ ))
+ })
+}
+
+/// Return an instruction predicate that checks if `iform.imm` is a valid `scale` for a SIB byte.
+fn valid_scale(format: &InstructionFormat) -> InstructionPredicate {
+ ["1", "2", "4", "8"]
+ .iter()
+ .fold(InstructionPredicate::new(), |pred, &literal| {
+ pred.or(InstructionPredicate::new_is_field_equal(
+ format,
+ "imm",
+ literal.into(),
+ ))
+ })
+}
+
+pub(crate) fn define<'shared>(
+ shared_defs: &'shared SharedDefinitions,
+ settings: &'shared SettingGroup,
+ regs: &'shared IsaRegs,
+) -> RecipeGroup<'shared> {
+ // The set of floating point condition codes that are directly supported.
+ // Other condition codes need to be reversed or expressed as two tests.
+ let floatcc = &shared_defs.imm.floatcc;
+ let supported_floatccs: Vec<Literal> = ["ord", "uno", "one", "ueq", "gt", "ge", "ult", "ule"]
+ .iter()
+ .map(|name| Literal::enumerator_for(floatcc, name))
+ .collect();
+
+ // Register classes shorthands.
+ let abcd = regs.class_by_name("ABCD");
+ let gpr = regs.class_by_name("GPR");
+ let fpr = regs.class_by_name("FPR");
+ let flag = regs.class_by_name("FLAG");
+
+ // Operand constraints shorthands.
+ let reg_rflags = Register::new(flag, regs.regunit_by_name(flag, "rflags"));
+ let reg_rax = Register::new(gpr, regs.regunit_by_name(gpr, "rax"));
+ let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
+ let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
+ let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
+ let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));
+
+ // Stack operand with a 32-bit signed displacement from either RBP or RSP.
+ let stack_gpr32 = Stack::new(gpr);
+ let stack_fpr32 = Stack::new(fpr);
+
+ let formats = &shared_defs.formats;
+
+ // Predicates shorthands.
+ let use_sse41 = settings.predicate_by_name("use_sse41");
+
+ // Definitions.
+ let mut recipes = RecipeGroup::new(regs);
+
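+ // Note: the `{{PUT_OP}}` placeholder in the emit snippets below is rewritten by
+ // `Template::build()` (via `replace_put_op`) into the opcode-emitting helper that
+ // matches the final, prefix-qualified recipe name (presumably `put_op2`,
+ // `put_rexop2` and friends in the generated binemit code), so a single template
+ // can serve the REX, non-REX and inferred-REX variants.
+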
+ // A null unary instruction that takes a GPR register. Can be used for identity copies and
+ // no-op conversions.
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("null", &formats.unary, 0)
+ .operands_in(vec![gpr])
+ .operands_out(vec![0])
+ .emit(""),
+ );
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("null_fpr", &formats.unary, 0)
+ .operands_in(vec![fpr])
+ .operands_out(vec![0])
+ .emit(""),
+ );
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("stacknull", &formats.unary, 0)
+ .operands_in(vec![stack_gpr32])
+ .operands_out(vec![stack_gpr32])
+ .emit(""),
+ );
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("get_pinned_reg", &formats.nullary, 0)
+ .operands_out(vec![reg_r15])
+ .emit(""),
+ );
+ // Like umr, but the destination is hard-coded to r15 (the pinned register).
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("set_pinned_reg", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ let r15 = RU::r15.into();
+ {{PUT_OP}}(bits, rex2(r15, in_reg0), sink);
+ modrm_rr(r15, in_reg0, sink);
+ "#,
+ ),
+ );
+
+ // No-op fills, created by late-stage redundant-fill removal.
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("fillnull", &formats.unary, 0)
+ .operands_in(vec![stack_gpr32])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(""),
+ );
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("ffillnull", &formats.unary, 0)
+ .operands_in(vec![stack_gpr32])
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(""),
+ );
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("debugtrap", &formats.nullary, 1).emit("sink.put1(0xcc);"),
+ );
+
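+ // Opcode notation in the recipe comments below follows the Intel manuals: "XX" is
+ // the opcode byte (or bytes), "/r" is a ModR/M byte with a register operand in its
+ // reg field, "/n" is a ModR/M byte whose reg field holds the constant n (an opcode
+ // extension), "ib"/"id"/"iq" are byte/doubleword/quadword immediates, and "+rd"
+ // means the register number is added into the low three bits of the opcode byte.
+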
+ // XX opcode, no ModR/M.
+ recipes.add_template_recipe(EncodingRecipeBuilder::new("trap", &formats.trap, 0).emit(
+ r#"
+ sink.trap(code, func.srclocs[inst]);
+ {{PUT_OP}}(bits, BASE_REX, sink);
+ "#,
+ ));
+
+ // Macro: conditional jump over a ud2.
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("trapif", &formats.int_cond_trap, 4)
+ .operands_in(vec![reg_rflags])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ // Jump over a 2-byte ud2.
+ sink.put1(0x70 | (icc2opc(cond.inverse()) as u8));
+ sink.put1(2);
+ // ud2.
+ sink.trap(code, func.srclocs[inst]);
+ sink.put1(0x0f);
+ sink.put1(0x0b);
+ "#,
+ ),
+ );
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("trapff", &formats.float_cond_trap, 4)
+ .operands_in(vec![reg_rflags])
+ .clobbers_flags(false)
+ .inst_predicate(supported_floatccs_predicate(
+ &supported_floatccs,
+ &*formats.float_cond_trap,
+ ))
+ .emit(
+ r#"
+ // Jump over a 2-byte ud2.
+ sink.put1(0x70 | (fcc2opc(cond.inverse()) as u8));
+ sink.put1(2);
+ // ud2.
+ sink.trap(code, func.srclocs[inst]);
+ sink.put1(0x0f);
+ sink.put1(0x0b);
+ "#,
+ ),
+ );
+
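+ // Judging by the MR-form `rr` recipe below (whose destination is the first input),
+ // `rex2()` and `modrm_rr()` take the r/m operand first and the reg-field operand
+ // second; the "x"-suffixed variants swap the two arguments to produce the RM form.
+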
+ // XX /r
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("rr", &formats.binary, 1)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
+ // XX /r with operands swapped (RM form).
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("rrx", &formats.binary, 1)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
+ // XX /r with FPR ins and outs. A form.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fa", &formats.binary, 1)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
+ // XX /r with FPR ins and outs. A form with input operands swapped.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fax", &formats.binary, 1)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![1])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ // The operand order does not matter for calculating whether a REX prefix is needed.
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
+ // XX /r with FPR ins and outs. A form with a byte immediate.
+ {
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fa_ib", &formats.ternary_imm8, 2)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![0])
+ .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+ &*formats.ternary_imm8,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+ }
+
+ // XX /n for a unary operation with extension bits.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("ur", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+
+ // XX /r, but for a unary operator with separate input/output register, like
+ // copies. MR form, preserving flags.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("umr", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink);
+ modrm_rr(out_reg0, in_reg0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_outreg0"),
+ );
+
+ // Same as umr, but with FPR -> GPR registers.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("rfumr", &formats.unary, 1)
+ .operands_in(vec![fpr])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink);
+ modrm_rr(out_reg0, in_reg0, sink);
+ "#,
+ ),
+ );
+
+ // Same as umr, but with the source register specified directly.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("umr_reg_to_ssa", &formats.copy_to_ssa, 1)
+ // No operands_in to mention, because a source register is specified directly.
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, src), sink);
+ modrm_rr(out_reg0, src, sink);
+ "#,
+ ),
+ );
+
+ // XX /r, but for a unary operator with separate input/output register.
+ // RM form. Clobbers FLAGS.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("urm", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+
+ // XX /r. Same as urm, but doesn't clobber FLAGS.
+ let urm_noflags = recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("urm_noflags", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+
+ // XX /r. Same as urm_noflags, but input limited to ABCD.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("urm_noflags_abcd", &formats.unary, 1)
+ .operands_in(vec![abcd])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(urm_noflags),
+ );
+
+ // XX /r, RM form, FPR -> FPR.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("furm", &formats.unary, 1)
+ .operands_in(vec![fpr])
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_outreg0",
+ );
+
+ // Same as furm, but with the source register specified directly.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("furm_reg_to_ssa", &formats.copy_to_ssa, 1)
+ // No operands_in to mention, because a source register is specified directly.
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(src, out_reg0), sink);
+ modrm_rr(src, out_reg0, sink);
+ "#,
+ ),
+ );
+
+ // XX /r, RM form, GPR -> FPR.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("frurm", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_outreg0",
+ );
+
+ // XX /r, RM form, FPR -> GPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("rfurm", &formats.unary, 1)
+ .operands_in(vec![fpr])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+
+ // XX /r, RMI form for one of the roundXX SSE 4.1 instructions.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("furmi_rnd", &formats.unary, 2)
+ .operands_in(vec![fpr])
+ .operands_out(vec![fpr])
+ .isa_predicate(use_sse41)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ sink.put1(match opcode {
+ Opcode::Nearest => 0b00,
+ Opcode::Floor => 0b01,
+ Opcode::Ceil => 0b10,
+ Opcode::Trunc => 0b11,
+ _ => panic!("{} unexpected for furmi_rnd", opcode),
+ });
+ "#,
+ ),
+ );
+
+ // XX /r, for regmove instructions.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("rmov", &formats.reg_move, 1)
+ .operands_in(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(dst, src), sink);
+ modrm_rr(dst, src, sink);
+ "#,
+ ),
+ );
+
+ // XX /r, for regmove instructions (FPR version, RM encoded).
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("frmov", &formats.reg_move, 1)
+ .operands_in(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(src, dst), sink);
+ modrm_rr(src, dst, sink);
+ "#,
+ ),
+ );
+
+ // XX /n with one arg in %rcx, for shifts.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("rc", &formats.binary, 1)
+ .operands_in(vec![
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::FixedReg(reg_rcx),
+ ])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ "#,
+ ),
+ );
+
+ // XX /n for division: inputs in %rax, %rdx, r. Outputs in %rax, %rdx.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("div", &formats.ternary, 1)
+ .operands_in(vec![
+ OperandConstraint::FixedReg(reg_rax),
+ OperandConstraint::FixedReg(reg_rdx),
+ OperandConstraint::RegClass(gpr),
+ ])
+ .operands_out(vec![reg_rax, reg_rdx])
+ .emit(
+ r#"
+ sink.trap(TrapCode::IntegerDivisionByZero, func.srclocs[inst]);
+ {{PUT_OP}}(bits, rex1(in_reg2), sink);
+ modrm_r_bits(in_reg2, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg2"),
+ );
+
+ // XX /n for {s,u}mulx: inputs in %rax, r. Outputs in %rdx(hi):%rax(lo)
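+ // (These correspond to the one-operand MUL/IMUL encodings, e.g. F7 /4 and F7 /5,
+ // where the second factor and the low half of the product are implicitly %rax and
+ // the high half is written to %rdx.)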
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("mulx", &formats.binary, 1)
+ .operands_in(vec![
+ OperandConstraint::FixedReg(reg_rax),
+ OperandConstraint::RegClass(gpr),
+ ])
+ .operands_out(vec![
+ OperandConstraint::FixedReg(reg_rax),
+ OperandConstraint::FixedReg(reg_rdx),
+ ])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg1), sink);
+ modrm_r_bits(in_reg1, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
+ );
+
+ // XX /r for BLEND* instructions
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
+ .operands_in(vec![
+ OperandConstraint::FixedReg(reg_xmm0),
+ OperandConstraint::RegClass(fpr),
+ OperandConstraint::RegClass(fpr),
+ ])
+ .operands_out(vec![2])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
+ modrm_rr(in_reg1, in_reg2, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg1_inreg2",
+ );
+
+ // XX /n ib with 8-bit immediate sign-extended.
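+ // (`new_is_signed_int(format, "imm", 8, 0)` requires the immediate to fit in 8
+ // signed bits; the trailing 0 appears to be a power-of-two scale, i.e. no
+ // alignment requirement here. The unsigned variants used elsewhere follow the
+ // same pattern.)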
+ {
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("r_ib", &formats.binary_imm64, 2)
+ .operands_in(vec![gpr])
+ .operands_out(vec![0])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.binary_imm64,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0",
+ );
+
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("f_ib", &formats.binary_imm64, 2)
+ .operands_in(vec![fpr])
+ .operands_out(vec![0])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.binary_imm64,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0",
+ );
+
+ // XX /n id with 32-bit immediate sign-extended.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("r_id", &formats.binary_imm64, 5)
+ .operands_in(vec![gpr])
+ .operands_out(vec![0])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.binary_imm64,
+ "imm",
+ 32,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+ }
+
+ // XX /r ib with 8-bit unsigned immediate (e.g. for pshufd)
+ {
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("r_ib_unsigned_fpr", &formats.binary_imm8, 2)
+ .operands_in(vec![fpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+ &*formats.binary_imm8,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_outreg0",
+ );
+ }
+
+ // XX /r ib with 8-bit unsigned immediate (e.g. for extractlane)
+ {
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("r_ib_unsigned_gpr", &formats.binary_imm8, 2)
+ .operands_in(vec![fpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+ &*formats.binary_imm8, "imm", 8, 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink);
+ modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_outreg0",
+ );
+ }
+
+ // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane)
+ {
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("r_ib_unsigned_r", &formats.ternary_imm8, 2)
+ .operands_in(vec![fpr, gpr])
+ .operands_out(vec![0])
+ .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+ &*formats.ternary_imm8,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+ }
+
+ {
+ // XX /n id with 32-bit immediate sign-extended. UnaryImm version.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("u_id", &formats.unary_imm, 5)
+ .operands_out(vec![gpr])
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.unary_imm,
+ "imm",
+ 32,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ "#,
+ ),
+ );
+ }
+
+ // XX+rd id unary with 32-bit immediate. Note no recipe predicate.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pu_id", &formats.unary_imm, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ // The destination register is encoded in the low bits of the opcode.
+ // No ModR/M.
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ "#,
+ ),
+ );
+
+ // XX+rd id unary with bool immediate. Note no recipe predicate.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pu_id_bool", &formats.unary_bool, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ // The destination register is encoded in the low bits of the opcode.
+ // No ModR/M.
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ let imm: u32 = if imm { 1 } else { 0 };
+ sink.put4(imm);
+ "#,
+ ),
+ );
+
+ // XX+rd id nullary with 0 as 32-bit immediate. Note no recipe predicate.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pu_id_ref", &formats.nullary, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ // The destination register is encoded in the low bits of the opcode.
+ // No ModR/M.
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // XX+rd iq unary with 64-bit immediate.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pu_iq", &formats.unary_imm, 8)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ let imm: i64 = imm.into();
+ sink.put8(imm as u64);
+ "#,
+ ),
+ );
+
+ // Unary with zero immediate, encoded as a register self-operation (XX /r with the
+ // same register in both ModR/M fields); no immediate bytes are emitted.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("u_id_z", &formats.unary_imm, 1)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+ modrm_rr(out_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+
+ // XX /r unary with a floating point 32-bit immediate equal to zero; the register
+ // is zeroed by operating it against itself, so no immediate bytes are emitted.
+ {
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("f32imm_z", &formats.unary_ieee32, 1)
+ .operands_out(vec![fpr])
+ .inst_predicate(InstructionPredicate::new_is_zero_32bit_float(
+ &*formats.unary_ieee32,
+ "imm",
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+ modrm_rr(out_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+ }
+
+ // XX /r unary with a floating point 64-bit immediate equal to zero; the register
+ // is zeroed by operating it against itself, so no immediate bytes are emitted.
+ {
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("f64imm_z", &formats.unary_ieee64, 1)
+ .operands_out(vec![fpr])
+ .inst_predicate(InstructionPredicate::new_is_zero_64bit_float(
+ &*formats.unary_ieee64,
+ "imm",
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+ modrm_rr(out_reg0, out_reg0, sink);
+ "#,
+ ),
+ );
+ }
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pushq", &formats.unary, 0)
+ .operands_in(vec![gpr])
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ {{PUT_OP}}(bits | (in_reg0 & 7), rex1(in_reg0), sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("popq", &formats.nullary, 0)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ "#,
+ ),
+ );
+
+ // XX /r, for copy_special instructions (copies between special registers such as the stack pointer).
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("copysp", &formats.copy_special, 1)
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(dst, src), sink);
+ modrm_rr(dst, src, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("adjustsp", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(RU::rsp.into(), in_reg0), sink);
+ modrm_rr(RU::rsp.into(), in_reg0, sink);
+ "#,
+ ),
+ );
+
+ {
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("adjustsp_ib", &formats.unary_imm, 2)
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.unary_imm,
+ "imm",
+ 8,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(RU::rsp.into()), sink);
+ modrm_r_bits(RU::rsp.into(), bits, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("adjustsp_id", &formats.unary_imm, 5)
+ .inst_predicate(InstructionPredicate::new_is_signed_int(
+ &*formats.unary_imm,
+ "imm",
+ 32,
+ 0,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(RU::rsp.into()), sink);
+ modrm_r_bits(RU::rsp.into(), bits, sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ "#,
+ ),
+ );
+ }
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("dummy_sarg_t", &formats.nullary, 0)
+ .operands_out(vec![Stack::new(gpr)])
+ .emit(""),
+ );
+
+ // XX+rd id with Abs4 function relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fnaddr4", &formats.func_addr, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs4,
+ &func.dfg.ext_funcs[func_ref].name,
+ 0);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // XX+rd iq with Abs8 function relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fnaddr8", &formats.func_addr, 8)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs8,
+ &func.dfg.ext_funcs[func_ref].name,
+ 0);
+ sink.put8(0);
+ "#,
+ ),
+ );
+
+ // Similar to fnaddr4, but writes !0 (this is used by BaldrMonkey).
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("allones_fnaddr4", &formats.func_addr, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs4,
+ &func.dfg.ext_funcs[func_ref].name,
+ 0);
+ // Write the immediate as `!0` for the benefit of BaldrMonkey.
+ sink.put4(!0);
+ "#,
+ ),
+ );
+
+ // Similar to fnaddr8, but writes !0 (this is used by BaldrMonkey).
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("allones_fnaddr8", &formats.func_addr, 8)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs8,
+ &func.dfg.ext_funcs[func_ref].name,
+ 0);
+ // Write the immediate as `!0` for the benefit of BaldrMonkey.
+ sink.put8(!0);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pcrel_fnaddr8", &formats.func_addr, 5)
+ .operands_out(vec![gpr])
+ // rex2 gets passed 0 for r/m register because the upper bit of
+ // r/m doesn't get decoded when in rip-relative addressing mode.
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+ // The addend adjusts for the difference between the end of the
+ // instruction and the beginning of the immediate field.
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86PCRel4,
+ &func.dfg.ext_funcs[func_ref].name,
+ -4);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("got_fnaddr8", &formats.func_addr, 5)
+ .operands_out(vec![gpr])
+ // rex2 gets passed 0 for r/m register because the upper bit of
+ // r/m doesn't get decoded when in rip-relative addressing mode.
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+ // The addend adjusts for the difference between the end of the
+ // instruction and the beginning of the immediate field.
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86GOTPCRel4,
+ &func.dfg.ext_funcs[func_ref].name,
+ -4);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // XX+rd id with Abs4 globalsym relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("gvaddr4", &formats.unary_global_value, 4)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs4,
+ &func.global_values[global_value].symbol_name(),
+ 0);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // XX+rd iq with Abs8 globalsym relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("gvaddr8", &formats.unary_global_value, 8)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::Abs8,
+ &func.global_values[global_value].symbol_name(),
+ 0);
+ sink.put8(0);
+ "#,
+ ),
+ );
+
+ // XX /r with a RIP-relative disp32 and an X86PCRel4 globalsym relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("pcrel_gvaddr8", &formats.unary_global_value, 5)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_rm(5, out_reg0, sink);
+ // The addend adjusts for the difference between the end of the
+ // instruction and the beginning of the immediate field.
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86PCRel4,
+ &func.global_values[global_value].symbol_name(),
+ -4);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // XX /r with a RIP-relative disp32 and an X86GOTPCRel4 globalsym relocation.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("got_gvaddr8", &formats.unary_global_value, 5)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_rm(5, out_reg0, sink);
+ // The addend adjusts for the difference between the end of the
+ // instruction and the beginning of the immediate field.
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86GOTPCRel4,
+ &func.global_values[global_value].symbol_name(),
+ -4);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ // Stack addresses.
+ //
+ // TODO Alternative forms for 8-bit immediates, when applicable.
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("spaddr_id", &formats.stack_load, 6)
+ .operands_out(vec![gpr])
+ .emit(
+ r#"
+ let sp = StackRef::sp(stack_slot, &func.stack_slots);
+ let base = stk_base(sp.base);
+ {{PUT_OP}}(bits, rex2(base, out_reg0), sink);
+ modrm_sib_disp32(out_reg0, sink);
+ sib_noindex(base, sink);
+ let imm : i32 = offset.into();
+ sink.put4(sp.offset.checked_add(imm).unwrap() as u32);
+ "#,
+ ),
+ );
+
+ // Constant addresses.
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("const_addr", &formats.unary_const, 5)
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+ const_disp4(constant_handle, func, sink);
+ "#,
+ ),
+ );
+
+ // Store recipes.
+
+ {
+ // Simple stores.
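+ //
+ // The `needs_sib_byte()` / `needs_offset()` checks below deal with two x86
+ // addressing quirks: a base of RSP/R12 (r/m = 0b100) can only be expressed with an
+ // SIB byte, and a base of RBP/R13 with mod = 00 would be decoded as disp32 or
+ // RIP-relative, so those bases get an explicit zero disp8 instead (this is
+ // presumably exactly what the two helpers test for).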
+
+ // A predicate asking if the offset is zero.
+ let has_no_offset =
+ InstructionPredicate::new_is_field_equal(&*formats.store, "offset", "0".into());
+
+ // XX /r register-indirect store with no offset.
+ let st = recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("st", &formats.store, 1)
+ .operands_in(vec![gpr, gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else if needs_offset(in_reg1) {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_rm(in_reg1, in_reg0, sink);
+ }
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with no offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("st_abcd", &formats.store, 1)
+ .operands_in(vec![abcd, gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else if needs_offset(in_reg1) {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_rm(in_reg1, in_reg0, sink);
+ }
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(st),
+ );
+
+ // XX /r register-indirect store of FPR with no offset.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fst", &formats.store, 1)
+ .operands_in(vec![fpr, gpr])
+ .inst_predicate(has_no_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else if needs_offset(in_reg1) {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_rm(in_reg1, in_reg0, sink);
+ }
+ "#,
+ ),
+ "size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1",
+ );
+
+ let has_small_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.store, "offset", 8, 0);
+
+ // XX /r register-indirect store with 8-bit offset.
+ let st_disp8 = recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stDisp8", &formats.store, 2)
+ .operands_in(vec![gpr, gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with 8-bit offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("stDisp8_abcd", &formats.store, 2)
+ .operands_in(vec![abcd, gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(st_disp8),
+ );
+
+ // XX /r register-indirect store with 8-bit offset of FPR.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fstDisp8", &formats.store, 2)
+ .operands_in(vec![fpr, gpr])
+ .inst_predicate(has_small_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp8(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ "size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1",
+ );
+
+ // XX /r register-indirect store with 32-bit offset.
+ let st_disp32 = recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stDisp32", &formats.store, 5)
+ .operands_in(vec![gpr, gpr])
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp32(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp32(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with 32-bit offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("stDisp32_abcd", &formats.store, 5)
+ .operands_in(vec![abcd, gpr])
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp32(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp32(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(st_disp32),
+ );
+
+ // XX /r register-indirect store with 32-bit offset of FPR.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fstDisp32", &formats.store, 5)
+ .operands_in(vec![fpr, gpr])
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ if needs_sib_byte(in_reg1) {
+ modrm_sib_disp32(in_reg0, sink);
+ sib_noindex(in_reg1, sink);
+ } else {
+ modrm_disp32(in_reg1, in_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ "size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1",
+ );
+ }
+
+ {
+ // Complex stores.
+
+ // A predicate asking if the offset is zero.
+ let has_no_offset =
+ InstructionPredicate::new_is_field_equal(&*formats.store_complex, "offset", "0".into());
+
+ // XX /r register-indirect store with index and no offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndex", &formats.store_complex, 2)
+ .operands_in(vec![gpr, gpr, gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ // An SIB byte is always emitted; a base that needs an explicit offset also gets a zero disp8.
+ if needs_offset(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ }
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and no offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndex_abcd", &formats.store_complex, 2)
+ .operands_in(vec![abcd, gpr, gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ // An SIB byte is always emitted; a base that needs an explicit offset also gets a zero disp8.
+ if needs_offset(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ }
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and no offset of FPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fstWithIndex", &formats.store_complex, 2)
+ .operands_in(vec![fpr, gpr, gpr])
+ .inst_predicate(has_no_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_offset_for_inreg_1")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ // An SIB byte is always emitted; a base that needs an explicit offset also gets a zero disp8.
+ if needs_offset(in_reg1) {
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ }
+ "#,
+ ),
+ );
+
+ let has_small_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.store_complex, "offset", 8, 0);
+
+ // XX /r register-indirect store with index and 8-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndexDisp8", &formats.store_complex, 3)
+ .operands_in(vec![gpr, gpr, gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and 8-bit offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndexDisp8_abcd", &formats.store_complex, 3)
+ .operands_in(vec![abcd, gpr, gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and 8-bit offset of FPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fstWithIndexDisp8", &formats.store_complex, 3)
+ .operands_in(vec![fpr, gpr, gpr])
+ .inst_predicate(has_small_offset)
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp8(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ let has_big_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.store_complex, "offset", 32, 0);
+
+ // XX /r register-indirect store with index and 32-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndexDisp32", &formats.store_complex, 6)
+ .operands_in(vec![gpr, gpr, gpr])
+ .inst_predicate(has_big_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp32(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and 32-bit offset.
+ // Only ABCD allowed for stored value. This is for byte stores with no REX.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("stWithIndexDisp32_abcd", &formats.store_complex, 6)
+ .operands_in(vec![abcd, gpr, gpr])
+ .inst_predicate(has_big_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp32(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+
+ // XX /r register-indirect store with index and 32-bit offset of FPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fstWithIndexDisp32", &formats.store_complex, 6)
+ .operands_in(vec![fpr, gpr, gpr])
+ .inst_predicate(has_big_offset)
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+ modrm_sib_disp32(in_reg0, sink);
+ sib(0, in_reg2, in_reg1, sink);
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+ }
+
+ // Unary spill with SIB and 32-bit displacement.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("spillSib32", &formats.unary, 6)
+ .operands_in(vec![gpr])
+ .operands_out(vec![stack_gpr32])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ let base = stk_base(out_stk0.base);
+ {{PUT_OP}}(bits, rex2(base, in_reg0), sink);
+ modrm_sib_disp32(in_reg0, sink);
+ sib_noindex(base, sink);
+ sink.put4(out_stk0.offset as u32);
+ "#,
+ ),
+ );
+
+ // Like spillSib32, but targeting an FPR rather than a GPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fspillSib32", &formats.unary, 6)
+ .operands_in(vec![fpr])
+ .operands_out(vec![stack_fpr32])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ let base = stk_base(out_stk0.base);
+ {{PUT_OP}}(bits, rex2(base, in_reg0), sink);
+ modrm_sib_disp32(in_reg0, sink);
+ sib_noindex(base, sink);
+ sink.put4(out_stk0.offset as u32);
+ "#,
+ ),
+ );
+
+ // Regspill using RSP-relative addressing.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("regspill32", &formats.reg_spill, 6)
+ .operands_in(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ let dst = StackRef::sp(dst, &func.stack_slots);
+ let base = stk_base(dst.base);
+ {{PUT_OP}}(bits, rex2(base, src), sink);
+ modrm_sib_disp32(src, sink);
+ sib_noindex(base, sink);
+ sink.put4(dst.offset as u32);
+ "#,
+ ),
+ );
+
+ // Like regspill32, but targeting an FPR rather than a GPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fregspill32", &formats.reg_spill, 6)
+ .operands_in(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ let dst = StackRef::sp(dst, &func.stack_slots);
+ let base = stk_base(dst.base);
+ {{PUT_OP}}(bits, rex2(base, src), sink);
+ modrm_sib_disp32(src, sink);
+ sib_noindex(base, sink);
+ sink.put4(dst.offset as u32);
+ "#,
+ ),
+ );
+
+ // Load recipes.
+
+ {
+ // Simple loads.
+
+ // A predicate asking if the offset is zero.
+ let has_no_offset =
+ InstructionPredicate::new_is_field_equal(&*formats.load, "offset", "0".into());
+
+ // XX /r load with no offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ld", &formats.load, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_or_offset_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else if needs_offset(in_reg0) {
+ modrm_disp8(in_reg0, out_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_rm(in_reg0, out_reg0, sink);
+ }
+ "#,
+ ),
+ );
+
+ // XX /r float load with no offset.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fld", &formats.load, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_no_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_or_offset_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else if needs_offset(in_reg0) {
+ modrm_disp8(in_reg0, out_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_rm(in_reg0, out_reg0, sink);
+ }
+ "#,
+ ),
+ "size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0",
+ );
+
+ let has_small_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.load, "offset", 8, 0);
+
+ // XX /r load with 8-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ldDisp8", &formats.load, 2)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib_disp8(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else {
+ modrm_disp8(in_reg0, out_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ // XX /r float load with 8-bit offset.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fldDisp8", &formats.load, 2)
+ .operands_in(vec![gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_small_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib_disp8(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else {
+ modrm_disp8(in_reg0, out_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0",
+ );
+
+ let has_big_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.load, "offset", 32, 0);
+
+ // XX /r load with 32-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ldDisp32", &formats.load, 5)
+ .operands_in(vec![gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_big_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib_disp32(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else {
+ modrm_disp32(in_reg0, out_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+
+ // XX /r float load with 32-bit offset.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fldDisp32", &formats.load, 5)
+ .operands_in(vec![gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_big_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_sib_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ if needs_sib_byte(in_reg0) {
+ modrm_sib_disp32(out_reg0, sink);
+ sib_noindex(in_reg0, sink);
+ } else {
+ modrm_disp32(in_reg0, out_reg0, sink);
+ }
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0",
+ );
+ }
+
+ {
+ // Complex loads.
+
+ // A predicate asking if the offset is zero.
+ let has_no_offset =
+ InstructionPredicate::new_is_field_equal(&*formats.load_complex, "offset", "0".into());
+
+ // XX /r load with index and no offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ldWithIndex", &formats.load_complex, 2)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_no_offset.clone())
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_offset_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ // An SIB byte is always emitted; a base that needs an explicit offset also gets a zero disp8.
+ if needs_offset(in_reg0) {
+ modrm_sib_disp8(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ }
+ "#,
+ ),
+ );
+
+ // XX /r float load with index and no offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fldWithIndex", &formats.load_complex, 2)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_no_offset)
+ .clobbers_flags(false)
+ .compute_size("size_plus_maybe_offset_for_inreg_0")
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ // An SIB byte is always emitted; a base that needs an explicit offset also gets a zero disp8.
+ if needs_offset(in_reg0) {
+ modrm_sib_disp8(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ }
+ "#,
+ ),
+ );
+
+ let has_small_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.load_complex, "offset", 8, 0);
+
+ // XX /r load with index and 8-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ldWithIndexDisp8", &formats.load_complex, 3)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_small_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ modrm_sib_disp8(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ // XX /r float load with 8-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fldWithIndexDisp8", &formats.load_complex, 3)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_small_offset)
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ modrm_sib_disp8(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ let offset: i32 = offset.into();
+ sink.put1(offset as u8);
+ "#,
+ ),
+ );
+
+ let has_big_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.load_complex, "offset", 32, 0);
+
+ // XX /r load with index and 32-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ldWithIndexDisp32", &formats.load_complex, 6)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .inst_predicate(has_big_offset.clone())
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ modrm_sib_disp32(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+
+ // XX /r float load with index and 32-bit offset.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fldWithIndexDisp32", &formats.load_complex, 6)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![fpr])
+ .inst_predicate(has_big_offset)
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ if !flags.notrap() {
+ sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+ }
+ {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+ modrm_sib_disp32(out_reg0, sink);
+ sib(0, in_reg1, in_reg0, sink);
+ let offset: i32 = offset.into();
+ sink.put4(offset as u32);
+ "#,
+ ),
+ );
+ }
+
+ // Unary fill with SIB and 32-bit displacement.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fillSib32", &formats.unary, 6)
+ .operands_in(vec![stack_gpr32])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ let base = stk_base(in_stk0.base);
+ {{PUT_OP}}(bits, rex2(base, out_reg0), sink);
+ modrm_sib_disp32(out_reg0, sink);
+ sib_noindex(base, sink);
+ sink.put4(in_stk0.offset as u32);
+ "#,
+ ),
+ );
+
+ // Like fillSib32, but targeting an FPR rather than a GPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ffillSib32", &formats.unary, 6)
+ .operands_in(vec![stack_fpr32])
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ let base = stk_base(in_stk0.base);
+ {{PUT_OP}}(bits, rex2(base, out_reg0), sink);
+ modrm_sib_disp32(out_reg0, sink);
+ sib_noindex(base, sink);
+ sink.put4(in_stk0.offset as u32);
+ "#,
+ ),
+ );
+
+ // Regfill with RSP-relative 32-bit displacement.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("regfill32", &formats.reg_fill, 6)
+ .operands_in(vec![stack_gpr32])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ let src = StackRef::sp(src, &func.stack_slots);
+ let base = stk_base(src.base);
+ {{PUT_OP}}(bits, rex2(base, dst), sink);
+ modrm_sib_disp32(dst, sink);
+ sib_noindex(base, sink);
+ sink.put4(src.offset as u32);
+ "#,
+ ),
+ );
+
+ // Like regfill32, but targeting an FPR rather than a GPR.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fregfill32", &formats.reg_fill, 6)
+ .operands_in(vec![stack_fpr32])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ let src = StackRef::sp(src, &func.stack_slots);
+ let base = stk_base(src.base);
+ {{PUT_OP}}(bits, rex2(base, dst), sink);
+ modrm_sib_disp32(dst, sink);
+ sib_noindex(base, sink);
+ sink.put4(src.offset as u32);
+ "#,
+ ),
+ );
+
+ // Call/return.
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("call_id", &formats.call, 4).emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ {{PUT_OP}}(bits, BASE_REX, sink);
+ // The addend adjusts for the difference between the end of the
+ // instruction and the beginning of the immediate field.
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86CallPCRel4,
+ &func.dfg.ext_funcs[func_ref].name,
+ -4);
+ sink.put4(0);
+ sink.add_call_site(opcode, func.srclocs[inst]);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("call_plt_id", &formats.call, 4).emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ {{PUT_OP}}(bits, BASE_REX, sink);
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86CallPLTRel4,
+ &func.dfg.ext_funcs[func_ref].name,
+ -4);
+ sink.put4(0);
+ sink.add_call_site(opcode, func.srclocs[inst]);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("call_r", &formats.call_indirect, 1)
+ .operands_in(vec![gpr])
+ .emit(
+ r#"
+ sink.trap(TrapCode::StackOverflow, func.srclocs[inst]);
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ sink.add_call_site(opcode, func.srclocs[inst]);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("ret", &formats.multiary, 0)
+ .emit("{{PUT_OP}}(bits, BASE_REX, sink);"),
+ );
+
+ // Branches.
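+ //
+ // `branch_range((size, bits))` appears to declare a `bits`-wide branch displacement
+ // measured from the end of an instruction whose base size is `size` bytes (the
+ // REX/opcode addendum is added later by `Template::build()`).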
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("jmpb", &formats.jump, 1)
+ .branch_range((1, 8))
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, BASE_REX, sink);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("jmpd", &formats.jump, 4)
+ .branch_range((4, 32))
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, BASE_REX, sink);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("brib", &formats.branch_int, 1)
+ .operands_in(vec![reg_rflags])
+ .branch_range((1, 8))
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), BASE_REX, sink);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("brid", &formats.branch_int, 4)
+ .operands_in(vec![reg_rflags])
+ .branch_range((4, 32))
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), BASE_REX, sink);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("brfb", &formats.branch_float, 1)
+ .operands_in(vec![reg_rflags])
+ .branch_range((1, 8))
+ .clobbers_flags(false)
+ .inst_predicate(supported_floatccs_predicate(
+ &supported_floatccs,
+ &*formats.branch_float,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), BASE_REX, sink);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("brfd", &formats.branch_float, 4)
+ .operands_in(vec![reg_rflags])
+ .branch_range((4, 32))
+ .clobbers_flags(false)
+ .inst_predicate(supported_floatccs_predicate(
+ &supported_floatccs,
+ &*formats.branch_float,
+ ))
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), BASE_REX, sink);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("indirect_jmp", &formats.indirect_jump, 1)
+ .operands_in(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ "#,
+ ),
+ );
+
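+ // Jump table entry load: log2 of `imm` (via `trailing_zeros`) becomes the SIB
+ // scale, so `imm` must be a power of two; the `valid_scale` predicate enforces
+ // this.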
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("jt_entry", &formats.branch_table_entry, 2)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .inst_predicate(valid_scale(&*formats.branch_table_entry))
+ .compute_size("size_plus_maybe_offset_for_inreg_1")
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex3(in_reg1, out_reg0, in_reg0), sink);
+ if needs_offset(in_reg1) {
+ modrm_sib_disp8(out_reg0, sink);
+ sib(imm.trailing_zeros() as u8, in_reg0, in_reg1, sink);
+ sink.put1(0);
+ } else {
+ modrm_sib(out_reg0, sink);
+ sib(imm.trailing_zeros() as u8, in_reg0, in_reg1, sink);
+ }
+ "#,
+ ),
+ );
+
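+ // `vconst`: load a vector constant from the constant pool through a RIP-relative
+ // address; `const_disp4` emits the 4-byte displacement to the constant.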
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("vconst", &formats.unary_const, 5)
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+ const_disp4(constant_handle, func, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_outreg0",
+ );
+
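+ // `vconst_optimized`: constants that can be materialized without a load, using
+ // an instruction whose source and destination are both `out_reg0` (presumably a
+ // zeroing or all-ones idiom such as pxor/pcmpeq, selected by the encoding bits).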
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("vconst_optimized", &formats.unary_const, 1)
+ .operands_out(vec![fpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+ modrm_rr(out_reg0, out_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_outreg0",
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("jt_base", &formats.branch_table_base, 5)
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+
+ // No reloc is needed here as the jump table is emitted directly after
+ // the function body.
+ jt_disp4(table, func, sink);
+ "#,
+ ),
+ );
+
+ // Test flags and set a register.
+ //
+ // These setCC instructions only set the low 8 bits, and they can only write ABCD registers
+ // without a REX prefix.
+ //
+ // Other instruction encodings accepting `b1` inputs have the same constraints and only look at
+ // the low 8 bits of the input register.
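+ //
+ // For example, with `cond == IntCC::Equal` the recipe below works out to
+ // `sete r/m8` (0F 94 /r), assuming `icc2opc(Equal)` is 0x4 as the hard-coded
+ // 0x0f, 0x94 pair in the `is_zero` recipe further down suggests.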
+
+ let seti = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("seti", &formats.int_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("seti_abcd", &formats.int_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![abcd])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(seti),
+ );
+
+ let setf = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("setf", &formats.float_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("setf_abcd", &formats.float_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![abcd])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(setf),
+ );
+
+ // Conditional move (a.k.a. integer select)
+ // (maybe-REX.W) 0F 4x modrm(r,r)
+ // A single modrm(r,r) byte follows the opcode.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("cmov", &formats.int_select, 1)
+ .operands_in(vec![
+ OperandConstraint::FixedReg(reg_rflags),
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::RegClass(gpr),
+ ])
+ .operands_out(vec![2])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex2(in_reg1, in_reg2), sink);
+ modrm_rr(in_reg1, in_reg2, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_cmov"),
+ );
+
+ // Bit scan forwards and reverse
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("bsf_and_bsr", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_outreg0"),
+ );
+
+ // Arithmetic with flag I/O.
+
+ // XX /r, MR form. Add two GPR registers, producing the carry flag.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rout", &formats.binary, 1)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![
+ OperandConstraint::TiedInput(0),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .clobbers_flags(true)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"),
+ );
+
+ // XX /r, MR form. Add two GPR registers, consuming the carry flag as input.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rin", &formats.ternary, 1)
+ .operands_in(vec![
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .operands_out(vec![0])
+ .clobbers_flags(true)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"),
+ );
+
+ // XX /r, MR form. Add two GPR registers, both consuming and producing the carry flag.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rio", &formats.ternary, 1)
+ .operands_in(vec![
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .operands_out(vec![
+ OperandConstraint::TiedInput(0),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .clobbers_flags(true)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"),
+ );
+
+ // Compare and set flags.
+
+ // XX /r, MR form. Compare two GPR registers and set flags.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rcmp", &formats.binary, 1)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![reg_rflags])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"),
+ );
+
+ // Same as rcmp, but the second operand is the stack pointer.
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("rcmp_sp", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![reg_rflags])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, RU::rsp.into()), sink);
+ modrm_rr(in_reg0, RU::rsp.into(), sink);
+ "#,
+ ),
+ );
+
+ // XX /r, RM form. Compare two FPR registers and set flags.
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("fcmp", &formats.binary, 1)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![reg_rflags])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
+ {
+ let has_small_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 8, 0);
+
+ // XX /n, MI form with imm8.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rcmp_ib", &formats.binary_imm64, 2)
+ .operands_in(vec![gpr])
+ .operands_out(vec![reg_rflags])
+ .inst_predicate(has_small_offset)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+
+ let has_big_offset =
+ InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 32, 0);
+
+ // XX /n, MI form with imm32.
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("rcmp_id", &formats.binary_imm64, 5)
+ .operands_in(vec![gpr])
+ .operands_out(vec![reg_rflags])
+ .inst_predicate(has_big_offset)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+ }
+
+ // Test-and-branch.
+ //
+ // This recipe represents the macro fusion of a test and a conditional branch.
+ // This serves two purposes:
+ //
+ // 1. Guarantee that the test and branch get scheduled next to each other so
+ // macro fusion remains possible.
+ // 2. Hide the status flags from Cranelift, which doesn't currently model flags.
+ //
+ // The encoding bits affect both the test and the branch instruction:
+ //
+ // Bits 0-7 are the Jcc opcode.
+ // Bits 8-15 control the test instruction, which always has opcode byte 0x85.
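+ //
+ // For example, assuming the low byte holds the short `jne` opcode (0x75), the
+ // core byte sequence of a `tjccb` emission (ignoring any REX prefix) is:
+ // 85 /r (test r, r), then 75 followed by a 1-byte displacement.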
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("tjccb", &formats.branch, 1 + 2)
+ .operands_in(vec![gpr])
+ .branch_range((3, 8))
+ .emit(
+ r#"
+ // test r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x85, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(bits as u8);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("tjccd", &formats.branch, 1 + 6)
+ .operands_in(vec![gpr])
+ .branch_range((7, 32))
+ .emit(
+ r#"
+ // test r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x85, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+
+ // 8-bit test-and-branch.
+
+ let t8jccb = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("t8jccb", &formats.branch, 1 + 2)
+ .operands_in(vec![gpr])
+ .branch_range((3, 8))
+ .emit(
+ r#"
+ // test8 r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(bits as u8);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("t8jccb_abcd", &formats.branch, 1 + 2)
+ .operands_in(vec![abcd])
+ .branch_range((3, 8))
+ .emit(
+ r#"
+ // test8 r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(bits as u8);
+ disp1(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(t8jccb),
+ );
+
+ let t8jccd = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("t8jccd", &formats.branch, 1 + 6)
+ .operands_in(vec![gpr])
+ .branch_range((7, 32))
+ .emit(
+ r#"
+ // test8 r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("t8jccd_abcd", &formats.branch, 1 + 6)
+ .operands_in(vec![abcd])
+ .branch_range((7, 32))
+ .emit(
+ r#"
+ // test8 r, r.
+ {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Jcc instruction.
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(t8jccd),
+ );
+
+ // Worst-case test-and-branch recipe for brz.b1 and brnz.b1 in 32-bit mode.
+ // The register allocator can't handle a branch instruction with constrained
+ // operands like the t8jccd_abcd above. This variant can accept the b1 operand in
+ // any register, but it is larger because it uses a 32-bit test instruction with
+ // a 0xff immediate.
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("t8jccd_long", &formats.branch, 5 + 6)
+ .operands_in(vec![gpr])
+ .branch_range((11, 32))
+ .emit(
+ r#"
+ // test32 r, 0xff.
+ {{PUT_OP}}((bits & 0xff00) | 0xf7, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ sink.put4(0xff);
+ // Jcc instruction.
+ sink.put1(0x0f);
+ sink.put1(bits as u8);
+ disp4(destination, func, sink);
+ "#,
+ ),
+ );
+
+ // Comparison that produces a `b1` result in a GPR.
+ //
+ // This is a macro of a `cmp` instruction followed by a `setCC` instruction.
+ //
+ // TODO This is not a great solution because:
+ //
+ //  - The cmp+setcc combination is not recognized by the CPU's macro fusion.
+ // - The 64-bit encoding has issues with REX prefixes. The `cmp` and `setCC`
+ // instructions may need a REX independently.
+ // - Modeling CPU flags in the type system would be better.
+ //
+ // Since the `setCC` instructions only write an 8-bit register, we use that as
+ // our `b1` representation: A `b1` value is represented as a GPR where the low 8
+ // bits are known to be 0 or 1. The high bits are undefined.
+ //
+ // This bandaid macro doesn't support a REX prefix for the final `setCC`
+ // instruction, so it is limited to the `ABCD` register class for booleans.
+ // The omission of a `when_prefixed` alternative is deliberate here.
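+ //
+ // Sketch of the emitted sequence for an equality compare: the `cmp` selected by
+ // `bits`, then 0F 94 /r (`sete`, i.e. 0x90 | icc2opc(Equal)) writing the low
+ // byte of an ABCD register.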
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("icscc", &formats.int_compare, 1 + 3)
+ .operands_in(vec![gpr, gpr])
+ .operands_out(vec![abcd])
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink);
+ modrm_rr(in_reg0, in_reg1, sink);
+ // `setCC` instruction, no REX.
+ let setcc = 0x90 | icc2opc(cond);
+ sink.put1(0x0f);
+ sink.put1(setcc as u8);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"),
+ );
+
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("icscc_fpr", &formats.int_compare, 1)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![0])
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+
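+ // Immediate forms of `icscc`: compare a GPR against an 8-bit or 32-bit signed
+ // immediate, then setCC into an ABCD register; the signed-range predicates below
+ // gate each form.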
+ {
+ let is_small_imm =
+ InstructionPredicate::new_is_signed_int(&*formats.int_compare_imm, "imm", 8, 0);
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("icscc_ib", &formats.int_compare_imm, 2 + 3)
+ .operands_in(vec![gpr])
+ .operands_out(vec![abcd])
+ .inst_predicate(is_small_imm)
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put1(imm as u8);
+ // `setCC` instruction, no REX.
+ let setcc = 0x90 | icc2opc(cond);
+ sink.put1(0x0f);
+ sink.put1(setcc as u8);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+
+ let is_big_imm =
+ InstructionPredicate::new_is_signed_int(&*formats.int_compare_imm, "imm", 32, 0);
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("icscc_id", &formats.int_compare_imm, 5 + 3)
+ .operands_in(vec![gpr])
+ .operands_out(vec![abcd])
+ .inst_predicate(is_big_imm)
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ let imm: i64 = imm.into();
+ sink.put4(imm as u32);
+ // `setCC` instruction, no REX.
+ let setcc = 0x90 | icc2opc(cond);
+ sink.put1(0x0f);
+ sink.put1(setcc as u8);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"),
+ );
+ }
+
+ // Comparison that produces a `b1` result in a GPR, like `icscc` above, but for
+ // floating point and restricted to the supported condition codes.
+ //
+ // The ucomiss/ucomisd instructions set the FLAGS bits ZF/PF/CF like this:
+ //
+ // ZPC OSA
+ // UN 111 000
+ // GT 000 000
+ // LT 001 000
+ // EQ 100 000
+ //
+ // Not all floating point condition codes are supported.
+ // The omission of a `when_prefixed` alternative is deliberate here.
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("fcscc", &formats.float_compare, 1 + 3)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![abcd])
+ .inst_predicate(supported_floatccs_predicate(
+ &supported_floatccs,
+ &*formats.float_compare,
+ ))
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ // `setCC` instruction, no REX.
+ use crate::ir::condcodes::FloatCC::*;
+ let setcc = match cond {
+ Ordered => 0x9b, // EQ|LT|GT => setnp (P=0)
+ Unordered => 0x9a, // UN => setp (P=1)
+ OrderedNotEqual => 0x95, // LT|GT => setne (Z=0),
+ UnorderedOrEqual => 0x94, // UN|EQ => sete (Z=1)
+ GreaterThan => 0x97, // GT => seta (C=0&Z=0)
+ GreaterThanOrEqual => 0x93, // GT|EQ => setae (C=0)
+ UnorderedOrLessThan => 0x92, // UN|LT => setb (C=1)
+ UnorderedOrLessThanOrEqual => 0x96, // UN|LT|EQ => setbe (Z=1|C=1)
+ Equal | // EQ
+ NotEqual | // UN|LT|GT
+ LessThan | // LT
+ LessThanOrEqual | // LT|EQ
+ UnorderedOrGreaterThan | // UN|GT
+ UnorderedOrGreaterThanOrEqual // UN|GT|EQ
+ => panic!("{} not supported by fcscc", cond),
+ };
+ sink.put1(0x0f);
+ sink.put1(setcc);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ );
+
+ {
+ let supported_floatccs: Vec<Literal> = ["eq", "lt", "le", "uno", "ne", "uge", "ugt", "ord"]
+ .iter()
+ .map(|name| Literal::enumerator_for(floatcc, name))
+ .collect();
+ recipes.add_template_inferred(
+ EncodingRecipeBuilder::new("pfcmp", &formats.float_compare, 2)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![0])
+ .inst_predicate(supported_floatccs_predicate(
+ &supported_floatccs[..],
+ &*formats.float_compare,
+ ))
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+ modrm_rr(in_reg1, in_reg0, sink);
+ // Add immediate byte indicating what type of comparison.
+ use crate::ir::condcodes::FloatCC::*;
+ let imm = match cond {
+ Equal => 0x00,
+ LessThan => 0x01,
+ LessThanOrEqual => 0x02,
+ Unordered => 0x03,
+ NotEqual => 0x04,
+ UnorderedOrGreaterThanOrEqual => 0x05,
+ UnorderedOrGreaterThan => 0x06,
+ Ordered => 0x07,
+ _ => panic!("{} not supported by pfcmp", cond),
+ };
+ sink.put1(imm);
+ "#,
+ ),
+ "size_with_inferred_rex_for_inreg0_inreg1",
+ );
+ }
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("is_zero", &formats.unary, 2 + 2)
+ .operands_in(vec![gpr])
+ .operands_out(vec![abcd])
+ .emit(
+ r#"
+ // Test instruction.
+ {{PUT_OP}}(bits, rex2(in_reg0, in_reg0), sink);
+ modrm_rr(in_reg0, in_reg0, sink);
+ // Check ZF = 1 flag to see if register holds 0.
+ sink.put1(0x0f);
+ sink.put1(0x94);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_template_recipe(
+ EncodingRecipeBuilder::new("is_invalid", &formats.unary, 2 + 3)
+ .operands_in(vec![gpr])
+ .operands_out(vec![abcd])
+ .emit(
+ r#"
+ // Comparison instruction.
+ {{PUT_OP}}(bits, rex1(in_reg0), sink);
+ modrm_r_bits(in_reg0, bits, sink);
+ sink.put1(0xff);
+ // `setCC` instruction, no REX.
+ use crate::ir::condcodes::IntCC::*;
+ let setcc = 0x90 | icc2opc(Equal);
+ sink.put1(0x0f);
+ sink.put1(setcc as u8);
+ modrm_rr(out_reg0, 0, sink);
+ "#,
+ ),
+ );
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("safepoint", &formats.multiary, 0).emit(
+ r#"
+ sink.add_stack_map(args, func, isa);
+ "#,
+ ),
+ );
+
+ // Both `elf_tls_get_addr` and `macho_tls_get_addr` require all caller-saved registers to be spilled.
+ // This is currently special cased in `regalloc/spilling.rs` in the `visit_inst` function.
+
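+ // The ELF general-dynamic TLS sequence below is padded with data16 prefixes to
+ // exactly 16 bytes (the recipe size), presumably so the linker can recognize and
+ // relax the access sequence.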
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("elf_tls_get_addr", &formats.unary_global_value, 16)
+ // FIXME Correct encoding for non-rax registers
+ .operands_out(vec![reg_rax])
+ .emit(
+ r#"
+ // output %rax
+ // clobbers %rdi
+
+ // Those data16 prefixes are necessary to pad to 16 bytes.
+
+ // data16 lea gv@tlsgd(%rip),%rdi
+ sink.put1(0x66); // data16
+ sink.put1(0b01001000); // rex.w
+ const LEA: u8 = 0x8d;
+ sink.put1(LEA); // lea
+ modrm_riprel(0b111/*out_reg0*/, sink); // 0x3d
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::ElfX86_64TlsGd,
+ &func.global_values[global_value].symbol_name(),
+ -4);
+ sink.put4(0);
+
+ // data16 data16 callq __tls_get_addr-4
+ sink.put1(0x66); // data16
+ sink.put1(0x66); // data16
+ sink.put1(0b01001000); // rex.w
+ sink.put1(0xe8); // call
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::X86CallPLTRel4,
+ &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
+ -4);
+ sink.put4(0);
+ "#,
+ ),
+ );
+
+ recipes.add_recipe(
+ EncodingRecipeBuilder::new("macho_tls_get_addr", &formats.unary_global_value, 9)
+ // FIXME Correct encoding for non-rax registers
+ .operands_out(vec![reg_rax])
+ .emit(
+ r#"
+ // output %rax
+ // clobbers %rdi
+
+ // movq gv@tlv(%rip), %rdi
+ sink.put1(0x48); // rex
+ sink.put1(0x8b); // mov
+ modrm_riprel(0b111/*out_reg0*/, sink); // 0x3d
+ sink.reloc_external(func.srclocs[inst],
+ Reloc::MachOX86_64Tlv,
+ &func.global_values[global_value].symbol_name(),
+ -4);
+ sink.put4(0);
+
+ // callq *(%rdi)
+ sink.put1(0xff);
+ sink.put1(0x17);
+ "#,
+ ),
+ );
+
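+ // EVEX-encoded recipes (AVX-512). Both variants use the 128-bit vector length
+ // with no masking; `RecipePrefixKind::Evex` selects EVEX prefix emission.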
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("evex_reg_vvvv_rm_128", &formats.binary, 1)
+ .operands_in(vec![fpr, fpr])
+ .operands_out(vec![fpr])
+ .emit(
+ r#"
+ // instruction encoding operands: reg (op1, w), vvvv (op2, r), rm (op3, r)
+ // this maps to: out_reg0, in_reg0, in_reg1
+ let context = EvexContext::Other { length: EvexVectorLength::V128 };
+ let masking = EvexMasking::None;
+ put_evex(bits, out_reg0, in_reg0, in_reg1, context, masking, sink); // params: reg, vvvv, rm
+ modrm_rr(in_reg1, out_reg0, sink); // params: rm, reg
+ "#,
+ ),
+ regs).rex_kind(RecipePrefixKind::Evex)
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1)
+ .operands_in(vec![fpr])
+ .operands_out(vec![fpr])
+ .emit(
+ r#"
+ // instruction encoding operands: reg (op1, w), rm (op2, r)
+ // this maps to: out_reg0, in_reg0
+ let context = EvexContext::Other { length: EvexVectorLength::V128 };
+ let masking = EvexMasking::None;
+ put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm
+ modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg
+ "#,
+ ),
+ regs).rex_kind(RecipePrefixKind::Evex)
+ );
+
+ recipes
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs
new file mode 100644
index 0000000000..85a8965f89
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs
@@ -0,0 +1,43 @@
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+
+pub(crate) fn define() -> IsaRegs {
+ let mut regs = IsaRegsBuilder::new();
+
+ let builder = RegBankBuilder::new("FloatRegs", "xmm")
+ .units(16)
+ .track_pressure(true);
+ let float_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("IntRegs", "r")
+ .units(16)
+ .names(vec!["rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"])
+ .track_pressure(true)
+ .pinned_reg(15);
+ let int_regs = regs.add_bank(builder);
+
+ let builder = RegBankBuilder::new("FlagRegs", "")
+ .units(1)
+ .names(vec!["rflags"])
+ .track_pressure(false);
+ let flag_reg = regs.add_bank(builder);
+
+ let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+ let gpr = regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FPR", float_regs);
+ let fpr = regs.add_class(builder);
+
+ let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
+ regs.add_class(builder);
+
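+ // `GPR8` is the first half of `GPR`: the eight registers encodable without a REX
+ // prefix. `ABCD` narrows that to rax/rcx/rdx/rbx, whose low bytes (al/cl/dl/bl)
+ // are byte-addressable without any REX prefix, which the setCC recipes rely on.
+ // `FPR8` likewise covers xmm0..xmm7.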
+ let builder = RegClassBuilder::subclass_of("GPR8", gpr, 0, 8);
+ let gpr8 = regs.add_class(builder);
+
+ let builder = RegClassBuilder::subclass_of("ABCD", gpr8, 0, 4);
+ regs.add_class(builder);
+
+ let builder = RegClassBuilder::subclass_of("FPR8", fpr, 0, 8);
+ regs.add_class(builder);
+
+ regs.build()
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs
new file mode 100644
index 0000000000..dddd69abb3
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs
@@ -0,0 +1,135 @@
+use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
+
+pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
+ let mut settings = SettingGroupBuilder::new("x86");
+
+ // CPUID.01H:ECX
+ let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
+ let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
+ let has_sse41 = settings.add_bool("has_sse41", "SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]", false);
+ let has_sse42 = settings.add_bool("has_sse42", "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]", false);
+ let has_avx = settings.add_bool("has_avx", "AVX: CPUID.01H:ECX.AVX[bit 28]", false);
+ let has_avx2 = settings.add_bool("has_avx2", "AVX2: CPUID.07H:EBX.AVX2[bit 5]", false);
+ let has_avx512dq = settings.add_bool(
+ "has_avx512dq",
+ "AVX512DQ: CPUID.07H:EBX.AVX512DQ[bit 17]",
+ false,
+ );
+ let has_avx512vl = settings.add_bool(
+ "has_avx512vl",
+ "AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
+ false,
+ );
+ let has_avx512f = settings.add_bool(
+ "has_avx512f",
+ "AVX512F: CPUID.07H:EBX.AVX512F[bit 16]",
+ false,
+ );
+ let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false);
+
+ // CPUID.(EAX=07H, ECX=0H):EBX
+ let has_bmi1 = settings.add_bool(
+ "has_bmi1",
+ "BMI1: CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]",
+ false,
+ );
+ let has_bmi2 = settings.add_bool(
+ "has_bmi2",
+ "BMI2: CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]",
+ false,
+ );
+
+ // CPUID.EAX=80000001H:ECX
+ let has_lzcnt = settings.add_bool(
+ "has_lzcnt",
+ "LZCNT: CPUID.EAX=80000001H:ECX.LZCNT[bit 5]",
+ false,
+ );
+
+ let shared_enable_simd = shared.get_bool("enable_simd");
+
+ settings.add_predicate("use_ssse3", predicate!(has_ssse3));
+ settings.add_predicate("use_sse41", predicate!(has_sse41));
+ settings.add_predicate("use_sse42", predicate!(has_sse41 && has_sse42));
+
+ settings.add_predicate(
+ "use_ssse3_simd",
+ predicate!(shared_enable_simd && has_ssse3),
+ );
+ settings.add_predicate(
+ "use_sse41_simd",
+ predicate!(shared_enable_simd && has_sse41),
+ );
+ settings.add_predicate(
+ "use_sse42_simd",
+ predicate!(shared_enable_simd && has_sse41 && has_sse42),
+ );
+
+ settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
+ settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2));
+ settings.add_predicate(
+ "use_avx512dq_simd",
+ predicate!(shared_enable_simd && has_avx512dq),
+ );
+ settings.add_predicate(
+ "use_avx512vl_simd",
+ predicate!(shared_enable_simd && has_avx512vl),
+ );
+ settings.add_predicate(
+ "use_avx512f_simd",
+ predicate!(shared_enable_simd && has_avx512f),
+ );
+
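+ // POPCNT arrived alongside SSE4.2 on Intel CPUs (Nehalem), so `use_popcnt` also
+ // requires `has_sse42`, presumably to guard against inconsistent flag settings.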
+ settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42));
+ settings.add_predicate("use_bmi1", predicate!(has_bmi1));
+ settings.add_predicate("use_lzcnt", predicate!(has_lzcnt));
+
+ // Some shared boolean values are used in x86 instruction predicates, so we need to group them
+ // in the same TargetIsa, for compatibility with code generated by meta-python.
+ // TODO Once all the meta generation code has been migrated from Python to Rust, we can put it
+ // back in the shared SettingGroup, and use it in x86 instruction predicates.
+
+ let is_pic = shared.get_bool("is_pic");
+ let emit_all_ones_funcaddrs = shared.get_bool("emit_all_ones_funcaddrs");
+ settings.add_predicate("is_pic", predicate!(is_pic));
+ settings.add_predicate("not_is_pic", predicate!(!is_pic));
+ settings.add_predicate(
+ "all_ones_funcaddrs_and_not_is_pic",
+ predicate!(emit_all_ones_funcaddrs && !is_pic),
+ );
+ settings.add_predicate(
+ "not_all_ones_funcaddrs_and_not_is_pic",
+ predicate!(!emit_all_ones_funcaddrs && !is_pic),
+ );
+
+ // Presets corresponding to x86 CPUs.
+
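+ // `baseline` is an empty preset: nothing beyond what x86-64 already guarantees
+ // (SSE2 and earlier).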
+ settings.add_preset("baseline", preset!());
+ let nehalem = settings.add_preset(
+ "nehalem",
+ preset!(has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt),
+ );
+ let haswell = settings.add_preset(
+ "haswell",
+ preset!(nehalem && has_bmi1 && has_bmi2 && has_lzcnt),
+ );
+ let broadwell = settings.add_preset("broadwell", preset!(haswell));
+ let skylake = settings.add_preset("skylake", preset!(broadwell));
+ let cannonlake = settings.add_preset("cannonlake", preset!(skylake));
+ settings.add_preset("icelake", preset!(cannonlake));
+ settings.add_preset(
+ "znver1",
+ preset!(
+ has_sse3
+ && has_ssse3
+ && has_sse41
+ && has_sse42
+ && has_popcnt
+ && has_bmi1
+ && has_bmi2
+ && has_lzcnt
+ ),
+ );
+
+ settings.build()
+}