| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
| commit | 2aa4a82499d4becd2284cdb482213d541b8804dd | |
| tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen-meta/src/isa | |
| parent | Initial commit. | |
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen-meta/src/isa')
14 files changed, 9788 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs
new file mode 100644
index 0000000000..f699ece8eb
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/arm32/mod.rs
@@ -0,0 +1,88 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::recipes::Recipes;
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder};
+
+use crate::shared::Definitions as SharedDefinitions;
+
+fn define_settings(_shared: &SettingGroup) -> SettingGroup {
+    let setting = SettingGroupBuilder::new("arm32");
+    setting.build()
+}
+
+fn define_regs() -> IsaRegs {
+    let mut regs = IsaRegsBuilder::new();
+
+    let builder = RegBankBuilder::new("FloatRegs", "s")
+        .units(64)
+        .track_pressure(true);
+    let float_regs = regs.add_bank(builder);
+
+    let builder = RegBankBuilder::new("IntRegs", "r")
+        .units(16)
+        .track_pressure(true);
+    let int_regs = regs.add_bank(builder);
+
+    let builder = RegBankBuilder::new("FlagRegs", "")
+        .units(1)
+        .names(vec!["nzcv"])
+        .track_pressure(false);
+    let flag_reg = regs.add_bank(builder);
+
+    let builder = RegClassBuilder::new_toplevel("S", float_regs).count(32);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("D", float_regs).width(2);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("Q", float_regs).width(4);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
+    regs.add_class(builder);
+
+    regs.build()
+}
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+    let settings = define_settings(&shared_defs.settings);
+    let regs = define_regs();
+
+    let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+
+    // CPU modes for 32-bit ARM and Thumb2.
+    let mut a32 = CpuMode::new("A32");
+    let mut t32 = CpuMode::new("T32");
+
+    // TODO refine these.
+    let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags");
+    a32.legalize_default(narrow_flags);
+    t32.legalize_default(narrow_flags);
+
+    // Make sure that the expand code is used, thus generated.
+    let expand = shared_defs.transform_groups.by_name("expand");
+    a32.legalize_monomorphic(expand);
+
+    let cpu_modes = vec![a32, t32];
+
+    // TODO implement arm32 recipes.
+    let recipes = Recipes::new();
+
+    // TODO implement arm32 encodings and predicates.
+    let encodings_predicates = InstructionPredicateMap::new();
+
+    TargetIsa::new(
+        "arm32",
+        inst_group,
+        settings,
+        regs,
+        recipes,
+        cpu_modes,
+        encodings_predicates,
+    )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs
new file mode 100644
index 0000000000..5d8bc76fc4
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/arm64/mod.rs
@@ -0,0 +1,79 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::recipes::Recipes;
+use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};
+use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder};
+
+use crate::shared::Definitions as SharedDefinitions;
+
+fn define_settings(_shared: &SettingGroup) -> SettingGroup {
+    let setting = SettingGroupBuilder::new("arm64");
+    setting.build()
+}
+
+fn define_registers() -> IsaRegs {
+    let mut regs = IsaRegsBuilder::new();
+
+    // The `x31` regunit serves as the stack pointer / zero register depending on context. We
+    // reserve it and don't model the difference.
+    let builder = RegBankBuilder::new("IntRegs", "x")
+        .units(32)
+        .track_pressure(true);
+    let int_regs = regs.add_bank(builder);
+
+    let builder = RegBankBuilder::new("FloatRegs", "v")
+        .units(32)
+        .track_pressure(true);
+    let float_regs = regs.add_bank(builder);
+
+    let builder = RegBankBuilder::new("FlagRegs", "")
+        .units(1)
+        .names(vec!["nzcv"])
+        .track_pressure(false);
+    let flag_reg = regs.add_bank(builder);
+
+    let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("FPR", float_regs);
+    regs.add_class(builder);
+
+    let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
+    regs.add_class(builder);
+
+    regs.build()
+}
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
+    let settings = define_settings(&shared_defs.settings);
+    let regs = define_registers();
+
+    let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+
+    let mut a64 = CpuMode::new("A64");
+
+    // TODO refine these.
+    let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
+    let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags");
+    a64.legalize_monomorphic(expand_flags);
+    a64.legalize_default(narrow_flags);
+
+    let cpu_modes = vec![a64];
+
+    // TODO implement arm64 recipes.
+    let recipes = Recipes::new();
+
+    // TODO implement arm64 encodings and predicates.
+    let encodings_predicates = InstructionPredicateMap::new();
+
+    TargetIsa::new(
+        "arm64",
+        inst_group,
+        settings,
+        regs,
+        recipes,
+        cpu_modes,
+        encodings_predicates,
+    )
+}
diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs
new file mode 100644
index 0000000000..ed8db85f0d
--- /dev/null
+++ b/third_party/rust/cranelift-codegen-meta/src/isa/mod.rs
@@ -0,0 +1,67 @@
+//! Define supported ISAs; includes ISA-specific instructions, encodings, registers, settings, etc.
+use crate::cdsl::isa::TargetIsa;
+use crate::shared::Definitions as SharedDefinitions;
+use std::fmt;
+
+mod arm32;
+mod arm64;
+mod riscv;
+pub(crate) mod x86;
+
+/// Represents known ISA target.
+#[derive(PartialEq, Copy, Clone)]
+pub enum Isa {
+    Riscv,
+    X86,
+    Arm32,
+    Arm64,
+}
+
+impl Isa {
+    /// Creates isa target using name.
+ pub fn from_name(name: &str) -> Option<Self> { + Isa::all() + .iter() + .cloned() + .find(|isa| isa.to_string() == name) + } + + /// Creates isa target from arch. + pub fn from_arch(arch: &str) -> Option<Self> { + match arch { + "riscv" => Some(Isa::Riscv), + "aarch64" => Some(Isa::Arm64), + x if ["x86_64", "i386", "i586", "i686"].contains(&x) => Some(Isa::X86), + x if x.starts_with("arm") || arch.starts_with("thumb") => Some(Isa::Arm32), + _ => None, + } + } + + /// Returns all supported isa targets. + pub fn all() -> &'static [Isa] { + &[Isa::Riscv, Isa::X86, Isa::Arm32, Isa::Arm64] + } +} + +impl fmt::Display for Isa { + // These names should be kept in sync with the crate features. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Isa::Riscv => write!(f, "riscv"), + Isa::X86 => write!(f, "x86"), + Isa::Arm32 => write!(f, "arm32"), + Isa::Arm64 => write!(f, "arm64"), + } + } +} + +pub(crate) fn define(isas: &[Isa], shared_defs: &mut SharedDefinitions) -> Vec<TargetIsa> { + isas.iter() + .map(|isa| match isa { + Isa::Riscv => riscv::define(shared_defs), + Isa::X86 => x86::define(shared_defs), + Isa::Arm32 => arm32::define(shared_defs), + Isa::Arm64 => arm64::define(shared_defs), + }) + .collect() +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs new file mode 100644 index 0000000000..c255ddb483 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/encodings.rs @@ -0,0 +1,431 @@ +use crate::cdsl::ast::{Apply, Expr, Literal, VarPool}; +use crate::cdsl::encodings::{Encoding, EncodingBuilder}; +use crate::cdsl::instructions::{ + Bindable, BoundInstruction, InstSpec, InstructionPredicateNode, InstructionPredicateRegistry, +}; +use crate::cdsl::recipes::{EncodingRecipeNumber, Recipes}; +use crate::cdsl::settings::SettingGroup; + +use crate::shared::types::Bool::B1; +use crate::shared::types::Float::{F32, F64}; +use crate::shared::types::Int::{I16, I32, I64, I8}; +use crate::shared::types::Reference::{R32, R64}; +use crate::shared::Definitions as SharedDefinitions; + +use super::recipes::RecipeGroup; + +pub(crate) struct PerCpuModeEncodings<'defs> { + pub inst_pred_reg: InstructionPredicateRegistry, + pub enc32: Vec<Encoding>, + pub enc64: Vec<Encoding>, + recipes: &'defs Recipes, +} + +impl<'defs> PerCpuModeEncodings<'defs> { + fn new(recipes: &'defs Recipes) -> Self { + Self { + inst_pred_reg: InstructionPredicateRegistry::new(), + enc32: Vec::new(), + enc64: Vec::new(), + recipes, + } + } + fn enc( + &self, + inst: impl Into<InstSpec>, + recipe: EncodingRecipeNumber, + bits: u16, + ) -> EncodingBuilder { + EncodingBuilder::new(inst.into(), recipe, bits) + } + fn add32(&mut self, encoding: EncodingBuilder) { + self.enc32 + .push(encoding.build(self.recipes, &mut self.inst_pred_reg)); + } + fn add64(&mut self, encoding: EncodingBuilder) { + self.enc64 + .push(encoding.build(self.recipes, &mut self.inst_pred_reg)); + } +} + +// The low 7 bits of a RISC-V instruction is the base opcode. All 32-bit instructions have 11 as +// the two low bits, with bits 6:2 determining the base opcode. +// +// Encbits for the 32-bit recipes are opcode[6:2] | (funct3 << 5) | ... +// The functions below encode the encbits. 
+ +fn load_bits(funct3: u16) -> u16 { + assert!(funct3 <= 0b111); + funct3 << 5 +} + +fn store_bits(funct3: u16) -> u16 { + assert!(funct3 <= 0b111); + 0b01000 | (funct3 << 5) +} + +fn branch_bits(funct3: u16) -> u16 { + assert!(funct3 <= 0b111); + 0b11000 | (funct3 << 5) +} + +fn jalr_bits() -> u16 { + // This was previously accepting an argument funct3 of 3 bits and used the following formula: + //0b11001 | (funct3 << 5) + 0b11001 +} + +fn jal_bits() -> u16 { + 0b11011 +} + +fn opimm_bits(funct3: u16, funct7: u16) -> u16 { + assert!(funct3 <= 0b111); + 0b00100 | (funct3 << 5) | (funct7 << 8) +} + +fn opimm32_bits(funct3: u16, funct7: u16) -> u16 { + assert!(funct3 <= 0b111); + 0b00110 | (funct3 << 5) | (funct7 << 8) +} + +fn op_bits(funct3: u16, funct7: u16) -> u16 { + assert!(funct3 <= 0b111); + assert!(funct7 <= 0b111_1111); + 0b01100 | (funct3 << 5) | (funct7 << 8) +} + +fn op32_bits(funct3: u16, funct7: u16) -> u16 { + assert!(funct3 <= 0b111); + assert!(funct7 <= 0b111_1111); + 0b01110 | (funct3 << 5) | (funct7 << 8) +} + +fn lui_bits() -> u16 { + 0b01101 +} + +pub(crate) fn define<'defs>( + shared_defs: &'defs SharedDefinitions, + isa_settings: &SettingGroup, + recipes: &'defs RecipeGroup, +) -> PerCpuModeEncodings<'defs> { + // Instructions shorthands. + let shared = &shared_defs.instructions; + + let band = shared.by_name("band"); + let band_imm = shared.by_name("band_imm"); + let bor = shared.by_name("bor"); + let bor_imm = shared.by_name("bor_imm"); + let br_icmp = shared.by_name("br_icmp"); + let brz = shared.by_name("brz"); + let brnz = shared.by_name("brnz"); + let bxor = shared.by_name("bxor"); + let bxor_imm = shared.by_name("bxor_imm"); + let call = shared.by_name("call"); + let call_indirect = shared.by_name("call_indirect"); + let copy = shared.by_name("copy"); + let copy_nop = shared.by_name("copy_nop"); + let copy_to_ssa = shared.by_name("copy_to_ssa"); + let fill = shared.by_name("fill"); + let fill_nop = shared.by_name("fill_nop"); + let iadd = shared.by_name("iadd"); + let iadd_imm = shared.by_name("iadd_imm"); + let iconst = shared.by_name("iconst"); + let icmp = shared.by_name("icmp"); + let icmp_imm = shared.by_name("icmp_imm"); + let imul = shared.by_name("imul"); + let ishl = shared.by_name("ishl"); + let ishl_imm = shared.by_name("ishl_imm"); + let isub = shared.by_name("isub"); + let jump = shared.by_name("jump"); + let regmove = shared.by_name("regmove"); + let spill = shared.by_name("spill"); + let sshr = shared.by_name("sshr"); + let sshr_imm = shared.by_name("sshr_imm"); + let ushr = shared.by_name("ushr"); + let ushr_imm = shared.by_name("ushr_imm"); + let return_ = shared.by_name("return"); + + // Recipes shorthands, prefixed with r_. 
+ let r_copytossa = recipes.by_name("copytossa"); + let r_fillnull = recipes.by_name("fillnull"); + let r_icall = recipes.by_name("Icall"); + let r_icopy = recipes.by_name("Icopy"); + let r_ii = recipes.by_name("Ii"); + let r_iicmp = recipes.by_name("Iicmp"); + let r_iret = recipes.by_name("Iret"); + let r_irmov = recipes.by_name("Irmov"); + let r_iz = recipes.by_name("Iz"); + let r_gp_sp = recipes.by_name("GPsp"); + let r_gp_fi = recipes.by_name("GPfi"); + let r_r = recipes.by_name("R"); + let r_ricmp = recipes.by_name("Ricmp"); + let r_rshamt = recipes.by_name("Rshamt"); + let r_sb = recipes.by_name("SB"); + let r_sb_zero = recipes.by_name("SBzero"); + let r_stacknull = recipes.by_name("stacknull"); + let r_u = recipes.by_name("U"); + let r_uj = recipes.by_name("UJ"); + let r_uj_call = recipes.by_name("UJcall"); + + // Predicates shorthands. + let use_m = isa_settings.predicate_by_name("use_m"); + + // Definitions. + let mut e = PerCpuModeEncodings::new(&recipes.recipes); + + // Basic arithmetic binary instructions are encoded in an R-type instruction. + for &(inst, inst_imm, f3, f7) in &[ + (iadd, Some(iadd_imm), 0b000, 0b000_0000), + (isub, None, 0b000, 0b010_0000), + (bxor, Some(bxor_imm), 0b100, 0b000_0000), + (bor, Some(bor_imm), 0b110, 0b000_0000), + (band, Some(band_imm), 0b111, 0b000_0000), + ] { + e.add32(e.enc(inst.bind(I32), r_r, op_bits(f3, f7))); + e.add64(e.enc(inst.bind(I64), r_r, op_bits(f3, f7))); + + // Immediate versions for add/xor/or/and. + if let Some(inst_imm) = inst_imm { + e.add32(e.enc(inst_imm.bind(I32), r_ii, opimm_bits(f3, 0))); + e.add64(e.enc(inst_imm.bind(I64), r_ii, opimm_bits(f3, 0))); + } + } + + // 32-bit ops in RV64. + e.add64(e.enc(iadd.bind(I32), r_r, op32_bits(0b000, 0b000_0000))); + e.add64(e.enc(isub.bind(I32), r_r, op32_bits(0b000, 0b010_0000))); + // There are no andiw/oriw/xoriw variations. + e.add64(e.enc(iadd_imm.bind(I32), r_ii, opimm32_bits(0b000, 0))); + + // Use iadd_imm with %x0 to materialize constants. + e.add32(e.enc(iconst.bind(I32), r_iz, opimm_bits(0b0, 0))); + e.add64(e.enc(iconst.bind(I32), r_iz, opimm_bits(0b0, 0))); + e.add64(e.enc(iconst.bind(I64), r_iz, opimm_bits(0b0, 0))); + + // Dynamic shifts have the same masking semantics as the clif base instructions. + for &(inst, inst_imm, f3, f7) in &[ + (ishl, ishl_imm, 0b1, 0b0), + (ushr, ushr_imm, 0b101, 0b0), + (sshr, sshr_imm, 0b101, 0b10_0000), + ] { + e.add32(e.enc(inst.bind(I32).bind(I32), r_r, op_bits(f3, f7))); + e.add64(e.enc(inst.bind(I64).bind(I64), r_r, op_bits(f3, f7))); + e.add64(e.enc(inst.bind(I32).bind(I32), r_r, op32_bits(f3, f7))); + // Allow i32 shift amounts in 64-bit shifts. + e.add64(e.enc(inst.bind(I64).bind(I32), r_r, op_bits(f3, f7))); + e.add64(e.enc(inst.bind(I32).bind(I64), r_r, op32_bits(f3, f7))); + + // Immediate shifts. + e.add32(e.enc(inst_imm.bind(I32), r_rshamt, opimm_bits(f3, f7))); + e.add64(e.enc(inst_imm.bind(I64), r_rshamt, opimm_bits(f3, f7))); + e.add64(e.enc(inst_imm.bind(I32), r_rshamt, opimm32_bits(f3, f7))); + } + + // Signed and unsigned integer 'less than'. There are no 'w' variants for comparing 32-bit + // numbers in RV64. + { + let mut var_pool = VarPool::new(); + + // Helper that creates an instruction predicate for an instruction in the icmp family. 
+ let mut icmp_instp = |bound_inst: &BoundInstruction, + intcc_field: &'static str| + -> InstructionPredicateNode { + let x = var_pool.create("x"); + let y = var_pool.create("y"); + let cc = Literal::enumerator_for(&shared_defs.imm.intcc, intcc_field); + Apply::new( + bound_inst.clone().into(), + vec![Expr::Literal(cc), Expr::Var(x), Expr::Var(y)], + ) + .inst_predicate(&var_pool) + .unwrap() + }; + + let icmp_i32 = icmp.bind(I32); + let icmp_i64 = icmp.bind(I64); + e.add32( + e.enc(icmp_i32.clone(), r_ricmp, op_bits(0b010, 0b000_0000)) + .inst_predicate(icmp_instp(&icmp_i32, "slt")), + ); + e.add64( + e.enc(icmp_i64.clone(), r_ricmp, op_bits(0b010, 0b000_0000)) + .inst_predicate(icmp_instp(&icmp_i64, "slt")), + ); + + e.add32( + e.enc(icmp_i32.clone(), r_ricmp, op_bits(0b011, 0b000_0000)) + .inst_predicate(icmp_instp(&icmp_i32, "ult")), + ); + e.add64( + e.enc(icmp_i64.clone(), r_ricmp, op_bits(0b011, 0b000_0000)) + .inst_predicate(icmp_instp(&icmp_i64, "ult")), + ); + + // Immediate variants. + let icmp_i32 = icmp_imm.bind(I32); + let icmp_i64 = icmp_imm.bind(I64); + e.add32( + e.enc(icmp_i32.clone(), r_iicmp, opimm_bits(0b010, 0)) + .inst_predicate(icmp_instp(&icmp_i32, "slt")), + ); + e.add64( + e.enc(icmp_i64.clone(), r_iicmp, opimm_bits(0b010, 0)) + .inst_predicate(icmp_instp(&icmp_i64, "slt")), + ); + + e.add32( + e.enc(icmp_i32.clone(), r_iicmp, opimm_bits(0b011, 0)) + .inst_predicate(icmp_instp(&icmp_i32, "ult")), + ); + e.add64( + e.enc(icmp_i64.clone(), r_iicmp, opimm_bits(0b011, 0)) + .inst_predicate(icmp_instp(&icmp_i64, "ult")), + ); + } + + // Integer constants with the low 12 bits clear are materialized by lui. + e.add32(e.enc(iconst.bind(I32), r_u, lui_bits())); + e.add64(e.enc(iconst.bind(I32), r_u, lui_bits())); + e.add64(e.enc(iconst.bind(I64), r_u, lui_bits())); + + // "M" Standard Extension for Integer Multiplication and Division. + // Gated by the `use_m` flag. + e.add32( + e.enc(imul.bind(I32), r_r, op_bits(0b000, 0b0000_0001)) + .isa_predicate(use_m), + ); + e.add64( + e.enc(imul.bind(I64), r_r, op_bits(0b000, 0b0000_0001)) + .isa_predicate(use_m), + ); + e.add64( + e.enc(imul.bind(I32), r_r, op32_bits(0b000, 0b0000_0001)) + .isa_predicate(use_m), + ); + + // Control flow. + + // Unconditional branches. + e.add32(e.enc(jump, r_uj, jal_bits())); + e.add64(e.enc(jump, r_uj, jal_bits())); + e.add32(e.enc(call, r_uj_call, jal_bits())); + e.add64(e.enc(call, r_uj_call, jal_bits())); + + // Conditional branches. + { + let mut var_pool = VarPool::new(); + + // Helper that creates an instruction predicate for an instruction in the icmp family. 
+ let mut br_icmp_instp = |bound_inst: &BoundInstruction, + intcc_field: &'static str| + -> InstructionPredicateNode { + let x = var_pool.create("x"); + let y = var_pool.create("y"); + let dest = var_pool.create("dest"); + let args = var_pool.create("args"); + let cc = Literal::enumerator_for(&shared_defs.imm.intcc, intcc_field); + Apply::new( + bound_inst.clone().into(), + vec![ + Expr::Literal(cc), + Expr::Var(x), + Expr::Var(y), + Expr::Var(dest), + Expr::Var(args), + ], + ) + .inst_predicate(&var_pool) + .unwrap() + }; + + let br_icmp_i32 = br_icmp.bind(I32); + let br_icmp_i64 = br_icmp.bind(I64); + for &(cond, f3) in &[ + ("eq", 0b000), + ("ne", 0b001), + ("slt", 0b100), + ("sge", 0b101), + ("ult", 0b110), + ("uge", 0b111), + ] { + e.add32( + e.enc(br_icmp_i32.clone(), r_sb, branch_bits(f3)) + .inst_predicate(br_icmp_instp(&br_icmp_i32, cond)), + ); + e.add64( + e.enc(br_icmp_i64.clone(), r_sb, branch_bits(f3)) + .inst_predicate(br_icmp_instp(&br_icmp_i64, cond)), + ); + } + } + + for &(inst, f3) in &[(brz, 0b000), (brnz, 0b001)] { + e.add32(e.enc(inst.bind(I32), r_sb_zero, branch_bits(f3))); + e.add64(e.enc(inst.bind(I64), r_sb_zero, branch_bits(f3))); + e.add32(e.enc(inst.bind(B1), r_sb_zero, branch_bits(f3))); + e.add64(e.enc(inst.bind(B1), r_sb_zero, branch_bits(f3))); + } + + // Returns are a special case of jalr_bits using %x1 to hold the return address. + // The return address is provided by a special-purpose `link` return value that + // is added by legalize_signature(). + e.add32(e.enc(return_, r_iret, jalr_bits())); + e.add64(e.enc(return_, r_iret, jalr_bits())); + e.add32(e.enc(call_indirect.bind(I32), r_icall, jalr_bits())); + e.add64(e.enc(call_indirect.bind(I64), r_icall, jalr_bits())); + + // Spill and fill. + e.add32(e.enc(spill.bind(I32), r_gp_sp, store_bits(0b010))); + e.add64(e.enc(spill.bind(I32), r_gp_sp, store_bits(0b010))); + e.add64(e.enc(spill.bind(I64), r_gp_sp, store_bits(0b011))); + e.add32(e.enc(fill.bind(I32), r_gp_fi, load_bits(0b010))); + e.add64(e.enc(fill.bind(I32), r_gp_fi, load_bits(0b010))); + e.add64(e.enc(fill.bind(I64), r_gp_fi, load_bits(0b011))); + + // No-op fills, created by late-stage redundant-fill removal. + for &ty in &[I64, I32] { + e.add64(e.enc(fill_nop.bind(ty), r_fillnull, 0)); + e.add32(e.enc(fill_nop.bind(ty), r_fillnull, 0)); + } + e.add64(e.enc(fill_nop.bind(B1), r_fillnull, 0)); + e.add32(e.enc(fill_nop.bind(B1), r_fillnull, 0)); + + // Register copies. + e.add32(e.enc(copy.bind(I32), r_icopy, opimm_bits(0b000, 0))); + e.add64(e.enc(copy.bind(I64), r_icopy, opimm_bits(0b000, 0))); + e.add64(e.enc(copy.bind(I32), r_icopy, opimm32_bits(0b000, 0))); + + e.add32(e.enc(regmove.bind(I32), r_irmov, opimm_bits(0b000, 0))); + e.add64(e.enc(regmove.bind(I64), r_irmov, opimm_bits(0b000, 0))); + e.add64(e.enc(regmove.bind(I32), r_irmov, opimm32_bits(0b000, 0))); + + e.add32(e.enc(copy.bind(B1), r_icopy, opimm_bits(0b000, 0))); + e.add64(e.enc(copy.bind(B1), r_icopy, opimm_bits(0b000, 0))); + e.add32(e.enc(regmove.bind(B1), r_irmov, opimm_bits(0b000, 0))); + e.add64(e.enc(regmove.bind(B1), r_irmov, opimm_bits(0b000, 0))); + + // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn + // into a no-op. + // The same encoding is generated for both the 64- and 32-bit architectures. 
+ for &ty in &[I64, I32, I16, I8] { + e.add32(e.enc(copy_nop.bind(ty), r_stacknull, 0)); + e.add64(e.enc(copy_nop.bind(ty), r_stacknull, 0)); + } + for &ty in &[F64, F32] { + e.add32(e.enc(copy_nop.bind(ty), r_stacknull, 0)); + e.add64(e.enc(copy_nop.bind(ty), r_stacknull, 0)); + } + + // Copy-to-SSA + e.add32(e.enc(copy_to_ssa.bind(I32), r_copytossa, opimm_bits(0b000, 0))); + e.add64(e.enc(copy_to_ssa.bind(I64), r_copytossa, opimm_bits(0b000, 0))); + e.add64(e.enc(copy_to_ssa.bind(I32), r_copytossa, opimm32_bits(0b000, 0))); + e.add32(e.enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0))); + e.add64(e.enc(copy_to_ssa.bind(B1), r_copytossa, opimm_bits(0b000, 0))); + e.add32(e.enc(copy_to_ssa.bind(R32), r_copytossa, opimm_bits(0b000, 0))); + e.add64(e.enc(copy_to_ssa.bind(R64), r_copytossa, opimm_bits(0b000, 0))); + + e +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs new file mode 100644 index 0000000000..801e61a3d2 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/mod.rs @@ -0,0 +1,134 @@ +use crate::cdsl::cpu_modes::CpuMode; +use crate::cdsl::instructions::InstructionGroupBuilder; +use crate::cdsl::isa::TargetIsa; +use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder}; +use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder}; + +use crate::shared::types::Float::{F32, F64}; +use crate::shared::types::Int::{I32, I64}; +use crate::shared::Definitions as SharedDefinitions; + +mod encodings; +mod recipes; + +fn define_settings(shared: &SettingGroup) -> SettingGroup { + let mut setting = SettingGroupBuilder::new("riscv"); + + let supports_m = setting.add_bool( + "supports_m", + "CPU supports the 'M' extension (mul/div)", + false, + ); + let supports_a = setting.add_bool( + "supports_a", + "CPU supports the 'A' extension (atomics)", + false, + ); + let supports_f = setting.add_bool( + "supports_f", + "CPU supports the 'F' extension (float)", + false, + ); + let supports_d = setting.add_bool( + "supports_d", + "CPU supports the 'D' extension (double)", + false, + ); + + let enable_m = setting.add_bool( + "enable_m", + "Enable the use of 'M' instructions if available", + true, + ); + + setting.add_bool( + "enable_e", + "Enable the 'RV32E' instruction set with only 16 registers", + false, + ); + + let shared_enable_atomics = shared.get_bool("enable_atomics"); + let shared_enable_float = shared.get_bool("enable_float"); + let shared_enable_simd = shared.get_bool("enable_simd"); + + setting.add_predicate("use_m", predicate!(supports_m && enable_m)); + setting.add_predicate("use_a", predicate!(supports_a && shared_enable_atomics)); + setting.add_predicate("use_f", predicate!(supports_f && shared_enable_float)); + setting.add_predicate("use_d", predicate!(supports_d && shared_enable_float)); + setting.add_predicate( + "full_float", + predicate!(shared_enable_simd && supports_f && supports_d), + ); + + setting.build() +} + +fn define_registers() -> IsaRegs { + let mut regs = IsaRegsBuilder::new(); + + let builder = RegBankBuilder::new("IntRegs", "x") + .units(32) + .track_pressure(true); + let int_regs = regs.add_bank(builder); + + let builder = RegBankBuilder::new("FloatRegs", "f") + .units(32) + .track_pressure(true); + let float_regs = regs.add_bank(builder); + + let builder = RegClassBuilder::new_toplevel("GPR", int_regs); + regs.add_class(builder); + + let builder = RegClassBuilder::new_toplevel("FPR", float_regs); + 
regs.add_class(builder); + + regs.build() +} + +pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { + let settings = define_settings(&shared_defs.settings); + let regs = define_registers(); + + let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build(); + + // CPU modes for 32-bit and 64-bit operation. + let mut rv_32 = CpuMode::new("RV32"); + let mut rv_64 = CpuMode::new("RV64"); + + let expand = shared_defs.transform_groups.by_name("expand"); + let narrow_no_flags = shared_defs.transform_groups.by_name("narrow_no_flags"); + + rv_32.legalize_monomorphic(expand); + rv_32.legalize_default(narrow_no_flags); + rv_32.legalize_type(I32, expand); + rv_32.legalize_type(F32, expand); + rv_32.legalize_type(F64, expand); + + rv_64.legalize_monomorphic(expand); + rv_64.legalize_default(narrow_no_flags); + rv_64.legalize_type(I32, expand); + rv_64.legalize_type(I64, expand); + rv_64.legalize_type(F32, expand); + rv_64.legalize_type(F64, expand); + + let recipes = recipes::define(shared_defs, ®s); + + let encodings = encodings::define(shared_defs, &settings, &recipes); + rv_32.set_encodings(encodings.enc32); + rv_64.set_encodings(encodings.enc64); + let encodings_predicates = encodings.inst_pred_reg.extract(); + + let recipes = recipes.collect(); + + let cpu_modes = vec![rv_32, rv_64]; + + TargetIsa::new( + "riscv", + inst_group, + settings, + regs, + recipes, + cpu_modes, + encodings_predicates, + ) +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs new file mode 100644 index 0000000000..47acdbb042 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/riscv/recipes.rs @@ -0,0 +1,279 @@ +use std::collections::HashMap; + +use crate::cdsl::instructions::InstructionPredicate; +use crate::cdsl::recipes::{EncodingRecipeBuilder, EncodingRecipeNumber, Recipes, Stack}; +use crate::cdsl::regs::IsaRegs; +use crate::shared::Definitions as SharedDefinitions; + +/// An helper to create recipes and use them when defining the RISCV encodings. +pub(crate) struct RecipeGroup { + /// The actualy list of recipes explicitly created in this file. + pub recipes: Recipes, + + /// Provides fast lookup from a name to an encoding recipe. + name_to_recipe: HashMap<String, EncodingRecipeNumber>, +} + +impl RecipeGroup { + fn new() -> Self { + Self { + recipes: Recipes::new(), + name_to_recipe: HashMap::new(), + } + } + + fn push(&mut self, builder: EncodingRecipeBuilder) { + assert!( + self.name_to_recipe.get(&builder.name).is_none(), + format!("riscv recipe '{}' created twice", builder.name) + ); + let name = builder.name.clone(); + let number = self.recipes.push(builder.build()); + self.name_to_recipe.insert(name, number); + } + + pub fn by_name(&self, name: &str) -> EncodingRecipeNumber { + *self + .name_to_recipe + .get(name) + .unwrap_or_else(|| panic!("unknown riscv recipe name {}", name)) + } + + pub fn collect(self) -> Recipes { + self.recipes + } +} + +pub(crate) fn define(shared_defs: &SharedDefinitions, regs: &IsaRegs) -> RecipeGroup { + let formats = &shared_defs.formats; + + // Register classes shorthands. + let gpr = regs.class_by_name("GPR"); + + // Definitions. + let mut recipes = RecipeGroup::new(); + + // R-type 32-bit instructions: These are mostly binary arithmetic instructions. 
+ // The encbits are `opcode[6:2] | (funct3 << 5) | (funct7 << 8) + recipes.push( + EncodingRecipeBuilder::new("R", &formats.binary, 4) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .emit("put_r(bits, in_reg0, in_reg1, out_reg0, sink);"), + ); + + // R-type with an immediate shift amount instead of rs2. + recipes.push( + EncodingRecipeBuilder::new("Rshamt", &formats.binary_imm64, 4) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .emit("put_rshamt(bits, in_reg0, imm.into(), out_reg0, sink);"), + ); + + // R-type encoding of an integer comparison. + recipes.push( + EncodingRecipeBuilder::new("Ricmp", &formats.int_compare, 4) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .emit("put_r(bits, in_reg0, in_reg1, out_reg0, sink);"), + ); + + recipes.push( + EncodingRecipeBuilder::new("Ii", &formats.binary_imm64, 4) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.binary_imm64, + "imm", + 12, + 0, + )) + .emit("put_i(bits, in_reg0, imm.into(), out_reg0, sink);"), + ); + + // I-type instruction with a hardcoded %x0 rs1. + recipes.push( + EncodingRecipeBuilder::new("Iz", &formats.unary_imm, 4) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &formats.unary_imm, + "imm", + 12, + 0, + )) + .emit("put_i(bits, 0, imm.into(), out_reg0, sink);"), + ); + + // I-type encoding of an integer comparison. + recipes.push( + EncodingRecipeBuilder::new("Iicmp", &formats.int_compare_imm, 4) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &formats.int_compare_imm, + "imm", + 12, + 0, + )) + .emit("put_i(bits, in_reg0, imm.into(), out_reg0, sink);"), + ); + + // I-type encoding for `jalr` as a return instruction. We won't use the immediate offset. The + // variable return values are not encoded. + recipes.push( + EncodingRecipeBuilder::new("Iret", &formats.multiary, 4).emit( + r#" + // Return instructions are always a jalr to %x1. + // The return address is provided as a special-purpose link argument. + put_i( + bits, + 1, // rs1 = %x1 + 0, // no offset. + 0, // rd = %x0: no address written. + sink, + ); + "#, + ), + ); + + // I-type encoding for `jalr` as a call_indirect. + recipes.push( + EncodingRecipeBuilder::new("Icall", &formats.call_indirect, 4) + .operands_in(vec![gpr]) + .emit( + r#" + // call_indirect instructions are jalr with rd=%x1. + put_i( + bits, + in_reg0, + 0, // no offset. + 1, // rd = %x1: link register. + sink, + ); + "#, + ), + ); + + // Copy of a GPR is implemented as addi x, 0. + recipes.push( + EncodingRecipeBuilder::new("Icopy", &formats.unary, 4) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .emit("put_i(bits, in_reg0, 0, out_reg0, sink);"), + ); + + // Same for a GPR regmove. + recipes.push( + EncodingRecipeBuilder::new("Irmov", &formats.reg_move, 4) + .operands_in(vec![gpr]) + .emit("put_i(bits, src, 0, dst, sink);"), + ); + + // Same for copy-to-SSA -- GPR regmove. + recipes.push( + EncodingRecipeBuilder::new("copytossa", &formats.copy_to_ssa, 4) + // No operands_in to mention, because a source register is specified directly. + .operands_out(vec![gpr]) + .emit("put_i(bits, src, 0, out_reg0, sink);"), + ); + + // U-type instructions have a 20-bit immediate that targets bits 12-31. 
+ recipes.push( + EncodingRecipeBuilder::new("U", &formats.unary_imm, 4) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &formats.unary_imm, + "imm", + 32, + 12, + )) + .emit("put_u(bits, imm.into(), out_reg0, sink);"), + ); + + // UJ-type unconditional branch instructions. + recipes.push( + EncodingRecipeBuilder::new("UJ", &formats.jump, 4) + .branch_range((0, 21)) + .emit( + r#" + let dest = i64::from(func.offsets[destination]); + let disp = dest - i64::from(sink.offset()); + put_uj(bits, disp, 0, sink); + "#, + ), + ); + + recipes.push(EncodingRecipeBuilder::new("UJcall", &formats.call, 4).emit( + r#" + sink.reloc_external(func.srclocs[inst], + Reloc::RiscvCall, + &func.dfg.ext_funcs[func_ref].name, + 0); + // rd=%x1 is the standard link register. + put_uj(bits, 0, 1, sink); + "#, + )); + + // SB-type branch instructions. + recipes.push( + EncodingRecipeBuilder::new("SB", &formats.branch_icmp, 4) + .operands_in(vec![gpr, gpr]) + .branch_range((0, 13)) + .emit( + r#" + let dest = i64::from(func.offsets[destination]); + let disp = dest - i64::from(sink.offset()); + put_sb(bits, disp, in_reg0, in_reg1, sink); + "#, + ), + ); + + // SB-type branch instruction with rs2 fixed to zero. + recipes.push( + EncodingRecipeBuilder::new("SBzero", &formats.branch, 4) + .operands_in(vec![gpr]) + .branch_range((0, 13)) + .emit( + r#" + let dest = i64::from(func.offsets[destination]); + let disp = dest - i64::from(sink.offset()); + put_sb(bits, disp, in_reg0, 0, sink); + "#, + ), + ); + + // Spill of a GPR. + recipes.push( + EncodingRecipeBuilder::new("GPsp", &formats.unary, 4) + .operands_in(vec![gpr]) + .operands_out(vec![Stack::new(gpr)]) + .emit("unimplemented!();"), + ); + + // Fill of a GPR. + recipes.push( + EncodingRecipeBuilder::new("GPfi", &formats.unary, 4) + .operands_in(vec![Stack::new(gpr)]) + .operands_out(vec![gpr]) + .emit("unimplemented!();"), + ); + + // Stack-slot to same stack-slot copy, which is guaranteed to turn into a no-op. + recipes.push( + EncodingRecipeBuilder::new("stacknull", &formats.unary, 0) + .operands_in(vec![Stack::new(gpr)]) + .operands_out(vec![Stack::new(gpr)]) + .emit(""), + ); + + // No-op fills, created by late-stage redundant-fill removal. 
+ recipes.push( + EncodingRecipeBuilder::new("fillnull", &formats.unary, 0) + .operands_in(vec![Stack::new(gpr)]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit(""), + ); + + recipes +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs new file mode 100644 index 0000000000..9ee12656c0 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/encodings.rs @@ -0,0 +1,2726 @@ +#![allow(non_snake_case)] + +use cranelift_codegen_shared::condcodes::IntCC; +use std::collections::HashMap; + +use crate::cdsl::encodings::{Encoding, EncodingBuilder}; +use crate::cdsl::instructions::{ + vector, Bindable, Immediate, InstSpec, Instruction, InstructionGroup, InstructionPredicate, + InstructionPredicateNode, InstructionPredicateRegistry, +}; +use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes}; +use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber}; +use crate::cdsl::types::{LaneType, ValueType}; +use crate::shared::types::Bool::{B1, B16, B32, B64, B8}; +use crate::shared::types::Float::{F32, F64}; +use crate::shared::types::Int::{I16, I32, I64, I8}; +use crate::shared::types::Reference::{R32, R64}; +use crate::shared::Definitions as SharedDefinitions; + +use crate::isa::x86::opcodes::*; + +use super::recipes::{RecipeGroup, Template}; +use crate::cdsl::instructions::BindParameter::Any; + +pub(crate) struct PerCpuModeEncodings { + pub enc32: Vec<Encoding>, + pub enc64: Vec<Encoding>, + pub recipes: Recipes, + recipes_by_name: HashMap<String, EncodingRecipeNumber>, + pub inst_pred_reg: InstructionPredicateRegistry, +} + +impl PerCpuModeEncodings { + fn new() -> Self { + Self { + enc32: Vec::new(), + enc64: Vec::new(), + recipes: Recipes::new(), + recipes_by_name: HashMap::new(), + inst_pred_reg: InstructionPredicateRegistry::new(), + } + } + + fn add_recipe(&mut self, recipe: EncodingRecipe) -> EncodingRecipeNumber { + if let Some(found_index) = self.recipes_by_name.get(&recipe.name) { + assert!( + self.recipes[*found_index] == recipe, + format!( + "trying to insert different recipes with a same name ({})", + recipe.name + ) + ); + *found_index + } else { + let recipe_name = recipe.name.clone(); + let index = self.recipes.push(recipe); + self.recipes_by_name.insert(recipe_name, index); + index + } + } + + fn make_encoding<T>( + &mut self, + inst: InstSpec, + template: Template, + builder_closure: T, + ) -> Encoding + where + T: FnOnce(EncodingBuilder) -> EncodingBuilder, + { + let (recipe, bits) = template.build(); + let recipe_number = self.add_recipe(recipe); + let builder = EncodingBuilder::new(inst, recipe_number, bits); + builder_closure(builder).build(&self.recipes, &mut self.inst_pred_reg) + } + + fn enc32_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T) + where + T: FnOnce(EncodingBuilder) -> EncodingBuilder, + { + let encoding = self.make_encoding(inst.into(), template, builder_closure); + self.enc32.push(encoding); + } + fn enc32(&mut self, inst: impl Into<InstSpec>, template: Template) { + self.enc32_func(inst, template, |x| x); + } + fn enc32_isap( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + isap: SettingPredicateNumber, + ) { + self.enc32_func(inst, template, |encoding| encoding.isa_predicate(isap)); + } + fn enc32_instp( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + instp: InstructionPredicateNode, + ) { + self.enc32_func(inst, template, |encoding| 
encoding.inst_predicate(instp)); + } + fn enc32_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) { + let recipe_number = self.add_recipe(recipe.clone()); + let builder = EncodingBuilder::new(inst.into(), recipe_number, bits); + let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg); + self.enc32.push(encoding); + } + + fn enc64_func<T>(&mut self, inst: impl Into<InstSpec>, template: Template, builder_closure: T) + where + T: FnOnce(EncodingBuilder) -> EncodingBuilder, + { + let encoding = self.make_encoding(inst.into(), template, builder_closure); + self.enc64.push(encoding); + } + fn enc64(&mut self, inst: impl Into<InstSpec>, template: Template) { + self.enc64_func(inst, template, |x| x); + } + fn enc64_isap( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + isap: SettingPredicateNumber, + ) { + self.enc64_func(inst, template, |encoding| encoding.isa_predicate(isap)); + } + fn enc64_instp( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + instp: InstructionPredicateNode, + ) { + self.enc64_func(inst, template, |encoding| encoding.inst_predicate(instp)); + } + fn enc64_rec(&mut self, inst: impl Into<InstSpec>, recipe: &EncodingRecipe, bits: u16) { + let recipe_number = self.add_recipe(recipe.clone()); + let builder = EncodingBuilder::new(inst.into(), recipe_number, bits); + let encoding = builder.build(&self.recipes, &mut self.inst_pred_reg); + self.enc64.push(encoding); + } + + /// Adds I32/I64 encodings as appropriate for a typed instruction. + /// The REX prefix is always inferred at runtime. + /// + /// Add encodings for `inst.i32` to X86_32. + /// Add encodings for `inst.i32` to X86_64 with optional, inferred REX. + /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix. + fn enc_i32_i64(&mut self, inst: impl Into<InstSpec>, template: Template) { + let inst: InstSpec = inst.into(); + + // I32 on x86: no REX prefix. + self.enc32(inst.bind(I32), template.infer_rex()); + + // I32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers. + self.enc64(inst.bind(I32), template.infer_rex()); + + // I64 on x86_64: REX.W set; REX.RXB determined at runtime from registers. + self.enc64(inst.bind(I64), template.rex().w()); + } + + /// Adds I32/I64 encodings as appropriate for a typed instruction. + /// All variants of REX prefix are explicitly emitted, not inferred. + /// + /// Add encodings for `inst.i32` to X86_32. + /// Add encodings for `inst.i32` to X86_64 with and without REX. + /// Add encodings for `inst.i64` to X86_64 with and without REX. + fn enc_i32_i64_explicit_rex(&mut self, inst: impl Into<InstSpec>, template: Template) { + let inst: InstSpec = inst.into(); + self.enc32(inst.bind(I32), template.nonrex()); + + // REX-less encoding must come after REX encoding so we don't use it by default. + // Otherwise reg-alloc would never use r8 and up. + self.enc64(inst.bind(I32), template.rex()); + self.enc64(inst.bind(I32), template.nonrex()); + self.enc64(inst.bind(I64), template.rex().w()); + } + + /// Adds B32/B64 encodings as appropriate for a typed instruction. + /// The REX prefix is always inferred at runtime. + /// + /// Adds encoding for `inst.b32` to X86_32. + /// Adds encoding for `inst.b32` to X86_64 with optional, inferred REX. + /// Adds encoding for `inst.b64` to X86_64 with a REX.W prefix. + fn enc_b32_b64(&mut self, inst: impl Into<InstSpec>, template: Template) { + let inst: InstSpec = inst.into(); + + // B32 on x86: no REX prefix. 
+ self.enc32(inst.bind(B32), template.infer_rex()); + + // B32 on x86_64: REX.W unset; REX.RXB determined at runtime from registers. + self.enc64(inst.bind(B32), template.infer_rex()); + + // B64 on x86_64: REX.W set; REX.RXB determined at runtime from registers. + self.enc64(inst.bind(B64), template.rex().w()); + } + + /// Add encodings for `inst.i32` to X86_32. + /// Add encodings for `inst.i32` to X86_64 with a REX prefix. + /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix. + fn enc_i32_i64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) { + let inst: InstSpec = inst.into(); + self.enc32(inst.bind(I32), template.nonrex()); + self.enc64(inst.bind(I32), template.rex()); + self.enc64(inst.bind(I64), template.rex().w()); + } + + /// Add encodings for `inst.i32` to X86_32. + /// Add encodings for `inst.i32` to X86_64 with and without REX. + /// Add encodings for `inst.i64` to X86_64 with a REX.W prefix. + fn enc_i32_i64_instp( + &mut self, + inst: &Instruction, + template: Template, + instp: InstructionPredicateNode, + ) { + self.enc32_func(inst.bind(I32), template.nonrex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + + // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise + // reg-alloc would never use r8 and up. + self.enc64_func(inst.bind(I32), template.rex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + self.enc64_func(inst.bind(I32), template.nonrex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + self.enc64_func(inst.bind(I64), template.rex().w(), |builder| { + builder.inst_predicate(instp) + }); + } + + /// Add encodings for `inst.r32` to X86_32. + /// Add encodings for `inst.r32` to X86_64 with and without REX. + /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix. + fn enc_r32_r64_instp( + &mut self, + inst: &Instruction, + template: Template, + instp: InstructionPredicateNode, + ) { + self.enc32_func(inst.bind(R32), template.nonrex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + + // REX-less encoding must come after REX encoding so we don't use it by default. Otherwise + // reg-alloc would never use r8 and up. + self.enc64_func(inst.bind(R32), template.rex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + self.enc64_func(inst.bind(R32), template.nonrex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + self.enc64_func(inst.bind(R64), template.rex().w(), |builder| { + builder.inst_predicate(instp) + }); + } + + /// Add encodings for `inst.r32` to X86_32. + /// Add encodings for `inst.r64` to X86_64 with a REX.W prefix. + fn enc_r32_r64_rex_only(&mut self, inst: impl Into<InstSpec>, template: Template) { + let inst: InstSpec = inst.into(); + self.enc32(inst.bind(R32), template.nonrex()); + self.enc64(inst.bind(R64), template.rex().w()); + } + + fn enc_r32_r64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) { + self.enc32(inst.clone().bind(R32).bind(Any), template.clone()); + + // REX-less encoding must come after REX encoding so we don't use it by + // default. Otherwise reg-alloc would never use r8 and up. 
+ self.enc64(inst.clone().bind(R32).bind(Any), template.clone().rex()); + self.enc64(inst.clone().bind(R32).bind(Any), template.clone()); + + if w_bit { + self.enc64(inst.clone().bind(R64).bind(Any), template.rex().w()); + } else { + self.enc64(inst.clone().bind(R64).bind(Any), template.clone().rex()); + self.enc64(inst.clone().bind(R64).bind(Any), template); + } + } + + /// Add encodings for `inst` to X86_64 with and without a REX prefix. + fn enc_x86_64(&mut self, inst: impl Into<InstSpec> + Clone, template: Template) { + // See above comment about the ordering of rex vs non-rex encodings. + self.enc64(inst.clone(), template.rex()); + self.enc64(inst, template); + } + + /// Add encodings for `inst` to X86_64 with and without a REX prefix. + fn enc_x86_64_instp( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + instp: InstructionPredicateNode, + ) { + // See above comment about the ordering of rex vs non-rex encodings. + self.enc64_func(inst.clone(), template.rex(), |builder| { + builder.inst_predicate(instp.clone()) + }); + self.enc64_func(inst, template, |builder| builder.inst_predicate(instp)); + } + fn enc_x86_64_isap( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + isap: SettingPredicateNumber, + ) { + // See above comment about the ordering of rex vs non-rex encodings. + self.enc64_isap(inst.clone(), template.rex(), isap); + self.enc64_isap(inst, template, isap); + } + + /// Add all three encodings for `inst`: + /// - X86_32 + /// - X86_64 with and without the REX prefix. + fn enc_both(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) { + self.enc32(inst.clone(), template.clone()); + self.enc_x86_64(inst, template); + } + fn enc_both_isap( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + isap: SettingPredicateNumber, + ) { + self.enc32_isap(inst.clone(), template.clone(), isap); + self.enc_x86_64_isap(inst, template, isap); + } + fn enc_both_instp( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + instp: InstructionPredicateNode, + ) { + self.enc32_instp(inst.clone(), template.clone(), instp.clone()); + self.enc_x86_64_instp(inst, template, instp); + } + + /// Add two encodings for `inst`: + /// - X86_32, no REX prefix, since this is not valid in 32-bit mode. + /// - X86_64, dynamically infer the REX prefix. + fn enc_both_inferred(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) { + self.enc32(inst.clone(), template.clone()); + self.enc64(inst, template.infer_rex()); + } + fn enc_both_inferred_maybe_isap( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + self.enc32_maybe_isap(inst.clone(), template.clone(), isap); + self.enc64_maybe_isap(inst, template.infer_rex(), isap); + } + + /// Add two encodings for `inst`: + /// - X86_32 + /// - X86_64 with the REX prefix. + fn enc_both_rex_only(&mut self, inst: impl Clone + Into<InstSpec>, template: Template) { + self.enc32(inst.clone(), template.clone()); + self.enc64(inst, template.rex()); + } + + /// Add encodings for `inst.i32` to X86_32. + /// Add encodings for `inst.i32` to X86_64 with and without REX. + /// Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit` + /// argument to determine whether or not to set the REX.W bit. 
+ fn enc_i32_i64_ld_st(&mut self, inst: &Instruction, w_bit: bool, template: Template) { + self.enc32(inst.clone().bind(I32).bind(Any), template.clone()); + + // REX-less encoding must come after REX encoding so we don't use it by + // default. Otherwise reg-alloc would never use r8 and up. + self.enc64(inst.clone().bind(I32).bind(Any), template.clone().rex()); + self.enc64(inst.clone().bind(I32).bind(Any), template.clone()); + + if w_bit { + self.enc64(inst.clone().bind(I64).bind(Any), template.rex().w()); + } else { + self.enc64(inst.clone().bind(I64).bind(Any), template.clone().rex()); + self.enc64(inst.clone().bind(I64).bind(Any), template); + } + } + + /// Add the same encoding/recipe pairing to both X86_32 and X86_64 + fn enc_32_64_rec( + &mut self, + inst: impl Clone + Into<InstSpec>, + recipe: &EncodingRecipe, + bits: u16, + ) { + self.enc32_rec(inst.clone(), recipe, bits); + self.enc64_rec(inst, recipe, bits); + } + + /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened + fn enc_32_64_func<T>( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + builder_closure: T, + ) where + T: FnOnce(EncodingBuilder) -> EncodingBuilder, + { + let encoding = self.make_encoding(inst.into(), template, builder_closure); + self.enc32.push(encoding.clone()); + self.enc64.push(encoding); + } + + /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand + /// binding) has already happened. + fn enc_32_64_maybe_isap( + &mut self, + inst: impl Clone + Into<InstSpec>, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + self.enc32_maybe_isap(inst.clone(), template.clone(), isap); + self.enc64_maybe_isap(inst, template, isap); + } + + fn enc32_maybe_isap( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + match isap { + None => self.enc32(inst, template), + Some(isap) => self.enc32_isap(inst, template, isap), + } + } + + fn enc64_maybe_isap( + &mut self, + inst: impl Into<InstSpec>, + template: Template, + isap: Option<SettingPredicateNumber>, + ) { + match isap { + None => self.enc64(inst, template), + Some(isap) => self.enc64_isap(inst, template, isap), + } + } +} + +// Definitions. + +#[inline(never)] +fn define_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) { + let shared = &shared_defs.instructions; + let formats = &shared_defs.formats; + + // Shorthands for instructions. + let bconst = shared.by_name("bconst"); + let bint = shared.by_name("bint"); + let copy = shared.by_name("copy"); + let copy_special = shared.by_name("copy_special"); + let copy_to_ssa = shared.by_name("copy_to_ssa"); + let get_pinned_reg = shared.by_name("get_pinned_reg"); + let iconst = shared.by_name("iconst"); + let ireduce = shared.by_name("ireduce"); + let regmove = shared.by_name("regmove"); + let sextend = shared.by_name("sextend"); + let set_pinned_reg = shared.by_name("set_pinned_reg"); + let uextend = shared.by_name("uextend"); + let dummy_sarg_t = shared.by_name("dummy_sarg_t"); + + // Shorthands for recipes. 
+ let rec_copysp = r.template("copysp"); + let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa"); + let rec_get_pinned_reg = r.recipe("get_pinned_reg"); + let rec_null = r.recipe("null"); + let rec_pu_id = r.template("pu_id"); + let rec_pu_id_bool = r.template("pu_id_bool"); + let rec_pu_iq = r.template("pu_iq"); + let rec_rmov = r.template("rmov"); + let rec_set_pinned_reg = r.template("set_pinned_reg"); + let rec_u_id = r.template("u_id"); + let rec_u_id_z = r.template("u_id_z"); + let rec_umr = r.template("umr"); + let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa"); + let rec_urm_noflags = r.template("urm_noflags"); + let rec_urm_noflags_abcd = r.template("urm_noflags_abcd"); + let rec_dummy_sarg_t = r.recipe("dummy_sarg_t"); + + // The pinned reg is fixed to a certain value entirely user-controlled, so it generates nothing! + e.enc64_rec(get_pinned_reg.bind(I64), rec_get_pinned_reg, 0); + e.enc_x86_64( + set_pinned_reg.bind(I64), + rec_set_pinned_reg.opcodes(&MOV_STORE).rex().w(), + ); + + e.enc_i32_i64(copy, rec_umr.opcodes(&MOV_STORE)); + e.enc_r32_r64_rex_only(copy, rec_umr.opcodes(&MOV_STORE)); + e.enc_both(copy.bind(B1), rec_umr.opcodes(&MOV_STORE)); + e.enc_both(copy.bind(I8), rec_umr.opcodes(&MOV_STORE)); + e.enc_both(copy.bind(I16), rec_umr.opcodes(&MOV_STORE)); + + // TODO For x86-64, only define REX forms for now, since we can't describe the + // special regunit immediate operands with the current constraint language. + for &ty in &[I8, I16, I32] { + e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE)); + e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex()); + } + for &ty in &[B8, B16, B32] { + e.enc32(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE)); + e.enc64(regmove.bind(ty), rec_rmov.opcodes(&MOV_STORE).rex()); + } + e.enc64(regmove.bind(I64), rec_rmov.opcodes(&MOV_STORE).rex().w()); + e.enc_both(regmove.bind(B1), rec_rmov.opcodes(&MOV_STORE)); + e.enc_both(regmove.bind(I8), rec_rmov.opcodes(&MOV_STORE)); + e.enc32(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE)); + e.enc64(regmove.bind(R32), rec_rmov.opcodes(&MOV_STORE).rex()); + e.enc64(regmove.bind(R64), rec_rmov.opcodes(&MOV_STORE).rex().w()); + + // Immediate constants. + e.enc32(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM)); + + e.enc64(iconst.bind(I32), rec_pu_id.rex().opcodes(&MOV_IMM)); + e.enc64(iconst.bind(I32), rec_pu_id.opcodes(&MOV_IMM)); + + // The 32-bit immediate movl also zero-extends to 64 bits. + let is_unsigned_int32 = + InstructionPredicate::new_is_unsigned_int(&*formats.unary_imm, "imm", 32, 0); + + e.enc64_func( + iconst.bind(I64), + rec_pu_id.opcodes(&MOV_IMM).rex(), + |encoding| encoding.inst_predicate(is_unsigned_int32.clone()), + ); + e.enc64_func(iconst.bind(I64), rec_pu_id.opcodes(&MOV_IMM), |encoding| { + encoding.inst_predicate(is_unsigned_int32) + }); + + // Sign-extended 32-bit immediate. + e.enc64( + iconst.bind(I64), + rec_u_id.rex().opcodes(&MOV_IMM_SIGNEXTEND).rrr(0).w(), + ); + + // Finally, the MOV_IMM opcode takes an 8-byte immediate with a REX.W prefix. 
+ e.enc64(iconst.bind(I64), rec_pu_iq.opcodes(&MOV_IMM).rex().w()); + + // Bool constants (uses MOV) + for &ty in &[B1, B8, B16, B32] { + e.enc_both(bconst.bind(ty), rec_pu_id_bool.opcodes(&MOV_IMM)); + } + e.enc64(bconst.bind(B64), rec_pu_id_bool.opcodes(&MOV_IMM).rex()); + + let is_zero_int = InstructionPredicate::new_is_zero_int(&formats.unary_imm, "imm"); + e.enc_both_instp( + iconst.bind(I8), + rec_u_id_z.opcodes(&XORB), + is_zero_int.clone(), + ); + + // You may expect that i16 encodings would have an 0x66 prefix on the opcode to indicate that + // encodings should be on 16-bit operands (f.ex, "xor %ax, %ax"). Cranelift currently does not + // know that it can drop the 0x66 prefix and clear the upper half of a 32-bit register in these + // scenarios, so we explicitly select a wider but permissible opcode. + // + // This effectively formalizes the i16->i32 widening that Cranelift performs when there isn't + // an appropriate i16 encoding available. + e.enc_both_instp( + iconst.bind(I16), + rec_u_id_z.opcodes(&XOR), + is_zero_int.clone(), + ); + e.enc_both_instp( + iconst.bind(I32), + rec_u_id_z.opcodes(&XOR), + is_zero_int.clone(), + ); + e.enc_x86_64_instp(iconst.bind(I64), rec_u_id_z.opcodes(&XOR), is_zero_int); + + // Numerical conversions. + + // Reducing an integer is a no-op. + e.enc32_rec(ireduce.bind(I8).bind(I16), rec_null, 0); + e.enc32_rec(ireduce.bind(I8).bind(I32), rec_null, 0); + e.enc32_rec(ireduce.bind(I16).bind(I32), rec_null, 0); + + e.enc64_rec(ireduce.bind(I8).bind(I16), rec_null, 0); + e.enc64_rec(ireduce.bind(I8).bind(I32), rec_null, 0); + e.enc64_rec(ireduce.bind(I16).bind(I32), rec_null, 0); + e.enc64_rec(ireduce.bind(I8).bind(I64), rec_null, 0); + e.enc64_rec(ireduce.bind(I16).bind(I64), rec_null, 0); + e.enc64_rec(ireduce.bind(I32).bind(I64), rec_null, 0); + + // TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending + // instructions for %al/%ax/%eax to %ax/%eax/%rax. + + // movsbl + e.enc32( + sextend.bind(I32).bind(I8), + rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE), + ); + e.enc64( + sextend.bind(I32).bind(I8), + rec_urm_noflags.opcodes(&MOVSX_BYTE).rex(), + ); + e.enc64( + sextend.bind(I32).bind(I8), + rec_urm_noflags_abcd.opcodes(&MOVSX_BYTE), + ); + + // movswl + e.enc32( + sextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVSX_WORD), + ); + e.enc64( + sextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVSX_WORD).rex(), + ); + e.enc64( + sextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVSX_WORD), + ); + + // movsbq + e.enc64( + sextend.bind(I64).bind(I8), + rec_urm_noflags.opcodes(&MOVSX_BYTE).rex().w(), + ); + + // movswq + e.enc64( + sextend.bind(I64).bind(I16), + rec_urm_noflags.opcodes(&MOVSX_WORD).rex().w(), + ); + + // movslq + e.enc64( + sextend.bind(I64).bind(I32), + rec_urm_noflags.opcodes(&MOVSXD).rex().w(), + ); + + // movzbl + e.enc32( + uextend.bind(I32).bind(I8), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + e.enc64( + uextend.bind(I32).bind(I8), + rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(), + ); + e.enc64( + uextend.bind(I32).bind(I8), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + + // movzwl + e.enc32( + uextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVZX_WORD), + ); + e.enc64( + uextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVZX_WORD).rex(), + ); + e.enc64( + uextend.bind(I32).bind(I16), + rec_urm_noflags.opcodes(&MOVZX_WORD), + ); + + // movzbq, encoded as movzbl because it's equivalent and shorter. 
+ e.enc64( + uextend.bind(I64).bind(I8), + rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(), + ); + e.enc64( + uextend.bind(I64).bind(I8), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + + // movzwq, encoded as movzwl because it's equivalent and shorter + e.enc64( + uextend.bind(I64).bind(I16), + rec_urm_noflags.opcodes(&MOVZX_WORD).rex(), + ); + e.enc64( + uextend.bind(I64).bind(I16), + rec_urm_noflags.opcodes(&MOVZX_WORD), + ); + + // A 32-bit register copy clears the high 32 bits. + e.enc64( + uextend.bind(I64).bind(I32), + rec_umr.opcodes(&MOV_STORE).rex(), + ); + e.enc64(uextend.bind(I64).bind(I32), rec_umr.opcodes(&MOV_STORE)); + + // Convert bool to int. + // + // This assumes that b1 is represented as an 8-bit low register with the value 0 + // or 1. + // + // Encode movzbq as movzbl, because it's equivalent and shorter. + for &to in &[I8, I16, I32, I64] { + for &from in &[B1, B8] { + e.enc64( + bint.bind(to).bind(from), + rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(), + ); + e.enc64( + bint.bind(to).bind(from), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + if to != I64 { + e.enc32( + bint.bind(to).bind(from), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + } + } + } + for (to, from) in &[(I16, B16), (I32, B32), (I64, B64)] { + e.enc_both( + bint.bind(*to).bind(*from), + rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE), + ); + } + + // Copy Special + // For x86-64, only define REX forms for now, since we can't describe the + // special regunit immediate operands with the current constraint language. + e.enc64(copy_special, rec_copysp.opcodes(&MOV_STORE).rex().w()); + e.enc32(copy_special, rec_copysp.opcodes(&MOV_STORE)); + + // Copy to SSA. These have to be done with special _rex_only encoders, because the standard + // machinery for deciding whether a REX.{RXB} prefix is needed doesn't take into account + // the source register, which is specified directly in the instruction. + e.enc_i32_i64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE)); + e.enc_r32_r64_rex_only(copy_to_ssa, rec_umr_reg_to_ssa.opcodes(&MOV_STORE)); + e.enc_both_rex_only(copy_to_ssa.bind(B1), rec_umr_reg_to_ssa.opcodes(&MOV_STORE)); + e.enc_both_rex_only(copy_to_ssa.bind(I8), rec_umr_reg_to_ssa.opcodes(&MOV_STORE)); + e.enc_both_rex_only( + copy_to_ssa.bind(I16), + rec_umr_reg_to_ssa.opcodes(&MOV_STORE), + ); + e.enc_both_rex_only( + copy_to_ssa.bind(F64), + rec_furm_reg_to_ssa.opcodes(&MOVSD_LOAD), + ); + e.enc_both_rex_only( + copy_to_ssa.bind(F32), + rec_furm_reg_to_ssa.opcodes(&MOVSS_LOAD), + ); + + e.enc_32_64_rec(dummy_sarg_t, rec_dummy_sarg_t, 0); +} + +#[inline(never)] +fn define_memory( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + x86: &InstructionGroup, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + let formats = &shared_defs.formats; + + // Shorthands for instructions. 
+ let adjust_sp_down = shared.by_name("adjust_sp_down"); + let adjust_sp_down_imm = shared.by_name("adjust_sp_down_imm"); + let adjust_sp_up_imm = shared.by_name("adjust_sp_up_imm"); + let copy_nop = shared.by_name("copy_nop"); + let fill = shared.by_name("fill"); + let fill_nop = shared.by_name("fill_nop"); + let istore16 = shared.by_name("istore16"); + let istore16_complex = shared.by_name("istore16_complex"); + let istore32 = shared.by_name("istore32"); + let istore32_complex = shared.by_name("istore32_complex"); + let istore8 = shared.by_name("istore8"); + let istore8_complex = shared.by_name("istore8_complex"); + let load = shared.by_name("load"); + let load_complex = shared.by_name("load_complex"); + let regfill = shared.by_name("regfill"); + let regspill = shared.by_name("regspill"); + let sload16 = shared.by_name("sload16"); + let sload16_complex = shared.by_name("sload16_complex"); + let sload32 = shared.by_name("sload32"); + let sload32_complex = shared.by_name("sload32_complex"); + let sload8 = shared.by_name("sload8"); + let sload8_complex = shared.by_name("sload8_complex"); + let spill = shared.by_name("spill"); + let store = shared.by_name("store"); + let store_complex = shared.by_name("store_complex"); + let uload16 = shared.by_name("uload16"); + let uload16_complex = shared.by_name("uload16_complex"); + let uload32 = shared.by_name("uload32"); + let uload32_complex = shared.by_name("uload32_complex"); + let uload8 = shared.by_name("uload8"); + let uload8_complex = shared.by_name("uload8_complex"); + let x86_pop = x86.by_name("x86_pop"); + let x86_push = x86.by_name("x86_push"); + + // Shorthands for recipes. + let rec_adjustsp = r.template("adjustsp"); + let rec_adjustsp_ib = r.template("adjustsp_ib"); + let rec_adjustsp_id = r.template("adjustsp_id"); + let rec_ffillnull = r.recipe("ffillnull"); + let rec_fillnull = r.recipe("fillnull"); + let rec_fillSib32 = r.template("fillSib32"); + let rec_ld = r.template("ld"); + let rec_ldDisp32 = r.template("ldDisp32"); + let rec_ldDisp8 = r.template("ldDisp8"); + let rec_ldWithIndex = r.template("ldWithIndex"); + let rec_ldWithIndexDisp32 = r.template("ldWithIndexDisp32"); + let rec_ldWithIndexDisp8 = r.template("ldWithIndexDisp8"); + let rec_popq = r.template("popq"); + let rec_pushq = r.template("pushq"); + let rec_regfill32 = r.template("regfill32"); + let rec_regspill32 = r.template("regspill32"); + let rec_spillSib32 = r.template("spillSib32"); + let rec_st = r.template("st"); + let rec_stacknull = r.recipe("stacknull"); + let rec_stDisp32 = r.template("stDisp32"); + let rec_stDisp32_abcd = r.template("stDisp32_abcd"); + let rec_stDisp8 = r.template("stDisp8"); + let rec_stDisp8_abcd = r.template("stDisp8_abcd"); + let rec_stWithIndex = r.template("stWithIndex"); + let rec_stWithIndexDisp32 = r.template("stWithIndexDisp32"); + let rec_stWithIndexDisp32_abcd = r.template("stWithIndexDisp32_abcd"); + let rec_stWithIndexDisp8 = r.template("stWithIndexDisp8"); + let rec_stWithIndexDisp8_abcd = r.template("stWithIndexDisp8_abcd"); + let rec_stWithIndex_abcd = r.template("stWithIndex_abcd"); + let rec_st_abcd = r.template("st_abcd"); + + // Loads and stores. 
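+ // The *_complex variants take a variable number of address operands; the length
+ // predicates below appear to restrict these encodings to the plain two-register
+ // base + index form that the *WithIndex recipes can express.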
+ let is_load_complex_length_two = + InstructionPredicate::new_length_equals(&*formats.load_complex, 2); + + for recipe in &[rec_ldWithIndex, rec_ldWithIndexDisp8, rec_ldWithIndexDisp32] { + e.enc_i32_i64_instp( + load_complex, + recipe.opcodes(&MOV_LOAD), + is_load_complex_length_two.clone(), + ); + e.enc_r32_r64_instp( + load_complex, + recipe.opcodes(&MOV_LOAD), + is_load_complex_length_two.clone(), + ); + e.enc_x86_64_instp( + uload32_complex, + recipe.opcodes(&MOV_LOAD), + is_load_complex_length_two.clone(), + ); + + e.enc64_instp( + sload32_complex, + recipe.opcodes(&MOVSXD).rex().w(), + is_load_complex_length_two.clone(), + ); + + e.enc_i32_i64_instp( + uload16_complex, + recipe.opcodes(&MOVZX_WORD), + is_load_complex_length_two.clone(), + ); + e.enc_i32_i64_instp( + sload16_complex, + recipe.opcodes(&MOVSX_WORD), + is_load_complex_length_two.clone(), + ); + + e.enc_i32_i64_instp( + uload8_complex, + recipe.opcodes(&MOVZX_BYTE), + is_load_complex_length_two.clone(), + ); + + e.enc_i32_i64_instp( + sload8_complex, + recipe.opcodes(&MOVSX_BYTE), + is_load_complex_length_two.clone(), + ); + } + + let is_store_complex_length_three = + InstructionPredicate::new_length_equals(&*formats.store_complex, 3); + + for recipe in &[rec_stWithIndex, rec_stWithIndexDisp8, rec_stWithIndexDisp32] { + e.enc_i32_i64_instp( + store_complex, + recipe.opcodes(&MOV_STORE), + is_store_complex_length_three.clone(), + ); + e.enc_r32_r64_instp( + store_complex, + recipe.opcodes(&MOV_STORE), + is_store_complex_length_three.clone(), + ); + e.enc_x86_64_instp( + istore32_complex, + recipe.opcodes(&MOV_STORE), + is_store_complex_length_three.clone(), + ); + e.enc_both_instp( + istore16_complex.bind(I32), + recipe.opcodes(&MOV_STORE_16), + is_store_complex_length_three.clone(), + ); + e.enc_x86_64_instp( + istore16_complex.bind(I64), + recipe.opcodes(&MOV_STORE_16), + is_store_complex_length_three.clone(), + ); + } + + for recipe in &[ + rec_stWithIndex_abcd, + rec_stWithIndexDisp8_abcd, + rec_stWithIndexDisp32_abcd, + ] { + e.enc_both_instp( + istore8_complex.bind(I32), + recipe.opcodes(&MOV_BYTE_STORE), + is_store_complex_length_three.clone(), + ); + e.enc_x86_64_instp( + istore8_complex.bind(I64), + recipe.opcodes(&MOV_BYTE_STORE), + is_store_complex_length_three.clone(), + ); + } + + for recipe in &[rec_st, rec_stDisp8, rec_stDisp32] { + e.enc_i32_i64_ld_st(store, true, recipe.opcodes(&MOV_STORE)); + e.enc_r32_r64_ld_st(store, true, recipe.opcodes(&MOV_STORE)); + e.enc_x86_64(istore32.bind(I64).bind(Any), recipe.opcodes(&MOV_STORE)); + e.enc_i32_i64_ld_st(istore16, false, recipe.opcodes(&MOV_STORE_16)); + } + + // Byte stores are more complicated because the registers they can address + // depends of the presence of a REX prefix. The st*_abcd recipes fall back to + // the corresponding st* recipes when a REX prefix is applied. + + for recipe in &[rec_st_abcd, rec_stDisp8_abcd, rec_stDisp32_abcd] { + e.enc_both(istore8.bind(I32).bind(Any), recipe.opcodes(&MOV_BYTE_STORE)); + e.enc_x86_64(istore8.bind(I64).bind(Any), recipe.opcodes(&MOV_BYTE_STORE)); + } + + e.enc_i32_i64_explicit_rex(spill, rec_spillSib32.opcodes(&MOV_STORE)); + e.enc_i32_i64_explicit_rex(regspill, rec_regspill32.opcodes(&MOV_STORE)); + e.enc_r32_r64_rex_only(spill, rec_spillSib32.opcodes(&MOV_STORE)); + e.enc_r32_r64_rex_only(regspill, rec_regspill32.opcodes(&MOV_STORE)); + + // Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid + // constraining the permitted registers. + // See MIN_SPILL_SLOT_SIZE which makes this safe. 
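+ // That is, the narrow value is spilled with a full 4-byte store and reloaded with a
+ // 4-byte fill below; this is only safe because every spill slot is guaranteed to be
+ // at least that wide.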
+ + e.enc_both(spill.bind(B1), rec_spillSib32.opcodes(&MOV_STORE)); + e.enc_both(regspill.bind(B1), rec_regspill32.opcodes(&MOV_STORE)); + for &ty in &[I8, I16] { + e.enc_both(spill.bind(ty), rec_spillSib32.opcodes(&MOV_STORE)); + e.enc_both(regspill.bind(ty), rec_regspill32.opcodes(&MOV_STORE)); + } + + for recipe in &[rec_ld, rec_ldDisp8, rec_ldDisp32] { + e.enc_i32_i64_ld_st(load, true, recipe.opcodes(&MOV_LOAD)); + e.enc_r32_r64_ld_st(load, true, recipe.opcodes(&MOV_LOAD)); + e.enc_x86_64(uload32.bind(I64), recipe.opcodes(&MOV_LOAD)); + e.enc64(sload32.bind(I64), recipe.opcodes(&MOVSXD).rex().w()); + e.enc_i32_i64_ld_st(uload16, true, recipe.opcodes(&MOVZX_WORD)); + e.enc_i32_i64_ld_st(sload16, true, recipe.opcodes(&MOVSX_WORD)); + e.enc_i32_i64_ld_st(uload8, true, recipe.opcodes(&MOVZX_BYTE)); + e.enc_i32_i64_ld_st(sload8, true, recipe.opcodes(&MOVSX_BYTE)); + } + + e.enc_i32_i64_explicit_rex(fill, rec_fillSib32.opcodes(&MOV_LOAD)); + e.enc_i32_i64_explicit_rex(regfill, rec_regfill32.opcodes(&MOV_LOAD)); + e.enc_r32_r64_rex_only(fill, rec_fillSib32.opcodes(&MOV_LOAD)); + e.enc_r32_r64_rex_only(regfill, rec_regfill32.opcodes(&MOV_LOAD)); + + // No-op fills, created by late-stage redundant-fill removal. + for &ty in &[I64, I32, I16, I8] { + e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0); + e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0); + } + e.enc64_rec(fill_nop.bind(B1), rec_fillnull, 0); + e.enc32_rec(fill_nop.bind(B1), rec_fillnull, 0); + for &ty in &[F64, F32] { + e.enc64_rec(fill_nop.bind(ty), rec_ffillnull, 0); + e.enc32_rec(fill_nop.bind(ty), rec_ffillnull, 0); + } + for &ty in &[R64, R32] { + e.enc64_rec(fill_nop.bind(ty), rec_fillnull, 0); + e.enc32_rec(fill_nop.bind(ty), rec_fillnull, 0); + } + + // Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above. + + e.enc_both(fill.bind(B1), rec_fillSib32.opcodes(&MOV_LOAD)); + e.enc_both(regfill.bind(B1), rec_regfill32.opcodes(&MOV_LOAD)); + for &ty in &[I8, I16] { + e.enc_both(fill.bind(ty), rec_fillSib32.opcodes(&MOV_LOAD)); + e.enc_both(regfill.bind(ty), rec_regfill32.opcodes(&MOV_LOAD)); + } + + // Push and Pop. + e.enc32(x86_push.bind(I32), rec_pushq.opcodes(&PUSH_REG)); + e.enc_x86_64(x86_push.bind(I64), rec_pushq.opcodes(&PUSH_REG)); + + e.enc32(x86_pop.bind(I32), rec_popq.opcodes(&POP_REG)); + e.enc_x86_64(x86_pop.bind(I64), rec_popq.opcodes(&POP_REG)); + + // Stack-slot-to-the-same-stack-slot copy, which is guaranteed to turn + // into a no-op. + // The same encoding is generated for both the 64- and 32-bit architectures. + for &ty in &[I64, I32, I16, I8] { + e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0); + e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0); + } + for &ty in &[F64, F32] { + e.enc64_rec(copy_nop.bind(ty), rec_stacknull, 0); + e.enc32_rec(copy_nop.bind(ty), rec_stacknull, 0); + } + + // Adjust SP down by a dynamic value (or up, with a negative operand). + e.enc32(adjust_sp_down.bind(I32), rec_adjustsp.opcodes(&SUB)); + e.enc64( + adjust_sp_down.bind(I64), + rec_adjustsp.opcodes(&SUB).rex().w(), + ); + + // Adjust SP up by an immediate (or down, with a negative immediate). + e.enc32(adjust_sp_up_imm, rec_adjustsp_ib.opcodes(&CMP_IMM8)); + e.enc32(adjust_sp_up_imm, rec_adjustsp_id.opcodes(&CMP_IMM)); + e.enc64( + adjust_sp_up_imm, + rec_adjustsp_ib.opcodes(&CMP_IMM8).rex().w(), + ); + e.enc64( + adjust_sp_up_imm, + rec_adjustsp_id.opcodes(&CMP_IMM).rex().w(), + ); + + // Adjust SP down by an immediate (or up, with a negative immediate). 
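+ // Note: the CMP_IMM8/CMP_IMM opcode bytes appear to be the generic x86 group-1
+ // immediate forms; rrr(5) selects the SUB member of that group, while the
+ // adjust_sp_up_imm encodings above use the group's default /0 (ADD) form.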
+ e.enc32( + adjust_sp_down_imm, + rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5), + ); + e.enc32(adjust_sp_down_imm, rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5)); + e.enc64( + adjust_sp_down_imm, + rec_adjustsp_ib.opcodes(&CMP_IMM8).rrr(5).rex().w(), + ); + e.enc64( + adjust_sp_down_imm, + rec_adjustsp_id.opcodes(&CMP_IMM).rrr(5).rex().w(), + ); +} + +#[inline(never)] +fn define_fpu_moves(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) { + let shared = &shared_defs.instructions; + + // Shorthands for instructions. + let bitcast = shared.by_name("bitcast"); + let copy = shared.by_name("copy"); + let regmove = shared.by_name("regmove"); + + // Shorthands for recipes. + let rec_frmov = r.template("frmov"); + let rec_frurm = r.template("frurm"); + let rec_furm = r.template("furm"); + let rec_rfumr = r.template("rfumr"); + + // Floating-point moves. + // movd + e.enc_both( + bitcast.bind(F32).bind(I32), + rec_frurm.opcodes(&MOVD_LOAD_XMM), + ); + e.enc_both( + bitcast.bind(I32).bind(F32), + rec_rfumr.opcodes(&MOVD_STORE_XMM), + ); + + // movq + e.enc64( + bitcast.bind(F64).bind(I64), + rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(), + ); + e.enc64( + bitcast.bind(I64).bind(F64), + rec_rfumr.opcodes(&MOVD_STORE_XMM).rex().w(), + ); + + // movaps + e.enc_both(copy.bind(F32), rec_furm.opcodes(&MOVAPS_LOAD)); + e.enc_both(copy.bind(F64), rec_furm.opcodes(&MOVAPS_LOAD)); + + // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit + // immediate operands with the current constraint language. + e.enc32(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD)); + e.enc64(regmove.bind(F32), rec_frmov.opcodes(&MOVAPS_LOAD).rex()); + + // TODO For x86-64, only define REX forms for now, since we can't describe the special regunit + // immediate operands with the current constraint language. + e.enc32(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD)); + e.enc64(regmove.bind(F64), rec_frmov.opcodes(&MOVAPS_LOAD).rex()); +} + +#[inline(never)] +fn define_fpu_memory( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + + // Shorthands for instructions. + let fill = shared.by_name("fill"); + let load = shared.by_name("load"); + let load_complex = shared.by_name("load_complex"); + let regfill = shared.by_name("regfill"); + let regspill = shared.by_name("regspill"); + let spill = shared.by_name("spill"); + let store = shared.by_name("store"); + let store_complex = shared.by_name("store_complex"); + + // Shorthands for recipes. + let rec_ffillSib32 = r.template("ffillSib32"); + let rec_fld = r.template("fld"); + let rec_fldDisp32 = r.template("fldDisp32"); + let rec_fldDisp8 = r.template("fldDisp8"); + let rec_fldWithIndex = r.template("fldWithIndex"); + let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32"); + let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8"); + let rec_fregfill32 = r.template("fregfill32"); + let rec_fregspill32 = r.template("fregspill32"); + let rec_fspillSib32 = r.template("fspillSib32"); + let rec_fst = r.template("fst"); + let rec_fstDisp32 = r.template("fstDisp32"); + let rec_fstDisp8 = r.template("fstDisp8"); + let rec_fstWithIndex = r.template("fstWithIndex"); + let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32"); + let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8"); + + // Float loads and stores. 
+ e.enc_both(load.bind(F32).bind(Any), rec_fld.opcodes(&MOVSS_LOAD)); + e.enc_both(load.bind(F32).bind(Any), rec_fldDisp8.opcodes(&MOVSS_LOAD)); + e.enc_both(load.bind(F32).bind(Any), rec_fldDisp32.opcodes(&MOVSS_LOAD)); + + e.enc_both( + load_complex.bind(F32), + rec_fldWithIndex.opcodes(&MOVSS_LOAD), + ); + e.enc_both( + load_complex.bind(F32), + rec_fldWithIndexDisp8.opcodes(&MOVSS_LOAD), + ); + e.enc_both( + load_complex.bind(F32), + rec_fldWithIndexDisp32.opcodes(&MOVSS_LOAD), + ); + + e.enc_both(load.bind(F64).bind(Any), rec_fld.opcodes(&MOVSD_LOAD)); + e.enc_both(load.bind(F64).bind(Any), rec_fldDisp8.opcodes(&MOVSD_LOAD)); + e.enc_both(load.bind(F64).bind(Any), rec_fldDisp32.opcodes(&MOVSD_LOAD)); + + e.enc_both( + load_complex.bind(F64), + rec_fldWithIndex.opcodes(&MOVSD_LOAD), + ); + e.enc_both( + load_complex.bind(F64), + rec_fldWithIndexDisp8.opcodes(&MOVSD_LOAD), + ); + e.enc_both( + load_complex.bind(F64), + rec_fldWithIndexDisp32.opcodes(&MOVSD_LOAD), + ); + + e.enc_both(store.bind(F32).bind(Any), rec_fst.opcodes(&MOVSS_STORE)); + e.enc_both( + store.bind(F32).bind(Any), + rec_fstDisp8.opcodes(&MOVSS_STORE), + ); + e.enc_both( + store.bind(F32).bind(Any), + rec_fstDisp32.opcodes(&MOVSS_STORE), + ); + + e.enc_both( + store_complex.bind(F32), + rec_fstWithIndex.opcodes(&MOVSS_STORE), + ); + e.enc_both( + store_complex.bind(F32), + rec_fstWithIndexDisp8.opcodes(&MOVSS_STORE), + ); + e.enc_both( + store_complex.bind(F32), + rec_fstWithIndexDisp32.opcodes(&MOVSS_STORE), + ); + + e.enc_both(store.bind(F64).bind(Any), rec_fst.opcodes(&MOVSD_STORE)); + e.enc_both( + store.bind(F64).bind(Any), + rec_fstDisp8.opcodes(&MOVSD_STORE), + ); + e.enc_both( + store.bind(F64).bind(Any), + rec_fstDisp32.opcodes(&MOVSD_STORE), + ); + + e.enc_both( + store_complex.bind(F64), + rec_fstWithIndex.opcodes(&MOVSD_STORE), + ); + e.enc_both( + store_complex.bind(F64), + rec_fstWithIndexDisp8.opcodes(&MOVSD_STORE), + ); + e.enc_both( + store_complex.bind(F64), + rec_fstWithIndexDisp32.opcodes(&MOVSD_STORE), + ); + + e.enc_both(fill.bind(F32), rec_ffillSib32.opcodes(&MOVSS_LOAD)); + e.enc_both(regfill.bind(F32), rec_fregfill32.opcodes(&MOVSS_LOAD)); + e.enc_both(fill.bind(F64), rec_ffillSib32.opcodes(&MOVSD_LOAD)); + e.enc_both(regfill.bind(F64), rec_fregfill32.opcodes(&MOVSD_LOAD)); + + e.enc_both(spill.bind(F32), rec_fspillSib32.opcodes(&MOVSS_STORE)); + e.enc_both(regspill.bind(F32), rec_fregspill32.opcodes(&MOVSS_STORE)); + e.enc_both(spill.bind(F64), rec_fspillSib32.opcodes(&MOVSD_STORE)); + e.enc_both(regspill.bind(F64), rec_fregspill32.opcodes(&MOVSD_STORE)); +} + +#[inline(never)] +fn define_fpu_ops( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + settings: &SettingGroup, + x86: &InstructionGroup, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + let formats = &shared_defs.formats; + + // Shorthands for instructions. 
+ let ceil = shared.by_name("ceil"); + let f32const = shared.by_name("f32const"); + let f64const = shared.by_name("f64const"); + let fadd = shared.by_name("fadd"); + let fcmp = shared.by_name("fcmp"); + let fcvt_from_sint = shared.by_name("fcvt_from_sint"); + let fdemote = shared.by_name("fdemote"); + let fdiv = shared.by_name("fdiv"); + let ffcmp = shared.by_name("ffcmp"); + let floor = shared.by_name("floor"); + let fmul = shared.by_name("fmul"); + let fpromote = shared.by_name("fpromote"); + let fsub = shared.by_name("fsub"); + let nearest = shared.by_name("nearest"); + let sqrt = shared.by_name("sqrt"); + let trunc = shared.by_name("trunc"); + let x86_cvtt2si = x86.by_name("x86_cvtt2si"); + let x86_fmax = x86.by_name("x86_fmax"); + let x86_fmin = x86.by_name("x86_fmin"); + + // Shorthands for recipes. + let rec_f32imm_z = r.template("f32imm_z"); + let rec_f64imm_z = r.template("f64imm_z"); + let rec_fa = r.template("fa"); + let rec_fcmp = r.template("fcmp"); + let rec_fcscc = r.template("fcscc"); + let rec_frurm = r.template("frurm"); + let rec_furm = r.template("furm"); + let rec_furmi_rnd = r.template("furmi_rnd"); + let rec_rfurm = r.template("rfurm"); + + // Predicates shorthands. + let use_sse41 = settings.predicate_by_name("use_sse41"); + + // Floating-point constants equal to 0.0 can be encoded using either `xorps` or `xorpd`, for + // 32-bit and 64-bit floats respectively. + let is_zero_32_bit_float = + InstructionPredicate::new_is_zero_32bit_float(&*formats.unary_ieee32, "imm"); + e.enc32_instp( + f32const, + rec_f32imm_z.opcodes(&XORPS), + is_zero_32_bit_float.clone(), + ); + + let is_zero_64_bit_float = + InstructionPredicate::new_is_zero_64bit_float(&*formats.unary_ieee64, "imm"); + e.enc32_instp( + f64const, + rec_f64imm_z.opcodes(&XORPD), + is_zero_64_bit_float.clone(), + ); + + e.enc_x86_64_instp(f32const, rec_f32imm_z.opcodes(&XORPS), is_zero_32_bit_float); + e.enc_x86_64_instp(f64const, rec_f64imm_z.opcodes(&XORPD), is_zero_64_bit_float); + + // cvtsi2ss + e.enc_i32_i64(fcvt_from_sint.bind(F32), rec_frurm.opcodes(&CVTSI2SS)); + + // cvtsi2sd + e.enc_i32_i64(fcvt_from_sint.bind(F64), rec_frurm.opcodes(&CVTSI2SD)); + + // cvtss2sd + e.enc_both(fpromote.bind(F64).bind(F32), rec_furm.opcodes(&CVTSS2SD)); + + // cvtsd2ss + e.enc_both(fdemote.bind(F32).bind(F64), rec_furm.opcodes(&CVTSD2SS)); + + // cvttss2si + e.enc_both( + x86_cvtt2si.bind(I32).bind(F32), + rec_rfurm.opcodes(&CVTTSS2SI), + ); + e.enc64( + x86_cvtt2si.bind(I64).bind(F32), + rec_rfurm.opcodes(&CVTTSS2SI).rex().w(), + ); + + // cvttsd2si + e.enc_both( + x86_cvtt2si.bind(I32).bind(F64), + rec_rfurm.opcodes(&CVTTSD2SI), + ); + e.enc64( + x86_cvtt2si.bind(I64).bind(F64), + rec_rfurm.opcodes(&CVTTSD2SI).rex().w(), + ); + + // Exact square roots. + e.enc_both(sqrt.bind(F32), rec_furm.opcodes(&SQRTSS)); + e.enc_both(sqrt.bind(F64), rec_furm.opcodes(&SQRTSD)); + + // Rounding. The recipe looks at the opcode to pick an immediate. + for inst in &[nearest, floor, ceil, trunc] { + e.enc_both_isap(inst.bind(F32), rec_furmi_rnd.opcodes(&ROUNDSS), use_sse41); + e.enc_both_isap(inst.bind(F64), rec_furmi_rnd.opcodes(&ROUNDSD), use_sse41); + } + + // Binary arithmetic ops. 
+ e.enc_both(fadd.bind(F32), rec_fa.opcodes(&ADDSS)); + e.enc_both(fadd.bind(F64), rec_fa.opcodes(&ADDSD)); + + e.enc_both(fsub.bind(F32), rec_fa.opcodes(&SUBSS)); + e.enc_both(fsub.bind(F64), rec_fa.opcodes(&SUBSD)); + + e.enc_both(fmul.bind(F32), rec_fa.opcodes(&MULSS)); + e.enc_both(fmul.bind(F64), rec_fa.opcodes(&MULSD)); + + e.enc_both(fdiv.bind(F32), rec_fa.opcodes(&DIVSS)); + e.enc_both(fdiv.bind(F64), rec_fa.opcodes(&DIVSD)); + + e.enc_both(x86_fmin.bind(F32), rec_fa.opcodes(&MINSS)); + e.enc_both(x86_fmin.bind(F64), rec_fa.opcodes(&MINSD)); + + e.enc_both(x86_fmax.bind(F32), rec_fa.opcodes(&MAXSS)); + e.enc_both(x86_fmax.bind(F64), rec_fa.opcodes(&MAXSD)); + + // Comparisons. + // + // This only covers the condition codes in `supported_floatccs`, the rest are + // handled by legalization patterns. + e.enc_both(fcmp.bind(F32), rec_fcscc.opcodes(&UCOMISS)); + e.enc_both(fcmp.bind(F64), rec_fcscc.opcodes(&UCOMISD)); + e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(&UCOMISS)); + e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(&UCOMISD)); +} + +#[inline(never)] +fn define_alu( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + settings: &SettingGroup, + x86: &InstructionGroup, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + + // Shorthands for instructions. + let clz = shared.by_name("clz"); + let ctz = shared.by_name("ctz"); + let icmp = shared.by_name("icmp"); + let icmp_imm = shared.by_name("icmp_imm"); + let ifcmp = shared.by_name("ifcmp"); + let ifcmp_imm = shared.by_name("ifcmp_imm"); + let ifcmp_sp = shared.by_name("ifcmp_sp"); + let ishl = shared.by_name("ishl"); + let ishl_imm = shared.by_name("ishl_imm"); + let popcnt = shared.by_name("popcnt"); + let rotl = shared.by_name("rotl"); + let rotl_imm = shared.by_name("rotl_imm"); + let rotr = shared.by_name("rotr"); + let rotr_imm = shared.by_name("rotr_imm"); + let selectif = shared.by_name("selectif"); + let selectif_spectre_guard = shared.by_name("selectif_spectre_guard"); + let sshr = shared.by_name("sshr"); + let sshr_imm = shared.by_name("sshr_imm"); + let trueff = shared.by_name("trueff"); + let trueif = shared.by_name("trueif"); + let ushr = shared.by_name("ushr"); + let ushr_imm = shared.by_name("ushr_imm"); + let x86_bsf = x86.by_name("x86_bsf"); + let x86_bsr = x86.by_name("x86_bsr"); + + // Shorthands for recipes. + let rec_bsf_and_bsr = r.template("bsf_and_bsr"); + let rec_cmov = r.template("cmov"); + let rec_icscc = r.template("icscc"); + let rec_icscc_ib = r.template("icscc_ib"); + let rec_icscc_id = r.template("icscc_id"); + let rec_rcmp = r.template("rcmp"); + let rec_rcmp_ib = r.template("rcmp_ib"); + let rec_rcmp_id = r.template("rcmp_id"); + let rec_rcmp_sp = r.template("rcmp_sp"); + let rec_rc = r.template("rc"); + let rec_setf_abcd = r.template("setf_abcd"); + let rec_seti_abcd = r.template("seti_abcd"); + let rec_urm = r.template("urm"); + + // Predicates shorthands. 
+ let use_popcnt = settings.predicate_by_name("use_popcnt"); + let use_lzcnt = settings.predicate_by_name("use_lzcnt"); + let use_bmi1 = settings.predicate_by_name("use_bmi1"); + + let band = shared.by_name("band"); + let band_imm = shared.by_name("band_imm"); + let band_not = shared.by_name("band_not"); + let bnot = shared.by_name("bnot"); + let bor = shared.by_name("bor"); + let bor_imm = shared.by_name("bor_imm"); + let bxor = shared.by_name("bxor"); + let bxor_imm = shared.by_name("bxor_imm"); + let iadd = shared.by_name("iadd"); + let iadd_ifcarry = shared.by_name("iadd_ifcarry"); + let iadd_ifcin = shared.by_name("iadd_ifcin"); + let iadd_ifcout = shared.by_name("iadd_ifcout"); + let iadd_imm = shared.by_name("iadd_imm"); + let imul = shared.by_name("imul"); + let isub = shared.by_name("isub"); + let isub_ifbin = shared.by_name("isub_ifbin"); + let isub_ifborrow = shared.by_name("isub_ifborrow"); + let isub_ifbout = shared.by_name("isub_ifbout"); + let x86_sdivmodx = x86.by_name("x86_sdivmodx"); + let x86_smulx = x86.by_name("x86_smulx"); + let x86_udivmodx = x86.by_name("x86_udivmodx"); + let x86_umulx = x86.by_name("x86_umulx"); + + let rec_div = r.template("div"); + let rec_fa = r.template("fa"); + let rec_fax = r.template("fax"); + let rec_mulx = r.template("mulx"); + let rec_r_ib = r.template("r_ib"); + let rec_r_id = r.template("r_id"); + let rec_rin = r.template("rin"); + let rec_rio = r.template("rio"); + let rec_rout = r.template("rout"); + let rec_rr = r.template("rr"); + let rec_rrx = r.template("rrx"); + let rec_ur = r.template("ur"); + + e.enc_i32_i64(iadd, rec_rr.opcodes(&ADD)); + e.enc_i32_i64(iadd_ifcout, rec_rout.opcodes(&ADD)); + e.enc_i32_i64(iadd_ifcin, rec_rin.opcodes(&ADC)); + e.enc_i32_i64(iadd_ifcarry, rec_rio.opcodes(&ADC)); + e.enc_i32_i64(iadd_imm, rec_r_ib.opcodes(&ADD_IMM8_SIGN_EXTEND).rrr(0)); + e.enc_i32_i64(iadd_imm, rec_r_id.opcodes(&ADD_IMM).rrr(0)); + + e.enc_i32_i64(isub, rec_rr.opcodes(&SUB)); + e.enc_i32_i64(isub_ifbout, rec_rout.opcodes(&SUB)); + e.enc_i32_i64(isub_ifbin, rec_rin.opcodes(&SBB)); + e.enc_i32_i64(isub_ifborrow, rec_rio.opcodes(&SBB)); + + e.enc_i32_i64(band, rec_rr.opcodes(&AND)); + e.enc_b32_b64(band, rec_rr.opcodes(&AND)); + + // TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as band_imm.i32. Can + // even use the single-byte immediate for 0xffff_ffXX masks. + + e.enc_i32_i64(band_imm, rec_r_ib.opcodes(&AND_IMM8_SIGN_EXTEND).rrr(4)); + e.enc_i32_i64(band_imm, rec_r_id.opcodes(&AND_IMM).rrr(4)); + + e.enc_i32_i64(bor, rec_rr.opcodes(&OR)); + e.enc_b32_b64(bor, rec_rr.opcodes(&OR)); + e.enc_i32_i64(bor_imm, rec_r_ib.opcodes(&OR_IMM8_SIGN_EXTEND).rrr(1)); + e.enc_i32_i64(bor_imm, rec_r_id.opcodes(&OR_IMM).rrr(1)); + + e.enc_i32_i64(bxor, rec_rr.opcodes(&XOR)); + e.enc_b32_b64(bxor, rec_rr.opcodes(&XOR)); + e.enc_i32_i64(bxor_imm, rec_r_ib.opcodes(&XOR_IMM8_SIGN_EXTEND).rrr(6)); + e.enc_i32_i64(bxor_imm, rec_r_id.opcodes(&XOR_IMM).rrr(6)); + + // x86 has a bitwise not instruction NOT. + e.enc_i32_i64(bnot, rec_ur.opcodes(&NOT).rrr(2)); + e.enc_b32_b64(bnot, rec_ur.opcodes(&NOT).rrr(2)); + e.enc_both(bnot.bind(B1), rec_ur.opcodes(&NOT).rrr(2)); + + // Also add a `b1` encodings for the logic instructions. + // TODO: Should this be done with 8-bit instructions? It would improve partial register + // dependencies. 
+ e.enc_both(band.bind(B1), rec_rr.opcodes(&AND)); + e.enc_both(bor.bind(B1), rec_rr.opcodes(&OR)); + e.enc_both(bxor.bind(B1), rec_rr.opcodes(&XOR)); + + e.enc_i32_i64(imul, rec_rrx.opcodes(&IMUL)); + e.enc_i32_i64(x86_sdivmodx, rec_div.opcodes(&IDIV).rrr(7)); + e.enc_i32_i64(x86_udivmodx, rec_div.opcodes(&DIV).rrr(6)); + + e.enc_i32_i64(x86_smulx, rec_mulx.opcodes(&IMUL_RDX_RAX).rrr(5)); + e.enc_i32_i64(x86_umulx, rec_mulx.opcodes(&MUL).rrr(4)); + + // Binary bitwise ops. + // + // The F64 version is intentionally encoded using the single-precision opcode: + // the operation is identical and the encoding is one byte shorter. + e.enc_both(band.bind(F32), rec_fa.opcodes(&ANDPS)); + e.enc_both(band.bind(F64), rec_fa.opcodes(&ANDPS)); + + e.enc_both(bor.bind(F32), rec_fa.opcodes(&ORPS)); + e.enc_both(bor.bind(F64), rec_fa.opcodes(&ORPS)); + + e.enc_both(bxor.bind(F32), rec_fa.opcodes(&XORPS)); + e.enc_both(bxor.bind(F64), rec_fa.opcodes(&XORPS)); + + // The `andnps(x,y)` instruction computes `~x&y`, while band_not(x,y)` is `x&~y. + e.enc_both(band_not.bind(F32), rec_fax.opcodes(&ANDNPS)); + e.enc_both(band_not.bind(F64), rec_fax.opcodes(&ANDNPS)); + + // Shifts and rotates. + // Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit + // and 16-bit shifts would need explicit masking. + + for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] { + // Cannot use enc_i32_i64 for this pattern because instructions require + // to bind any. + e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr)); + e.enc32( + inst.bind(I32).bind(I16), + rec_rc.opcodes(&ROTATE_CL).rrr(rrr), + ); + e.enc32( + inst.bind(I32).bind(I32), + rec_rc.opcodes(&ROTATE_CL).rrr(rrr), + ); + e.enc64( + inst.bind(I64).bind(Any), + rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex().w(), + ); + e.enc64( + inst.bind(I32).bind(Any), + rec_rc.opcodes(&ROTATE_CL).rrr(rrr).rex(), + ); + e.enc64( + inst.bind(I32).bind(Any), + rec_rc.opcodes(&ROTATE_CL).rrr(rrr), + ); + } + + e.enc_i32_i64(rotl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(0)); + e.enc_i32_i64(rotr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(1)); + e.enc_i32_i64(ishl_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(4)); + e.enc_i32_i64(ushr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(5)); + e.enc_i32_i64(sshr_imm, rec_r_ib.opcodes(&ROTATE_IMM8).rrr(7)); + + // Population count. + e.enc32_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt); + e.enc64_isap( + popcnt.bind(I64), + rec_urm.opcodes(&POPCNT).rex().w(), + use_popcnt, + ); + e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT).rex(), use_popcnt); + e.enc64_isap(popcnt.bind(I32), rec_urm.opcodes(&POPCNT), use_popcnt); + + // Count leading zero bits. + e.enc32_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt); + e.enc64_isap(clz.bind(I64), rec_urm.opcodes(&LZCNT).rex().w(), use_lzcnt); + e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT).rex(), use_lzcnt); + e.enc64_isap(clz.bind(I32), rec_urm.opcodes(&LZCNT), use_lzcnt); + + // Count trailing zero bits. 
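+ // TZCNT is a BMI1 instruction; when it is unavailable, ctz presumably has to be
+ // lowered via the x86_bsf encodings below (BSF leaves its destination undefined
+ // for a zero input, so that lowering needs extra care).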
+ e.enc32_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
+ e.enc64_isap(ctz.bind(I64), rec_urm.opcodes(&TZCNT).rex().w(), use_bmi1);
+ e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT).rex(), use_bmi1);
+ e.enc64_isap(ctz.bind(I32), rec_urm.opcodes(&TZCNT), use_bmi1);
+
+ // Bit scan forward and reverse.
+ e.enc_i32_i64(x86_bsf, rec_bsf_and_bsr.opcodes(&BIT_SCAN_FORWARD));
+ e.enc_i32_i64(x86_bsr, rec_bsf_and_bsr.opcodes(&BIT_SCAN_REVERSE));
+
+ // Comparisons
+ e.enc_i32_i64(icmp, rec_icscc.opcodes(&CMP_REG));
+ e.enc_i32_i64(icmp_imm, rec_icscc_ib.opcodes(&CMP_IMM8).rrr(7));
+ e.enc_i32_i64(icmp_imm, rec_icscc_id.opcodes(&CMP_IMM).rrr(7));
+ e.enc_i32_i64(ifcmp, rec_rcmp.opcodes(&CMP_REG));
+ e.enc_i32_i64(ifcmp_imm, rec_rcmp_ib.opcodes(&CMP_IMM8).rrr(7));
+ e.enc_i32_i64(ifcmp_imm, rec_rcmp_id.opcodes(&CMP_IMM).rrr(7));
+ // TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).
+
+ e.enc32(ifcmp_sp.bind(I32), rec_rcmp_sp.opcodes(&CMP_REG));
+ e.enc64(ifcmp_sp.bind(I64), rec_rcmp_sp.opcodes(&CMP_REG).rex().w());
+
+ // Convert flags to bool.
+ // This encodes `b1` as an 8-bit low register with the value 0 or 1.
+ e.enc_both(trueif, rec_seti_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
+ e.enc_both(trueff, rec_setf_abcd.opcodes(&SET_BYTE_IF_OVERFLOW));
+
+ // Conditional move (a.k.a. integer select).
+ e.enc_i32_i64(selectif, rec_cmov.opcodes(&CMOV_OVERFLOW));
+ // A Spectre-guard integer select is exactly the same as a selectif, but
+ // is not associated with any other legalization rules and is not
+ // recognized by any optimizations, so it must arrive here unmodified
+ // and in its original place.
+ e.enc_i32_i64(selectif_spectre_guard, rec_cmov.opcodes(&CMOV_OVERFLOW));
+}
+
+#[inline(never)]
+#[allow(clippy::cognitive_complexity)]
+fn define_simd(
+ e: &mut PerCpuModeEncodings,
+ shared_defs: &SharedDefinitions,
+ settings: &SettingGroup,
+ x86: &InstructionGroup,
+ r: &RecipeGroup,
+) {
+ let shared = &shared_defs.instructions;
+ let formats = &shared_defs.formats;
+
+ // Shorthands for instructions.
+ let avg_round = shared.by_name("avg_round"); + let bitcast = shared.by_name("bitcast"); + let bor = shared.by_name("bor"); + let bxor = shared.by_name("bxor"); + let copy = shared.by_name("copy"); + let copy_nop = shared.by_name("copy_nop"); + let copy_to_ssa = shared.by_name("copy_to_ssa"); + let fadd = shared.by_name("fadd"); + let fcmp = shared.by_name("fcmp"); + let fcvt_from_sint = shared.by_name("fcvt_from_sint"); + let fdiv = shared.by_name("fdiv"); + let fill = shared.by_name("fill"); + let fill_nop = shared.by_name("fill_nop"); + let fmul = shared.by_name("fmul"); + let fsub = shared.by_name("fsub"); + let iabs = shared.by_name("iabs"); + let iadd = shared.by_name("iadd"); + let icmp = shared.by_name("icmp"); + let imul = shared.by_name("imul"); + let ishl_imm = shared.by_name("ishl_imm"); + let load = shared.by_name("load"); + let load_complex = shared.by_name("load_complex"); + let raw_bitcast = shared.by_name("raw_bitcast"); + let regfill = shared.by_name("regfill"); + let regmove = shared.by_name("regmove"); + let regspill = shared.by_name("regspill"); + let sadd_sat = shared.by_name("sadd_sat"); + let scalar_to_vector = shared.by_name("scalar_to_vector"); + let sload8x8 = shared.by_name("sload8x8"); + let sload8x8_complex = shared.by_name("sload8x8_complex"); + let sload16x4 = shared.by_name("sload16x4"); + let sload16x4_complex = shared.by_name("sload16x4_complex"); + let sload32x2 = shared.by_name("sload32x2"); + let sload32x2_complex = shared.by_name("sload32x2_complex"); + let spill = shared.by_name("spill"); + let sqrt = shared.by_name("sqrt"); + let sshr_imm = shared.by_name("sshr_imm"); + let ssub_sat = shared.by_name("ssub_sat"); + let store = shared.by_name("store"); + let store_complex = shared.by_name("store_complex"); + let swiden_low = shared.by_name("swiden_low"); + let uadd_sat = shared.by_name("uadd_sat"); + let uload8x8 = shared.by_name("uload8x8"); + let uload8x8_complex = shared.by_name("uload8x8_complex"); + let uload16x4 = shared.by_name("uload16x4"); + let uload16x4_complex = shared.by_name("uload16x4_complex"); + let uload32x2 = shared.by_name("uload32x2"); + let uload32x2_complex = shared.by_name("uload32x2_complex"); + let snarrow = shared.by_name("snarrow"); + let unarrow = shared.by_name("unarrow"); + let uwiden_low = shared.by_name("uwiden_low"); + let ushr_imm = shared.by_name("ushr_imm"); + let usub_sat = shared.by_name("usub_sat"); + let vconst = shared.by_name("vconst"); + let vselect = shared.by_name("vselect"); + let x86_cvtt2si = x86.by_name("x86_cvtt2si"); + let x86_insertps = x86.by_name("x86_insertps"); + let x86_fmax = x86.by_name("x86_fmax"); + let x86_fmin = x86.by_name("x86_fmin"); + let x86_movlhps = x86.by_name("x86_movlhps"); + let x86_movsd = x86.by_name("x86_movsd"); + let x86_pblendw = x86.by_name("x86_pblendw"); + let x86_pextr = x86.by_name("x86_pextr"); + let x86_pinsr = x86.by_name("x86_pinsr"); + let x86_pmaxs = x86.by_name("x86_pmaxs"); + let x86_pmaxu = x86.by_name("x86_pmaxu"); + let x86_pmins = x86.by_name("x86_pmins"); + let x86_pminu = x86.by_name("x86_pminu"); + let x86_pmullq = x86.by_name("x86_pmullq"); + let x86_pmuludq = x86.by_name("x86_pmuludq"); + let x86_palignr = x86.by_name("x86_palignr"); + let x86_pshufb = x86.by_name("x86_pshufb"); + let x86_pshufd = x86.by_name("x86_pshufd"); + let x86_psll = x86.by_name("x86_psll"); + let x86_psra = x86.by_name("x86_psra"); + let x86_psrl = x86.by_name("x86_psrl"); + let x86_ptest = x86.by_name("x86_ptest"); + let x86_punpckh = x86.by_name("x86_punpckh"); + let 
x86_punpckl = x86.by_name("x86_punpckl"); + let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps"); + + // Shorthands for recipes. + let rec_blend = r.template("blend"); + let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128"); + let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128"); + let rec_f_ib = r.template("f_ib"); + let rec_fa = r.template("fa"); + let rec_fa_ib = r.template("fa_ib"); + let rec_fax = r.template("fax"); + let rec_fcmp = r.template("fcmp"); + let rec_ffillSib32 = r.template("ffillSib32"); + let rec_ffillnull = r.recipe("ffillnull"); + let rec_fld = r.template("fld"); + let rec_fldDisp32 = r.template("fldDisp32"); + let rec_fldDisp8 = r.template("fldDisp8"); + let rec_fldWithIndex = r.template("fldWithIndex"); + let rec_fldWithIndexDisp32 = r.template("fldWithIndexDisp32"); + let rec_fldWithIndexDisp8 = r.template("fldWithIndexDisp8"); + let rec_fregfill32 = r.template("fregfill32"); + let rec_fregspill32 = r.template("fregspill32"); + let rec_frmov = r.template("frmov"); + let rec_frurm = r.template("frurm"); + let rec_fspillSib32 = r.template("fspillSib32"); + let rec_fst = r.template("fst"); + let rec_fstDisp32 = r.template("fstDisp32"); + let rec_fstDisp8 = r.template("fstDisp8"); + let rec_fstWithIndex = r.template("fstWithIndex"); + let rec_fstWithIndexDisp32 = r.template("fstWithIndexDisp32"); + let rec_fstWithIndexDisp8 = r.template("fstWithIndexDisp8"); + let rec_furm = r.template("furm"); + let rec_furm_reg_to_ssa = r.template("furm_reg_to_ssa"); + let rec_icscc_fpr = r.template("icscc_fpr"); + let rec_null_fpr = r.recipe("null_fpr"); + let rec_pfcmp = r.template("pfcmp"); + let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr"); + let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr"); + let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r"); + let rec_stacknull = r.recipe("stacknull"); + let rec_vconst = r.template("vconst"); + let rec_vconst_optimized = r.template("vconst_optimized"); + + // Predicates shorthands. + settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic"); + settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic"); + let use_ssse3_simd = settings.predicate_by_name("use_ssse3_simd"); + let use_sse41_simd = settings.predicate_by_name("use_sse41_simd"); + let use_sse42_simd = settings.predicate_by_name("use_sse42_simd"); + let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd"); + let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd"); + + // SIMD vector size: eventually multiple vector sizes may be supported but for now only + // SSE-sized vectors are available. + let sse_vector_size: u64 = 128; + + // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see + // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the + // value across the register. + + let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128; + + // PSHUFB, 8-bit shuffle using two XMM registers. + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let instruction = x86_pshufb.bind(vector(ty, sse_vector_size)); + let template = rec_fa.opcodes(&PSHUFB); + e.enc_both_inferred_maybe_isap(instruction.clone(), template.clone(), Some(use_ssse3_simd)); + } + + // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate. 
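+ // The immediate encodes four 2-bit source-lane selectors; e.g. an immediate of 0
+ // copies lane 0 into all four result lanes, which is how a 32-bit splat can be
+ // expressed after legalization.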
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { + let instruction = x86_pshufd.bind(vector(ty, sse_vector_size)); + let template = rec_r_ib_unsigned_fpr.opcodes(&PSHUFD); + e.enc_both_inferred(instruction, template); + } + + // SIMD vselect; controlling value of vselect is a boolean vector, so each lane should be + // either all ones or all zeroes - it makes it possible to always use 8-bit PBLENDVB; + // for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let opcode = match ty.lane_bits() { + 32 => &BLENDVPS, + 64 => &BLENDVPD, + _ => &PBLENDVB, + }; + let instruction = vselect.bind(vector(ty, sse_vector_size)); + let template = rec_blend.opcodes(opcode); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + + // PBLENDW, select lanes using a u8 immediate. + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) { + let instruction = x86_pblendw.bind(vector(ty, sse_vector_size)); + let template = rec_fa_ib.opcodes(&PBLENDW); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + + // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according + // to the Intel manual: "When the destination operand is an XMM register, the source operand is + // written to the low doubleword of the register and the register is zero-extended to 128 bits." + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let instruction = scalar_to_vector.bind(vector(ty, sse_vector_size)); + if ty.is_float() { + // No need to move floats--they already live in XMM registers. + e.enc_32_64_rec(instruction, rec_null_fpr, 0); + } else { + let template = rec_frurm.opcodes(&MOVD_LOAD_XMM); + if ty.lane_bits() < 64 { + e.enc_both_inferred(instruction, template); + } else { + // No 32-bit encodings for 64-bit widths. + assert_eq!(ty.lane_bits(), 64); + e.enc64(instruction, template.rex().w()); + } + } + } + + // SIMD insertlane + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let (opcode, isap): (&[_], _) = match ty.lane_bits() { + 8 => (&PINSRB, Some(use_sse41_simd)), + 16 => (&PINSRW, None), + 32 | 64 => (&PINSR, Some(use_sse41_simd)), + _ => panic!("invalid size for SIMD insertlane"), + }; + + let instruction = x86_pinsr.bind(vector(ty, sse_vector_size)); + let template = rec_r_ib_unsigned_r.opcodes(opcode); + if ty.lane_bits() < 64 { + e.enc_both_inferred_maybe_isap(instruction, template, isap); + } else { + // It turns out the 64-bit widths have REX/W encodings and only are available on + // x86_64. + e.enc64_maybe_isap(instruction, template.rex().w(), isap); + } + } + + // For legalizing insertlane with floats, INSERTPS from SSE4.1. + { + let instruction = x86_insertps.bind(vector(F32, sse_vector_size)); + let template = rec_fa_ib.opcodes(&INSERTPS); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + + // For legalizing insertlane with floats, MOVSD from SSE2. + { + let instruction = x86_movsd.bind(vector(F64, sse_vector_size)); + let template = rec_fa.opcodes(&MOVSD_LOAD); + e.enc_both_inferred(instruction, template); // from SSE2 + } + + // For legalizing insertlane with floats, MOVLHPS from SSE. 
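+ // MOVLHPS copies the low 64 bits of the source into the high 64 bits of the
+ // destination, i.e. it writes lane 1 of an F64x2 while leaving lane 0 intact.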
+ { + let instruction = x86_movlhps.bind(vector(F64, sse_vector_size)); + let template = rec_fa.opcodes(&MOVLHPS); + e.enc_both_inferred(instruction, template); // from SSE + } + + // SIMD extractlane + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let opcode = match ty.lane_bits() { + 8 => &PEXTRB, + 16 => &PEXTRW, + 32 | 64 => &PEXTR, + _ => panic!("invalid size for SIMD extractlane"), + }; + + let instruction = x86_pextr.bind(vector(ty, sse_vector_size)); + let template = rec_r_ib_unsigned_gpr.opcodes(opcode); + if ty.lane_bits() < 64 { + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } else { + // It turns out the 64-bit widths have REX/W encodings and only are available on + // x86_64. + e.enc64_maybe_isap(instruction, template.rex().w(), Some(use_sse41_simd)); + } + } + + // SIMD packing/unpacking + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let (high, low) = match ty.lane_bits() { + 8 => (&PUNPCKHBW, &PUNPCKLBW), + 16 => (&PUNPCKHWD, &PUNPCKLWD), + 32 => (&PUNPCKHDQ, &PUNPCKLDQ), + 64 => (&PUNPCKHQDQ, &PUNPCKLQDQ), + _ => panic!("invalid size for SIMD packing/unpacking"), + }; + + e.enc_both_inferred( + x86_punpckh.bind(vector(ty, sse_vector_size)), + rec_fa.opcodes(high), + ); + e.enc_both_inferred( + x86_punpckl.bind(vector(ty, sse_vector_size)), + rec_fa.opcodes(low), + ); + } + + // SIMD narrow/widen + for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] { + let snarrow = snarrow.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes)); + } + for (ty, opcodes, isap) in &[ + (I16, &PACKUSWB[..], None), + (I32, &PACKUSDW[..], Some(use_sse41_simd)), + ] { + let unarrow = unarrow.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap); + } + for (ty, swiden_opcode, uwiden_opcode) in &[ + (I8, &PMOVSXBW[..], &PMOVZXBW[..]), + (I16, &PMOVSXWD[..], &PMOVZXWD[..]), + ] { + let isap = Some(use_sse41_simd); + let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap); + let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap); + } + for ty in &[I8, I16, I32, I64] { + e.enc_both_inferred_maybe_isap( + x86_palignr.bind(vector(*ty, sse_vector_size)), + rec_fa_ib.opcodes(&PALIGNR[..]), + Some(use_ssse3_simd), + ); + } + + // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8). + for from_type in ValueType::all_lane_types().filter(allowed_simd_type) { + for to_type in + ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type) + { + let instruction = raw_bitcast + .bind(vector(to_type, sse_vector_size)) + .bind(vector(from_type, sse_vector_size)); + e.enc_32_64_rec(instruction, rec_null_fpr, 0); + } + } + + // SIMD raw bitcast floats to vector (and back); assumes that floats are already stored in an + // XMM register. 
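+ // Both representations already occupy an XMM register, so no instruction is emitted;
+ // the null_fpr recipe below simply reinterprets the bits.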
+ for float_type in &[F32, F64] { + for lane_type in ValueType::all_lane_types().filter(allowed_simd_type) { + e.enc_32_64_rec( + raw_bitcast + .bind(vector(lane_type, sse_vector_size)) + .bind(*float_type), + rec_null_fpr, + 0, + ); + e.enc_32_64_rec( + raw_bitcast + .bind(*float_type) + .bind(vector(lane_type, sse_vector_size)), + rec_null_fpr, + 0, + ); + } + } + + // SIMD conversions + { + let fcvt_from_sint_32 = fcvt_from_sint + .bind(vector(F32, sse_vector_size)) + .bind(vector(I32, sse_vector_size)); + e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS)); + + e.enc_32_64_maybe_isap( + x86_vcvtudq2ps, + rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS), + Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F + ); + + e.enc_both_inferred( + x86_cvtt2si + .bind(vector(I32, sse_vector_size)) + .bind(vector(F32, sse_vector_size)), + rec_furm.opcodes(&CVTTPS2DQ), + ); + } + + // SIMD vconst for special cases (all zeroes, all ones) + // this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this + // encoding first + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let instruction = vconst.bind(vector(ty, sse_vector_size)); + + let is_zero_128bit = + InstructionPredicate::new_is_all_zeroes(&*formats.unary_const, "constant_handle"); + let template = rec_vconst_optimized.opcodes(&PXOR).infer_rex(); + e.enc_32_64_func(instruction.clone(), template, |builder| { + builder.inst_predicate(is_zero_128bit) + }); + + let is_ones_128bit = + InstructionPredicate::new_is_all_ones(&*formats.unary_const, "constant_handle"); + let template = rec_vconst_optimized.opcodes(&PCMPEQB).infer_rex(); + e.enc_32_64_func(instruction, template, |builder| { + builder.inst_predicate(is_ones_128bit) + }); + } + + // SIMD vconst using MOVUPS + // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have + // to guarantee that the constants are aligned when emitted and there is currently no mechanism + // for that; alternately, constants could be loaded into XMM registers using a sequence like: + // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored + // in memory) but some performance measurements are needed. + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let instruction = vconst.bind(vector(ty, sse_vector_size)); + let template = rec_vconst.opcodes(&MOVUPS_LOAD); + e.enc_both_inferred(instruction, template); // from SSE + } + + // SIMD register movement: store, load, spill, fill, regmove, etc. All of these use encodings of + // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have + // alignment or type-specific encodings, see https://github.com/bytecodealliance/wasmtime/issues/1124). + // Also, it would be ideal to infer REX prefixes for all of these instructions but for the + // time being only instructions with common recipes have `infer_rex()` support. 
+ for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + // Store + let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any); + e.enc_both_inferred(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE)); + e.enc_both_inferred(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE)); + e.enc_both_inferred(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE)); + + // Store complex + let bound_store_complex = store_complex.bind(vector(ty, sse_vector_size)); + e.enc_both( + bound_store_complex.clone(), + rec_fstWithIndex.opcodes(&MOVUPS_STORE), + ); + e.enc_both( + bound_store_complex.clone(), + rec_fstWithIndexDisp8.opcodes(&MOVUPS_STORE), + ); + e.enc_both( + bound_store_complex, + rec_fstWithIndexDisp32.opcodes(&MOVUPS_STORE), + ); + + // Load + let bound_load = load.bind(vector(ty, sse_vector_size)).bind(Any); + e.enc_both_inferred(bound_load.clone(), rec_fld.opcodes(&MOVUPS_LOAD)); + e.enc_both_inferred(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD)); + e.enc_both_inferred(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD)); + + // Load complex + let bound_load_complex = load_complex.bind(vector(ty, sse_vector_size)); + e.enc_both( + bound_load_complex.clone(), + rec_fldWithIndex.opcodes(&MOVUPS_LOAD), + ); + e.enc_both( + bound_load_complex.clone(), + rec_fldWithIndexDisp8.opcodes(&MOVUPS_LOAD), + ); + e.enc_both( + bound_load_complex, + rec_fldWithIndexDisp32.opcodes(&MOVUPS_LOAD), + ); + + // Spill + let bound_spill = spill.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE)); + let bound_regspill = regspill.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE)); + + // Fill + let bound_fill = fill.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD)); + let bound_regfill = regfill.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD)); + let bound_fill_nop = fill_nop.bind(vector(ty, sse_vector_size)); + e.enc_32_64_rec(bound_fill_nop, rec_ffillnull, 0); + + // Regmove + let bound_regmove = regmove.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD)); + + // Copy + let bound_copy = copy.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD)); + let bound_copy_to_ssa = copy_to_ssa.bind(vector(ty, sse_vector_size)); + e.enc_both(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD)); + let bound_copy_nop = copy_nop.bind(vector(ty, sse_vector_size)); + e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0); + } + + // SIMD load extend + for (inst, opcodes) in &[ + (uload8x8, &PMOVZXBW), + (uload16x4, &PMOVZXWD), + (uload32x2, &PMOVZXDQ), + (sload8x8, &PMOVSXBW), + (sload16x4, &PMOVSXWD), + (sload32x2, &PMOVSXDQ), + ] { + let isap = Some(use_sse41_simd); + for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] { + let inst = *inst; + let template = recipe.opcodes(*opcodes); + e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap); + e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap); + } + } + + // SIMD load extend (complex addressing) + let is_load_complex_length_two = + InstructionPredicate::new_length_equals(&*formats.load_complex, 2); + for (inst, opcodes) in &[ + (uload8x8_complex, &PMOVZXBW), + (uload16x4_complex, &PMOVZXWD), + (uload32x2_complex, &PMOVZXDQ), + (sload8x8_complex, &PMOVSXBW), + (sload16x4_complex, &PMOVSXWD), + (sload32x2_complex, &PMOVSXDQ), + ] 
{ + for recipe in &[ + rec_fldWithIndex, + rec_fldWithIndexDisp8, + rec_fldWithIndexDisp32, + ] { + let template = recipe.opcodes(*opcodes); + let predicate = |encoding: EncodingBuilder| { + encoding + .isa_predicate(use_sse41_simd) + .inst_predicate(is_load_complex_length_two.clone()) + }; + e.enc32_func(inst.clone(), template.clone(), predicate); + // No infer_rex calculator for these recipes; place REX version first as in enc_x86_64. + e.enc64_func(inst.clone(), template.rex(), predicate); + e.enc64_func(inst.clone(), template, predicate); + } + } + + // SIMD integer addition + for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] { + let iadd = iadd.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(iadd, rec_fa.opcodes(*opcodes)); + } + + // SIMD integer saturating addition + e.enc_both_inferred( + sadd_sat.bind(vector(I8, sse_vector_size)), + rec_fa.opcodes(&PADDSB), + ); + e.enc_both_inferred( + sadd_sat.bind(vector(I16, sse_vector_size)), + rec_fa.opcodes(&PADDSW), + ); + e.enc_both_inferred( + uadd_sat.bind(vector(I8, sse_vector_size)), + rec_fa.opcodes(&PADDUSB), + ); + e.enc_both_inferred( + uadd_sat.bind(vector(I16, sse_vector_size)), + rec_fa.opcodes(&PADDUSW), + ); + + // SIMD integer subtraction + let isub = shared.by_name("isub"); + for (ty, opcodes) in &[(I8, &PSUBB), (I16, &PSUBW), (I32, &PSUBD), (I64, &PSUBQ)] { + let isub = isub.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(isub, rec_fa.opcodes(*opcodes)); + } + + // SIMD integer saturating subtraction + e.enc_both_inferred( + ssub_sat.bind(vector(I8, sse_vector_size)), + rec_fa.opcodes(&PSUBSB), + ); + e.enc_both_inferred( + ssub_sat.bind(vector(I16, sse_vector_size)), + rec_fa.opcodes(&PSUBSW), + ); + e.enc_both_inferred( + usub_sat.bind(vector(I8, sse_vector_size)), + rec_fa.opcodes(&PSUBUSB), + ); + e.enc_both_inferred( + usub_sat.bind(vector(I16, sse_vector_size)), + rec_fa.opcodes(&PSUBUSW), + ); + + // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16 + // and I64x2 and these are (at the time of writing) not necessary for WASM SIMD. + for (ty, opcodes, isap) in &[ + (I16, &PMULLW[..], None), + (I32, &PMULLD[..], Some(use_sse41_simd)), + ] { + let imul = imul.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap); + } + + // SIMD multiplication with lane expansion. + e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ)); + + // SIMD integer multiplication for I64x2 using a AVX512. + { + e.enc_32_64_maybe_isap( + x86_pmullq, + rec_evex_reg_vvvv_rm_128.opcodes(&VPMULLQ).w(), + Some(use_avx512dq_simd), // TODO need an OR predicate to join with AVX512VL + ); + } + + // SIMD integer average with rounding. + for (ty, opcodes) in &[(I8, &PAVGB[..]), (I16, &PAVGW[..])] { + let avgr = avg_round.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(avgr, rec_fa.opcodes(opcodes)); + } + + // SIMD integer absolute value. 
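+ // PABSB/PABSW/PABSD arrived with SSSE3, hence the predicate below; there is no SSE
+ // encoding for a 64x2 absolute value (VPABSQ is AVX-512 only), so I64 is not
+ // handled here.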
+ for (ty, opcodes) in &[(I8, &PABSB[..]), (I16, &PABSW[..]), (I32, &PABSD)] { + let iabs = iabs.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(iabs, rec_furm.opcodes(opcodes), Some(use_ssse3_simd)); + } + + // SIMD logical operations + let band = shared.by_name("band"); + let band_not = shared.by_name("band_not"); + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + // and + let band = band.bind(vector(ty, sse_vector_size)); + e.enc_both_inferred(band, rec_fa.opcodes(&PAND)); + + // and not (note flipped recipe operands to match band_not order) + let band_not = band_not.bind(vector(ty, sse_vector_size)); + e.enc_both_inferred(band_not, rec_fax.opcodes(&PANDN)); + + // or + let bor = bor.bind(vector(ty, sse_vector_size)); + e.enc_both_inferred(bor, rec_fa.opcodes(&POR)); + + // xor + let bxor = bxor.bind(vector(ty, sse_vector_size)); + e.enc_both_inferred(bxor, rec_fa.opcodes(&PXOR)); + + // ptest + let x86_ptest = x86_ptest.bind(vector(ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd)); + } + + // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement + // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an + // I128x1 but restrictions on the type builder prevent this; the general idea here is that + // the upper bits are all zeroed and do not form parts of any separate lane. See + // https://github.com/bytecodealliance/wasmtime/issues/1140. + e.enc_both_inferred( + bitcast.bind(vector(I64, sse_vector_size)).bind(I32), + rec_frurm.opcodes(&MOVD_LOAD_XMM), + ); + e.enc64( + bitcast.bind(vector(I64, sse_vector_size)).bind(I64), + rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(), + ); + + // SIMD shift left + for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] { + let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(x86_psll, rec_fa.opcodes(*opcodes)); + } + + // SIMD shift right (logical) + for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] { + let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(x86_psrl, rec_fa.opcodes(*opcodes)); + } + + // SIMD shift right (arithmetic) + for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] { + let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(x86_psra, rec_fa.opcodes(*opcodes)); + } + + // SIMD immediate shift + for (ty, opcodes) in &[(I16, &PS_W_IMM), (I32, &PS_D_IMM), (I64, &PS_Q_IMM)] { + let ishl_imm = ishl_imm.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6)); + + let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2)); + + // One exception: PSRAQ does not exist in for 64x2 in SSE2, it requires a higher CPU feature set. 
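+ // (A 64-bit arithmetic shift by immediate only exists as the EVEX-encoded VPSRAQ in
+ // AVX-512, so the I64 case is skipped here and presumably left to legalization.)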
+ if *ty != I64 { + let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4)); + } + } + + // SIMD integer comparisons + { + use IntCC::*; + for (ty, cc, opcodes, isa_predicate) in &[ + (I8, Equal, &PCMPEQB[..], None), + (I16, Equal, &PCMPEQW[..], None), + (I32, Equal, &PCMPEQD[..], None), + (I64, Equal, &PCMPEQQ[..], Some(use_sse41_simd)), + (I8, SignedGreaterThan, &PCMPGTB[..], None), + (I16, SignedGreaterThan, &PCMPGTW[..], None), + (I32, SignedGreaterThan, &PCMPGTD[..], None), + (I64, SignedGreaterThan, &PCMPGTQ, Some(use_sse42_simd)), + ] { + let instruction = icmp + .bind(Immediate::IntCC(*cc)) + .bind(vector(*ty, sse_vector_size)); + let template = rec_icscc_fpr.opcodes(opcodes); + e.enc_both_inferred_maybe_isap(instruction, template, *isa_predicate); + } + } + + // SIMD min/max + for (ty, inst, opcodes, isa_predicate) in &[ + (I8, x86_pmaxs, &PMAXSB[..], Some(use_sse41_simd)), + (I16, x86_pmaxs, &PMAXSW[..], None), + (I32, x86_pmaxs, &PMAXSD[..], Some(use_sse41_simd)), + (I8, x86_pmaxu, &PMAXUB[..], None), + (I16, x86_pmaxu, &PMAXUW[..], Some(use_sse41_simd)), + (I32, x86_pmaxu, &PMAXUD[..], Some(use_sse41_simd)), + (I8, x86_pmins, &PMINSB[..], Some(use_sse41_simd)), + (I16, x86_pmins, &PMINSW[..], None), + (I32, x86_pmins, &PMINSD[..], Some(use_sse41_simd)), + (I8, x86_pminu, &PMINUB[..], None), + (I16, x86_pminu, &PMINUW[..], Some(use_sse41_simd)), + (I32, x86_pminu, &PMINUD[..], Some(use_sse41_simd)), + ] { + let inst = inst.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate); + } + + // SIMD float comparisons + e.enc_both_inferred( + fcmp.bind(vector(F32, sse_vector_size)), + rec_pfcmp.opcodes(&CMPPS), + ); + e.enc_both_inferred( + fcmp.bind(vector(F64, sse_vector_size)), + rec_pfcmp.opcodes(&CMPPD), + ); + + // SIMD float arithmetic + for (ty, inst, opcodes) in &[ + (F32, fadd, &ADDPS[..]), + (F64, fadd, &ADDPD[..]), + (F32, fsub, &SUBPS[..]), + (F64, fsub, &SUBPD[..]), + (F32, fmul, &MULPS[..]), + (F64, fmul, &MULPD[..]), + (F32, fdiv, &DIVPS[..]), + (F64, fdiv, &DIVPD[..]), + (F32, x86_fmin, &MINPS[..]), + (F64, x86_fmin, &MINPD[..]), + (F32, x86_fmax, &MAXPS[..]), + (F64, x86_fmax, &MAXPD[..]), + ] { + let inst = inst.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(inst, rec_fa.opcodes(opcodes)); + } + for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] { + let inst = inst.bind(vector(*ty, sse_vector_size)); + e.enc_both_inferred(inst, rec_furm.opcodes(opcodes)); + } +} + +#[inline(never)] +fn define_entity_ref( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + settings: &SettingGroup, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + let formats = &shared_defs.formats; + + // Shorthands for instructions. + let const_addr = shared.by_name("const_addr"); + let func_addr = shared.by_name("func_addr"); + let stack_addr = shared.by_name("stack_addr"); + let symbol_value = shared.by_name("symbol_value"); + + // Shorthands for recipes. 
+ let rec_allones_fnaddr4 = r.template("allones_fnaddr4"); + let rec_allones_fnaddr8 = r.template("allones_fnaddr8"); + let rec_fnaddr4 = r.template("fnaddr4"); + let rec_fnaddr8 = r.template("fnaddr8"); + let rec_const_addr = r.template("const_addr"); + let rec_got_fnaddr8 = r.template("got_fnaddr8"); + let rec_got_gvaddr8 = r.template("got_gvaddr8"); + let rec_gvaddr4 = r.template("gvaddr4"); + let rec_gvaddr8 = r.template("gvaddr8"); + let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8"); + let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8"); + let rec_spaddr_id = r.template("spaddr_id"); + + // Predicates shorthands. + let all_ones_funcaddrs_and_not_is_pic = + settings.predicate_by_name("all_ones_funcaddrs_and_not_is_pic"); + let is_pic = settings.predicate_by_name("is_pic"); + let not_all_ones_funcaddrs_and_not_is_pic = + settings.predicate_by_name("not_all_ones_funcaddrs_and_not_is_pic"); + let not_is_pic = settings.predicate_by_name("not_is_pic"); + + // Function addresses. + + // Non-PIC, all-ones funcaddresses. + e.enc32_isap( + func_addr.bind(I32), + rec_fnaddr4.opcodes(&MOV_IMM), + not_all_ones_funcaddrs_and_not_is_pic, + ); + e.enc64_isap( + func_addr.bind(I64), + rec_fnaddr8.opcodes(&MOV_IMM).rex().w(), + not_all_ones_funcaddrs_and_not_is_pic, + ); + + // Non-PIC, all-zeros funcaddresses. + e.enc32_isap( + func_addr.bind(I32), + rec_allones_fnaddr4.opcodes(&MOV_IMM), + all_ones_funcaddrs_and_not_is_pic, + ); + e.enc64_isap( + func_addr.bind(I64), + rec_allones_fnaddr8.opcodes(&MOV_IMM).rex().w(), + all_ones_funcaddrs_and_not_is_pic, + ); + + // 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's pc-relative field. + let is_colocated_func = + InstructionPredicate::new_is_colocated_func(&*formats.func_addr, "func_ref"); + e.enc64_instp( + func_addr.bind(I64), + rec_pcrel_fnaddr8.opcodes(&LEA).rex().w(), + is_colocated_func, + ); + + // 64-bit, non-colocated, PIC. + e.enc64_isap( + func_addr.bind(I64), + rec_got_fnaddr8.opcodes(&MOV_LOAD).rex().w(), + is_pic, + ); + + // Global addresses. + + // Non-PIC. + e.enc32_isap( + symbol_value.bind(I32), + rec_gvaddr4.opcodes(&MOV_IMM), + not_is_pic, + ); + e.enc64_isap( + symbol_value.bind(I64), + rec_gvaddr8.opcodes(&MOV_IMM).rex().w(), + not_is_pic, + ); + + // PIC, colocated. + e.enc64_func( + symbol_value.bind(I64), + rec_pcrel_gvaddr8.opcodes(&LEA).rex().w(), + |encoding| { + encoding + .isa_predicate(is_pic) + .inst_predicate(InstructionPredicate::new_is_colocated_data(formats)) + }, + ); + + // PIC, non-colocated. + e.enc64_isap( + symbol_value.bind(I64), + rec_got_gvaddr8.opcodes(&MOV_LOAD).rex().w(), + is_pic, + ); + + // Stack addresses. + // + // TODO: Add encoding rules for stack_load and stack_store, so that they + // don't get legalized to stack_addr + load/store. + e.enc64(stack_addr.bind(I64), rec_spaddr_id.opcodes(&LEA).rex().w()); + e.enc32(stack_addr.bind(I32), rec_spaddr_id.opcodes(&LEA)); + + // Constant addresses (PIC). + e.enc64(const_addr.bind(I64), rec_const_addr.opcodes(&LEA).rex().w()); + e.enc32(const_addr.bind(I32), rec_const_addr.opcodes(&LEA)); +} + +/// Control flow opcodes. +#[inline(never)] +fn define_control_flow( + e: &mut PerCpuModeEncodings, + shared_defs: &SharedDefinitions, + settings: &SettingGroup, + r: &RecipeGroup, +) { + let shared = &shared_defs.instructions; + let formats = &shared_defs.formats; + + // Shorthands for instructions. 
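Illustrative aside (standalone sketch, not part of the vendored file; the enum and helper below are invented for explanation). The function-address encodings above are selected by two properties: whether the code is position-independent (`is_pic`) and whether the callee is colocated in the same module. Setting aside the all-ones-funcaddr variants, the choice roughly reduces to the decision table sketched here: colocated callees get a RIP-relative LEA, non-colocated PIC goes through the GOT, and non-PIC materializes an absolute immediate.

    #[derive(Debug, PartialEq)]
    enum FuncAddrKind {
        AbsoluteImm64,  // fnaddr-style:       mov reg, imm64 with an absolute relocation
        RipRelativeLea, // pcrel_fnaddr-style: lea reg, [rip + disp32] to a colocated function
        GotLoad,        // got_fnaddr-style:   mov reg, [rip + disp32] through the GOT
    }

    fn pick(is_pic: bool, colocated: bool) -> FuncAddrKind {
        match (colocated, is_pic) {
            (true, _) => FuncAddrKind::RipRelativeLea,
            (false, true) => FuncAddrKind::GotLoad,
            (false, false) => FuncAddrKind::AbsoluteImm64,
        }
    }

    fn main() {
        assert_eq!(pick(true, true), FuncAddrKind::RipRelativeLea);
        assert_eq!(pick(true, false), FuncAddrKind::GotLoad);
        assert_eq!(pick(false, false), FuncAddrKind::AbsoluteImm64);
    }

The control-flow shorthands introduced by the comment above continue immediately below.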
+ let brff = shared.by_name("brff"); + let brif = shared.by_name("brif"); + let brnz = shared.by_name("brnz"); + let brz = shared.by_name("brz"); + let call = shared.by_name("call"); + let call_indirect = shared.by_name("call_indirect"); + let debugtrap = shared.by_name("debugtrap"); + let indirect_jump_table_br = shared.by_name("indirect_jump_table_br"); + let jump = shared.by_name("jump"); + let jump_table_base = shared.by_name("jump_table_base"); + let jump_table_entry = shared.by_name("jump_table_entry"); + let return_ = shared.by_name("return"); + let trap = shared.by_name("trap"); + let trapff = shared.by_name("trapff"); + let trapif = shared.by_name("trapif"); + let resumable_trap = shared.by_name("resumable_trap"); + + // Shorthands for recipes. + let rec_brfb = r.template("brfb"); + let rec_brfd = r.template("brfd"); + let rec_brib = r.template("brib"); + let rec_brid = r.template("brid"); + let rec_call_id = r.template("call_id"); + let rec_call_plt_id = r.template("call_plt_id"); + let rec_call_r = r.template("call_r"); + let rec_debugtrap = r.recipe("debugtrap"); + let rec_indirect_jmp = r.template("indirect_jmp"); + let rec_jmpb = r.template("jmpb"); + let rec_jmpd = r.template("jmpd"); + let rec_jt_base = r.template("jt_base"); + let rec_jt_entry = r.template("jt_entry"); + let rec_ret = r.template("ret"); + let rec_t8jccb_abcd = r.template("t8jccb_abcd"); + let rec_t8jccd_abcd = r.template("t8jccd_abcd"); + let rec_t8jccd_long = r.template("t8jccd_long"); + let rec_tjccb = r.template("tjccb"); + let rec_tjccd = r.template("tjccd"); + let rec_trap = r.template("trap"); + let rec_trapif = r.recipe("trapif"); + let rec_trapff = r.recipe("trapff"); + + // Predicates shorthands. + let is_pic = settings.predicate_by_name("is_pic"); + + // Call/return + + // 32-bit, both PIC and non-PIC. + e.enc32(call, rec_call_id.opcodes(&CALL_RELATIVE)); + + // 64-bit, colocated, both PIC and non-PIC. Use the call instruction's pc-relative field. + let is_colocated_func = InstructionPredicate::new_is_colocated_func(&*formats.call, "func_ref"); + e.enc64_instp(call, rec_call_id.opcodes(&CALL_RELATIVE), is_colocated_func); + + // 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version, since non-PIC + // is currently using the large model, which requires calls be lowered to + // func_addr+call_indirect. + e.enc64_isap(call, rec_call_plt_id.opcodes(&CALL_RELATIVE), is_pic); + + e.enc32( + call_indirect.bind(I32), + rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2), + ); + e.enc64( + call_indirect.bind(I64), + rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2).rex(), + ); + e.enc64( + call_indirect.bind(I64), + rec_call_r.opcodes(&JUMP_ABSOLUTE).rrr(2), + ); + + e.enc32(return_, rec_ret.opcodes(&RET_NEAR)); + e.enc64(return_, rec_ret.opcodes(&RET_NEAR)); + + // Branches. + e.enc32(jump, rec_jmpb.opcodes(&JUMP_SHORT)); + e.enc64(jump, rec_jmpb.opcodes(&JUMP_SHORT)); + e.enc32(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE)); + e.enc64(jump, rec_jmpd.opcodes(&JUMP_NEAR_RELATIVE)); + + e.enc_both(brif, rec_brib.opcodes(&JUMP_SHORT_IF_OVERFLOW)); + e.enc_both(brif, rec_brid.opcodes(&JUMP_NEAR_IF_OVERFLOW)); + + // Not all float condition codes are legal, see `supported_floatccs`. + e.enc_both(brff, rec_brfb.opcodes(&JUMP_SHORT_IF_OVERFLOW)); + e.enc_both(brff, rec_brfd.opcodes(&JUMP_NEAR_IF_OVERFLOW)); + + // Note that the tjccd opcode will be prefixed with 0x0f. 
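Illustrative aside (standalone sketch, not part of the vendored file; the helper below is hypothetical). Two recipes are registered for plain `jump` above because x86 has a one-byte-displacement short form (JMP rel8, opcode 0xEB) and a four-byte near form (JMP rel32, opcode 0xE9); conditional jumps likewise have a 0x70+cc short form and a two-byte 0x0F 0x80+cc near form, which appears to be what the "prefixed with 0x0f" note above refers to. The meta crate only declares both shapes; choosing between them is left to later stages of the codegen crate once displacements are known. A minimal model of the two shapes:

    // Hypothetical encoder fragment: emit JMP rel8 when the displacement fits,
    // otherwise JMP rel32. The real size selection is done elsewhere in Cranelift.
    fn encode_jump(disp: i64) -> Vec<u8> {
        match i8::try_from(disp) {
            Ok(rel8) => vec![0xEB, rel8 as u8], // short form
            Err(_) => {
                let rel32 = i32::try_from(disp).expect("branch displacement out of range");
                let mut bytes = vec![0xE9]; // near form, little-endian displacement
                bytes.extend_from_slice(&rel32.to_le_bytes());
                bytes
            }
        }
    }

    fn main() {
        assert_eq!(encode_jump(0x10), vec![0xEB, 0x10]);
        assert_eq!(encode_jump(0x1234), vec![0xE9, 0x34, 0x12, 0x00, 0x00]);
    }

The brz/brnz encodings that the note above describes follow immediately.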
+ e.enc_i32_i64_explicit_rex(brz, rec_tjccb.opcodes(&JUMP_SHORT_IF_EQUAL)); + e.enc_i32_i64_explicit_rex(brz, rec_tjccd.opcodes(&TEST_BYTE_REG)); + e.enc_i32_i64_explicit_rex(brnz, rec_tjccb.opcodes(&JUMP_SHORT_IF_NOT_EQUAL)); + e.enc_i32_i64_explicit_rex(brnz, rec_tjccd.opcodes(&TEST_REG)); + + // Branch on a b1 value in a register only looks at the low 8 bits. See also + // bint encodings below. + // + // Start with the worst-case encoding for X86_32 only. The register allocator + // can't handle a branch with an ABCD-constrained operand. + e.enc32(brz.bind(B1), rec_t8jccd_long.opcodes(&TEST_BYTE_REG)); + e.enc32(brnz.bind(B1), rec_t8jccd_long.opcodes(&TEST_REG)); + + e.enc_both(brz.bind(B1), rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_EQUAL)); + e.enc_both(brz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_BYTE_REG)); + e.enc_both( + brnz.bind(B1), + rec_t8jccb_abcd.opcodes(&JUMP_SHORT_IF_NOT_EQUAL), + ); + e.enc_both(brnz.bind(B1), rec_t8jccd_abcd.opcodes(&TEST_REG)); + + // Jump tables. + e.enc64( + jump_table_entry.bind(I64), + rec_jt_entry.opcodes(&MOVSXD).rex().w(), + ); + e.enc32(jump_table_entry.bind(I32), rec_jt_entry.opcodes(&MOV_LOAD)); + + e.enc64( + jump_table_base.bind(I64), + rec_jt_base.opcodes(&LEA).rex().w(), + ); + e.enc32(jump_table_base.bind(I32), rec_jt_base.opcodes(&LEA)); + + e.enc_x86_64( + indirect_jump_table_br.bind(I64), + rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4), + ); + e.enc32( + indirect_jump_table_br.bind(I32), + rec_indirect_jmp.opcodes(&JUMP_ABSOLUTE).rrr(4), + ); + + // Trap as ud2 + e.enc32(trap, rec_trap.opcodes(&UNDEFINED2)); + e.enc64(trap, rec_trap.opcodes(&UNDEFINED2)); + e.enc32(resumable_trap, rec_trap.opcodes(&UNDEFINED2)); + e.enc64(resumable_trap, rec_trap.opcodes(&UNDEFINED2)); + + // Debug trap as int3 + e.enc32_rec(debugtrap, rec_debugtrap, 0); + e.enc64_rec(debugtrap, rec_debugtrap, 0); + + e.enc32_rec(trapif, rec_trapif, 0); + e.enc64_rec(trapif, rec_trapif, 0); + e.enc32_rec(trapff, rec_trapff, 0); + e.enc64_rec(trapff, rec_trapff, 0); +} + +/// Reference type instructions. +#[inline(never)] +fn define_reftypes(e: &mut PerCpuModeEncodings, shared_defs: &SharedDefinitions, r: &RecipeGroup) { + let shared = &shared_defs.instructions; + + let is_null = shared.by_name("is_null"); + let is_invalid = shared.by_name("is_invalid"); + let null = shared.by_name("null"); + let safepoint = shared.by_name("safepoint"); + + let rec_is_zero = r.template("is_zero"); + let rec_is_invalid = r.template("is_invalid"); + let rec_pu_id_ref = r.template("pu_id_ref"); + let rec_safepoint = r.recipe("safepoint"); + + // Null references implemented as iconst 0. + e.enc32(null.bind(R32), rec_pu_id_ref.opcodes(&MOV_IMM)); + + e.enc64(null.bind(R64), rec_pu_id_ref.rex().opcodes(&MOV_IMM)); + e.enc64(null.bind(R64), rec_pu_id_ref.opcodes(&MOV_IMM)); + + // is_null, implemented by testing whether the value is 0. + e.enc_r32_r64_rex_only(is_null, rec_is_zero.opcodes(&TEST_REG)); + + // is_invalid, implemented by testing whether the value is -1. + e.enc_r32_r64_rex_only(is_invalid, rec_is_invalid.opcodes(&CMP_IMM8).rrr(7)); + + // safepoint instruction calls sink, no actual encoding. + e.enc32_rec(safepoint, rec_safepoint, 0); + e.enc64_rec(safepoint, rec_safepoint, 0); +} + +#[allow(clippy::cognitive_complexity)] +pub(crate) fn define( + shared_defs: &SharedDefinitions, + settings: &SettingGroup, + x86: &InstructionGroup, + r: &RecipeGroup, +) -> PerCpuModeEncodings { + // Definitions. 
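Illustrative aside (standalone sketch, not part of the vendored file; `modrm` is a hypothetical helper). In the reference-type encodings above, `is_invalid` compares against -1 using `rec_is_invalid.opcodes(&CMP_IMM8).rrr(7)`. The `rrr(7)` fills the reg field of the ModR/M byte, which for x86's 0x83 immediate group selects CMP (the `83 /7 ib` form, compare r/m with a sign-extended 8-bit immediate). A minimal model of how that byte is assembled:

    // Pack the three ModR/M fields: mod (2 bits), reg or opcode extension (3 bits), r/m (3 bits).
    fn modrm(mode: u8, reg: u8, rm: u8) -> u8 {
        (mode << 6) | ((reg & 0b111) << 3) | (rm & 0b111)
    }

    fn main() {
        // `cmp rax, -1`  =>  REX.W 0x83 /7 ib  =>  48 83 F8 FF
        let encoding = [0x48, 0x83, modrm(0b11, 7, 0 /* rax */), 0xFF];
        assert_eq!(encoding, [0x48, 0x83, 0xF8, 0xFF]);
    }

The top-level `define` driver opened just above continues immediately below.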
+ let mut e = PerCpuModeEncodings::new(); + + define_moves(&mut e, shared_defs, r); + define_memory(&mut e, shared_defs, x86, r); + define_fpu_moves(&mut e, shared_defs, r); + define_fpu_memory(&mut e, shared_defs, r); + define_fpu_ops(&mut e, shared_defs, settings, x86, r); + define_alu(&mut e, shared_defs, settings, x86, r); + define_simd(&mut e, shared_defs, settings, x86, r); + define_entity_ref(&mut e, shared_defs, settings, r); + define_control_flow(&mut e, shared_defs, settings, r); + define_reftypes(&mut e, shared_defs, r); + + let x86_elf_tls_get_addr = x86.by_name("x86_elf_tls_get_addr"); + let x86_macho_tls_get_addr = x86.by_name("x86_macho_tls_get_addr"); + + let rec_elf_tls_get_addr = r.recipe("elf_tls_get_addr"); + let rec_macho_tls_get_addr = r.recipe("macho_tls_get_addr"); + + e.enc64_rec(x86_elf_tls_get_addr, rec_elf_tls_get_addr, 0); + e.enc64_rec(x86_macho_tls_get_addr, rec_macho_tls_get_addr, 0); + + e +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs new file mode 100644 index 0000000000..7acd2e2c50 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/instructions.rs @@ -0,0 +1,723 @@ +#![allow(non_snake_case)] + +use crate::cdsl::instructions::{ + AllInstructions, InstructionBuilder as Inst, InstructionGroup, InstructionGroupBuilder, +}; +use crate::cdsl::operands::Operand; +use crate::cdsl::types::ValueType; +use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar}; +use crate::shared::entities::EntityRefs; +use crate::shared::formats::Formats; +use crate::shared::immediates::Immediates; +use crate::shared::types; + +#[allow(clippy::many_single_char_names)] +pub(crate) fn define( + mut all_instructions: &mut AllInstructions, + formats: &Formats, + immediates: &Immediates, + entities: &EntityRefs, +) -> InstructionGroup { + let mut ig = InstructionGroupBuilder::new(&mut all_instructions); + + let iflags: &TypeVar = &ValueType::Special(types::Flag::IFlags.into()).into(); + + let iWord = &TypeVar::new( + "iWord", + "A scalar integer machine word", + TypeSetBuilder::new().ints(32..64).build(), + ); + let nlo = &Operand::new("nlo", iWord).with_doc("Low part of numerator"); + let nhi = &Operand::new("nhi", iWord).with_doc("High part of numerator"); + let d = &Operand::new("d", iWord).with_doc("Denominator"); + let q = &Operand::new("q", iWord).with_doc("Quotient"); + let r = &Operand::new("r", iWord).with_doc("Remainder"); + + ig.push( + Inst::new( + "x86_udivmodx", + r#" + Extended unsigned division. + + Concatenate the bits in `nhi` and `nlo` to form the numerator. + Interpret the bits as an unsigned number and divide by the unsigned + denominator `d`. Trap when `d` is zero or if the quotient is larger + than the range of the output. + + Return both quotient and remainder. + "#, + &formats.ternary, + ) + .operands_in(vec![nlo, nhi, d]) + .operands_out(vec![q, r]) + .can_trap(true), + ); + + ig.push( + Inst::new( + "x86_sdivmodx", + r#" + Extended signed division. + + Concatenate the bits in `nhi` and `nlo` to form the numerator. + Interpret the bits as a signed number and divide by the signed + denominator `d`. Trap when `d` is zero or if the quotient is outside + the range of the output. + + Return both quotient and remainder. 
+ "#, + &formats.ternary, + ) + .operands_in(vec![nlo, nhi, d]) + .operands_out(vec![q, r]) + .can_trap(true), + ); + + let argL = &Operand::new("argL", iWord); + let argR = &Operand::new("argR", iWord); + let resLo = &Operand::new("resLo", iWord); + let resHi = &Operand::new("resHi", iWord); + + ig.push( + Inst::new( + "x86_umulx", + r#" + Unsigned integer multiplication, producing a double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. + "#, + &formats.binary, + ) + .operands_in(vec![argL, argR]) + .operands_out(vec![resLo, resHi]), + ); + + ig.push( + Inst::new( + "x86_smulx", + r#" + Signed integer multiplication, producing a double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. + "#, + &formats.binary, + ) + .operands_in(vec![argL, argR]) + .operands_out(vec![resLo, resHi]), + ); + + let Float = &TypeVar::new( + "Float", + "A scalar or vector floating point number", + TypeSetBuilder::new() + .floats(Interval::All) + .simd_lanes(Interval::All) + .build(), + ); + let IntTo = &TypeVar::new( + "IntTo", + "An integer type with the same number of lanes", + TypeSetBuilder::new() + .ints(32..64) + .simd_lanes(Interval::All) + .build(), + ); + let x = &Operand::new("x", Float); + let a = &Operand::new("a", IntTo); + + ig.push( + Inst::new( + "x86_cvtt2si", + r#" + Convert with truncation floating point to signed integer. + + The source floating point operand is converted to a signed integer by + rounding towards zero. If the result can't be represented in the output + type, returns the smallest signed value the output type can represent. + + This instruction does not trap. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + let f32x4 = &TypeVar::new( + "f32x4", + "A floating point number", + TypeSetBuilder::new() + .floats(32..32) + .simd_lanes(4..4) + .build(), + ); + let i32x4 = &TypeVar::new( + "i32x4", + "An integer type with the same number of lanes", + TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(), + ); + let x = &Operand::new("x", i32x4); + let a = &Operand::new("a", f32x4); + + ig.push( + Inst::new( + "x86_vcvtudq2ps", + r#" + Convert unsigned integer to floating point. + + Convert packed doubleword unsigned integers to packed single-precision floating-point + values. This instruction does not trap. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", Float); + let a = &Operand::new("a", Float); + let y = &Operand::new("y", Float); + + ig.push( + Inst::new( + "x86_fmin", + r#" + Floating point minimum with x86 semantics. + + This is equivalent to the C ternary operator `x < y ? x : y` which + differs from `fmin` when either operand is NaN or when comparing + +0.0 to -0.0. + + When the two operands don't compare as LT, `y` is returned unchanged, + even if it is a signalling NaN. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_fmax", + r#" + Floating point maximum with x86 semantics. + + This is equivalent to the C ternary operator `x > y ? x : y` which + differs from `fmax` when either operand is NaN or when comparing + +0.0 to -0.0. + + When the two operands don't compare as GT, `y` is returned unchanged, + even if it is a signalling NaN. 
+ "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", iWord); + + ig.push( + Inst::new( + "x86_push", + r#" + Pushes a value onto the stack. + + Decrements the stack pointer and stores the specified value on to the top. + + This is polymorphic in i32 and i64. However, it is only implemented for i64 + in 64-bit mode, and only for i32 in 32-bit mode. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .other_side_effects(true) + .can_store(true), + ); + + ig.push( + Inst::new( + "x86_pop", + r#" + Pops a value from the stack. + + Loads a value from the top of the stack and then increments the stack + pointer. + + This is polymorphic in i32 and i64. However, it is only implemented for i64 + in 64-bit mode, and only for i32 in 32-bit mode. + "#, + &formats.nullary, + ) + .operands_out(vec![x]) + .other_side_effects(true) + .can_load(true), + ); + + let y = &Operand::new("y", iWord); + let rflags = &Operand::new("rflags", iflags); + + ig.push( + Inst::new( + "x86_bsr", + r#" + Bit Scan Reverse -- returns the bit-index of the most significant 1 + in the word. Result is undefined if the argument is zero. However, it + sets the Z flag depending on the argument, so it is at least easy to + detect and handle that case. + + This is polymorphic in i32 and i64. It is implemented for both i64 and + i32 in 64-bit mode, and only for i32 in 32-bit mode. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![y, rflags]), + ); + + ig.push( + Inst::new( + "x86_bsf", + r#" + Bit Scan Forwards -- returns the bit-index of the least significant 1 + in the word. Is otherwise identical to 'bsr', just above. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![y, rflags]), + ); + + let uimm8 = &immediates.uimm8; + let TxN = &TypeVar::new( + "TxN", + "A SIMD vector type", + TypeSetBuilder::new() + .ints(Interval::All) + .floats(Interval::All) + .bools(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let a = &Operand::new("a", TxN).with_doc("A vector value (i.e. held in an XMM register)"); + let b = &Operand::new("b", TxN).with_doc("A vector value (i.e. held in an XMM register)"); + let i = &Operand::new("i", uimm8).with_doc("An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details"); + + ig.push( + Inst::new( + "x86_pshufd", + r#" + Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended + register and re-orders the data according to the passed immediate byte. + "#, + &formats.binary_imm8, + ) + .operands_in(vec![a, i]) // TODO allow copying from memory here (need more permissive type than TxN) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pshufb", + r#" + Packed Shuffle Bytes -- re-orders data in an extended register using a shuffle + mask from either memory or another extended register + "#, + &formats.binary, + ) + .operands_in(vec![a, b]) // TODO allow re-ordering from memory here (need more permissive type than TxN) + .operands_out(vec![a]), + ); + + let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b"); + ig.push( + Inst::new( + "x86_pblendw", + r#" + Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a + lane in ``b``: if the bit is set, the lane is copied into ``a``. 
+ "#, + &formats.ternary_imm8, + ) + .operands_in(vec![a, b, mask]) + .operands_out(vec![a]), + ); + + let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index"); + let x = &Operand::new("x", TxN); + let a = &Operand::new("a", &TxN.lane_of()); + + ig.push( + Inst::new( + "x86_pextr", + r#" + Extract lane ``Idx`` from ``x``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. + "#, + &formats.binary_imm8, + ) + .operands_in(vec![x, Idx]) + .operands_out(vec![a]), + ); + + let IBxN = &TypeVar::new( + "IBxN", + "A SIMD vector type containing only booleans and integers", + TypeSetBuilder::new() + .ints(Interval::All) + .bools(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &Operand::new("x", IBxN); + let y = &Operand::new("y", &IBxN.lane_of()).with_doc("New lane value"); + let a = &Operand::new("a", IBxN); + + ig.push( + Inst::new( + "x86_pinsr", + r#" + Insert ``y`` into ``x`` at lane ``Idx``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. + "#, + &formats.ternary_imm8, + ) + .operands_in(vec![x, y, Idx]) + .operands_out(vec![a]), + ); + + let FxN = &TypeVar::new( + "FxN", + "A SIMD vector type containing floats", + TypeSetBuilder::new() + .floats(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &Operand::new("x", FxN); + let y = &Operand::new("y", &FxN.lane_of()).with_doc("New lane value"); + let a = &Operand::new("a", FxN); + + ig.push( + Inst::new( + "x86_insertps", + r#" + Insert a lane of ``y`` into ``x`` at using ``Idx`` to encode both which lane the value is + extracted from and which it is inserted to. This is similar to x86_pinsr but inserts + floats, which are already stored in an XMM register. + "#, + &formats.ternary_imm8, + ) + .operands_in(vec![x, y, Idx]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", TxN); + let y = &Operand::new("y", TxN); + let a = &Operand::new("a", TxN); + + ig.push( + Inst::new( + "x86_punpckh", + r#" + Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional + i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation + would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane + ordering). + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_punpckl", + r#" + Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional + i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation + would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane + ordering). 
+ "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", FxN); + let y = &Operand::new("y", FxN); + let a = &Operand::new("a", FxN); + + ig.push( + Inst::new( + "x86_movsd", + r#" + Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x`` + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_movlhps", + r#" + Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x`` + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let IxN = &TypeVar::new( + "IxN", + "A SIMD vector type containing integers", + TypeSetBuilder::new() + .ints(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let I128 = &TypeVar::new( + "I128", + "A SIMD vector type containing one large integer (due to Cranelift type constraints, \ + this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \ + upper lane is concatenated with the lower lane to form the integer)", + TypeSetBuilder::new() + .ints(64..64) + .simd_lanes(2..2) + .includes_scalars(false) + .build(), + ); + + let x = &Operand::new("x", IxN).with_doc("Vector value to shift"); + let y = &Operand::new("y", I128).with_doc("Number of bits to shift"); + let a = &Operand::new("a", IxN); + + ig.push( + Inst::new( + "x86_psll", + r#" + Shift Packed Data Left Logical -- This implements the behavior of the shared instruction + ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL* + family of instructions. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_psrl", + r#" + Shift Packed Data Right Logical -- This implements the behavior of the shared instruction + ``ushr`` but alters the shift operand to live in an XMM register as expected by the PSRL* + family of instructions. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_psra", + r#" + Shift Packed Data Right Arithmetic -- This implements the behavior of the shared + instruction ``sshr`` but alters the shift operand to live in an XMM register as expected by + the PSRA* family of instructions. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let I64x2 = &TypeVar::new( + "I64x2", + "A SIMD vector type containing two 64-bit integers", + TypeSetBuilder::new() + .ints(64..64) + .simd_lanes(2..2) + .includes_scalars(false) + .build(), + ); + + let x = &Operand::new("x", I64x2); + let y = &Operand::new("y", I64x2); + let a = &Operand::new("a", I64x2); + ig.push( + Inst::new( + "x86_pmullq", + r#" + Multiply Packed Integers -- Multiply two 64x2 integers and receive a 64x2 result with + lane-wise wrapping if the result overflows. This instruction is necessary to add distinct + encodings for CPUs with newer vector features. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pmuludq", + r#" + Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2 + unsigned integers and receive a 64x2 result. This instruction avoids the need for handling + overflow as in `x86_pmullq`. 
+ "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", TxN); + let y = &Operand::new("y", TxN); + let f = &Operand::new("f", iflags); + ig.push( + Inst::new( + "x86_ptest", + r#" + Logical Compare -- PTEST will set the ZF flag if all bits in the result are 0 of the + bitwise AND of the first source operand (first operand) and the second source operand + (second operand). PTEST sets the CF flag if all bits in the result are 0 of the bitwise + AND of the second source operand (second operand) and the logical NOT of the destination + operand (first operand). + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![f]), + ); + + let x = &Operand::new("x", IxN); + let y = &Operand::new("y", IxN); + let a = &Operand::new("a", IxN); + ig.push( + Inst::new( + "x86_pmaxs", + r#" + Maximum of Packed Signed Integers -- Compare signed integers in the first and second + operand and return the maximum values. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pmaxu", + r#" + Maximum of Packed Unsigned Integers -- Compare unsigned integers in the first and second + operand and return the maximum values. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pmins", + r#" + Minimum of Packed Signed Integers -- Compare signed integers in the first and second + operand and return the minimum values. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pminu", + r#" + Minimum of Packed Unsigned Integers -- Compare unsigned integers in the first and second + operand and return the minimum values. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let c = &Operand::new("c", uimm8) + .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details"); + ig.push( + Inst::new( + "x86_palignr", + r#" + Concatenate destination and source operands, extracting a byte-aligned result shifted to + the right by `c`. + "#, + &formats.ternary_imm8, + ) + .operands_in(vec![x, y, c]) + .operands_out(vec![a]), + ); + + let i64_t = &TypeVar::new( + "i64_t", + "A scalar 64bit integer", + TypeSetBuilder::new().ints(64..64).build(), + ); + + let GV = &Operand::new("GV", &entities.global_value); + let addr = &Operand::new("addr", i64_t); + + ig.push( + Inst::new( + "x86_elf_tls_get_addr", + r#" + Elf tls get addr -- This implements the GD TLS model for ELF. The clobber output should + not be used. + "#, + &formats.unary_global_value, + ) + // This is a bit overly broad to mark as clobbering *all* the registers, because it should + // only preserve caller-saved registers. There's no way to indicate this to register + // allocation yet, though, so mark as clobbering all registers instead. + .clobbers_all_regs(true) + .operands_in(vec![GV]) + .operands_out(vec![addr]), + ); + ig.push( + Inst::new( + "x86_macho_tls_get_addr", + r#" + Mach-O tls get addr -- This implements TLS access for Mach-O. The clobber output should + not be used. + "#, + &formats.unary_global_value, + ) + // See above comment for x86_elf_tls_get_addr. 
+ .clobbers_all_regs(true) + .operands_in(vec![GV]) + .operands_out(vec![addr]), + ); + + ig.build() +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs new file mode 100644 index 0000000000..681b3104d5 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/legalize.rs @@ -0,0 +1,829 @@ +use crate::cdsl::ast::{constant, var, ExprBuilder, Literal}; +use crate::cdsl::instructions::{vector, Bindable, InstructionGroup}; +use crate::cdsl::types::{LaneType, ValueType}; +use crate::cdsl::xform::TransformGroupBuilder; +use crate::shared::types::Float::{F32, F64}; +use crate::shared::types::Int::{I16, I32, I64, I8}; +use crate::shared::Definitions as SharedDefinitions; + +#[allow(clippy::many_single_char_names)] +pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) { + let mut expand = TransformGroupBuilder::new( + "x86_expand", + r#" + Legalize instructions by expansion. + + Use x86-specific instructions if needed."#, + ) + .isa("x86") + .chain_with(shared.transform_groups.by_name("expand_flags").id); + + let mut narrow = TransformGroupBuilder::new( + "x86_narrow", + r#" + Legalize instructions by narrowing. + + Use x86-specific instructions if needed."#, + ) + .isa("x86") + .chain_with(shared.transform_groups.by_name("narrow_flags").id); + + let mut narrow_avx = TransformGroupBuilder::new( + "x86_narrow_avx", + r#" + Legalize instructions by narrowing with CPU feature checks. + + This special case converts using x86 AVX instructions where available."#, + ) + .isa("x86"); + // We cannot chain with the x86_narrow group until this group is built, see bottom of this + // function for where this is chained. + + let mut widen = TransformGroupBuilder::new( + "x86_widen", + r#" + Legalize instructions by widening. + + Use x86-specific instructions if needed."#, + ) + .isa("x86") + .chain_with(shared.transform_groups.by_name("widen").id); + + // List of instructions. 
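Illustrative aside (standalone sketch, not part of the vendored file; the `Group` struct, the rewrite functions, and the `shared_only_inst` name are all invented, and only the group names are borrowed for flavor). As the `chain_with` calls above suggest, legalization groups form a chain: an ISA-specific group tries its own rewrites first and, when nothing matches, falls through to the shared group it is chained with. A plain-Rust model of that lookup order:

    struct Group {
        name: &'static str,
        rewrite: fn(&str) -> Option<&'static str>,
        chain: Option<&'static str>, // name of the group to fall back to
    }

    fn legalize(groups: &[Group], start: &str, inst: &str) -> Option<&'static str> {
        let mut current = Some(start);
        while let Some(name) = current {
            let group = groups.iter().find(|g| g.name == name)?;
            if let Some(rewritten) = (group.rewrite)(inst) {
                return Some(rewritten);
            }
            current = group.chain;
        }
        None
    }

    fn x86_specific(inst: &str) -> Option<&'static str> {
        if inst == "clz.i64" { Some("rewritten by the x86 group") } else { None }
    }

    fn shared_fallback(inst: &str) -> Option<&'static str> {
        if inst == "shared_only_inst" { Some("rewritten by the shared group") } else { None }
    }

    fn main() {
        let groups = [
            Group { name: "x86_expand", rewrite: x86_specific, chain: Some("expand_flags") },
            Group { name: "expand_flags", rewrite: shared_fallback, chain: None },
        ];
        // Handled directly by the ISA-specific group.
        assert!(legalize(&groups, "x86_expand", "clz.i64").is_some());
        // Unknown to the x86 group: falls through to the chained shared group.
        assert!(legalize(&groups, "x86_expand", "shared_only_inst").is_some());
        // Unknown everywhere.
        assert!(legalize(&groups, "x86_expand", "iadd.i32").is_none());
    }

The instruction shorthands used by the x86 legalizations continue immediately below.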
+ let insts = &shared.instructions; + let band = insts.by_name("band"); + let bor = insts.by_name("bor"); + let clz = insts.by_name("clz"); + let ctz = insts.by_name("ctz"); + let fcmp = insts.by_name("fcmp"); + let fcvt_from_uint = insts.by_name("fcvt_from_uint"); + let fcvt_to_sint = insts.by_name("fcvt_to_sint"); + let fcvt_to_uint = insts.by_name("fcvt_to_uint"); + let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat"); + let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat"); + let fmax = insts.by_name("fmax"); + let fmin = insts.by_name("fmin"); + let iadd = insts.by_name("iadd"); + let iconst = insts.by_name("iconst"); + let imul = insts.by_name("imul"); + let ineg = insts.by_name("ineg"); + let isub = insts.by_name("isub"); + let ishl = insts.by_name("ishl"); + let ireduce = insts.by_name("ireduce"); + let popcnt = insts.by_name("popcnt"); + let sdiv = insts.by_name("sdiv"); + let selectif = insts.by_name("selectif"); + let smulhi = insts.by_name("smulhi"); + let srem = insts.by_name("srem"); + let tls_value = insts.by_name("tls_value"); + let udiv = insts.by_name("udiv"); + let umulhi = insts.by_name("umulhi"); + let ushr = insts.by_name("ushr"); + let ushr_imm = insts.by_name("ushr_imm"); + let urem = insts.by_name("urem"); + + let x86_bsf = x86_instructions.by_name("x86_bsf"); + let x86_bsr = x86_instructions.by_name("x86_bsr"); + let x86_umulx = x86_instructions.by_name("x86_umulx"); + let x86_smulx = x86_instructions.by_name("x86_smulx"); + + let imm = &shared.imm; + + // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce + // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is + // not encodable. + let a = var("a"); + let x = var("x"); + let y = var("y"); + let z = var("z"); + + for &ty in &[I8, I16, I32] { + let ishl_by_i64 = ishl.bind(ty).bind(I64); + let ireduce = ireduce.bind(I32); + expand.legalize( + def!(a = ishl_by_i64(x, y)), + vec![def!(z = ireduce(y)), def!(a = ishl(x, z))], + ); + } + + for &ty in &[I8, I16, I32] { + let ushr_by_i64 = ushr.bind(ty).bind(I64); + let ireduce = ireduce.bind(I32); + expand.legalize( + def!(a = ushr_by_i64(x, y)), + vec![def!(z = ireduce(y)), def!(a = ishl(x, z))], + ); + } + + // Division and remainder. + // + // The srem expansion requires custom code because srem INT_MIN, -1 is not + // allowed to trap. The other ops need to check avoid_div_traps. + expand.custom_legalize(sdiv, "expand_sdivrem"); + expand.custom_legalize(srem, "expand_sdivrem"); + expand.custom_legalize(udiv, "expand_udivrem"); + expand.custom_legalize(urem, "expand_udivrem"); + + // Double length (widening) multiplication. + let a = var("a"); + let x = var("x"); + let y = var("y"); + let a1 = var("a1"); + let a2 = var("a2"); + let res_lo = var("res_lo"); + let res_hi = var("res_hi"); + + expand.legalize( + def!(res_hi = umulhi(x, y)), + vec![def!((res_lo, res_hi) = x86_umulx(x, y))], + ); + + expand.legalize( + def!(res_hi = smulhi(x, y)), + vec![def!((res_lo, res_hi) = x86_smulx(x, y))], + ); + + // Floating point condition codes. + // + // The 8 condition codes in `supported_floatccs` are directly supported by a + // `ucomiss` or `ucomisd` instruction. The remaining codes need legalization + // patterns. 
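Illustrative aside (standalone scalar model, not part of the vendored file). The eq/ne rewrites declared just below work around the fact that a NaN operand only shows up in the parity flag after `ucomiss`/`ucomisd`: `eq` becomes `ord` AND `ueq`, and `ne` becomes `uno` OR `one`. Checking those identities on scalar floats:

    // Condition codes modelled on scalars: `ord` = neither input is NaN,
    // `uno` = at least one is, `ueq` = unordered or equal, `one` = ordered and not equal.
    fn fcmp_ord(x: f32, y: f32) -> bool { !x.is_nan() && !y.is_nan() }
    fn fcmp_uno(x: f32, y: f32) -> bool { x.is_nan() || y.is_nan() }
    fn fcmp_ueq(x: f32, y: f32) -> bool { fcmp_uno(x, y) || x == y }
    fn fcmp_one(x: f32, y: f32) -> bool { fcmp_ord(x, y) && x != y }

    // The two rewrites defined below, replayed on scalars.
    fn legalized_eq(x: f32, y: f32) -> bool { fcmp_ord(x, y) && fcmp_ueq(x, y) }
    fn legalized_ne(x: f32, y: f32) -> bool { fcmp_uno(x, y) || fcmp_one(x, y) }

    fn main() {
        assert!(legalized_eq(1.0, 1.0));
        assert!(!legalized_eq(f32::NAN, 1.0)); // unordered operands are never `eq`
        assert!(legalized_ne(f32::NAN, 1.0));  // ...but they are always `ne`
        assert!(!legalized_ne(2.0, 2.0));
    }

The condition-code literals and the rewrites themselves follow.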
+ + let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq"); + let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord"); + let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq"); + let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne"); + let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno"); + let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one"); + + // Equality needs an explicit `ord` test which checks the parity bit. + expand.legalize( + def!(a = fcmp(floatcc_eq, x, y)), + vec![ + def!(a1 = fcmp(floatcc_ord, x, y)), + def!(a2 = fcmp(floatcc_ueq, x, y)), + def!(a = band(a1, a2)), + ], + ); + expand.legalize( + def!(a = fcmp(floatcc_ne, x, y)), + vec![ + def!(a1 = fcmp(floatcc_uno, x, y)), + def!(a2 = fcmp(floatcc_one, x, y)), + def!(a = bor(a1, a2)), + ], + ); + + let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt"); + let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt"); + let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le"); + let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge"); + let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt"); + let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult"); + let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge"); + let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule"); + + // Inequalities that need to be reversed. + for &(cc, rev_cc) in &[ + (floatcc_lt, floatcc_gt), + (floatcc_le, floatcc_ge), + (floatcc_ugt, floatcc_ult), + (floatcc_uge, floatcc_ule), + ] { + expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]); + } + + // We need to modify the CFG for min/max legalization. + expand.custom_legalize(fmin, "expand_minmax"); + expand.custom_legalize(fmax, "expand_minmax"); + + // Conversions from unsigned need special handling. + expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint"); + // Conversions from float to int can trap and modify the control flow graph. 
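Illustrative aside (standalone scalar check, not part of the vendored file). The reversed-inequality table above relies on the symmetry that `fcmp(lt, x, y)` equals `fcmp(gt, y, x)`, and likewise for le/ge and the unordered variants; the identity holds even with NaN operands because both sides treat "unordered" the same way. A quick check on scalars:

    fn lt(x: f32, y: f32) -> bool { x < y }        // ordered less-than
    fn gt(x: f32, y: f32) -> bool { x > y }        // ordered greater-than
    fn ult(x: f32, y: f32) -> bool { !(x >= y) }   // unordered-or-less-than
    fn ugt(x: f32, y: f32) -> bool { !(x <= y) }   // unordered-or-greater-than

    fn main() {
        let samples = [1.0f32, 2.0, -0.0, 0.0, f32::NAN, f32::INFINITY];
        for &x in &samples {
            for &y in &samples {
                assert_eq!(lt(x, y), gt(y, x));
                assert_eq!(ult(x, y), ugt(y, x));
            }
        }
    }

The float-to-int conversions flagged in the comment just above are handled next, via the custom legalizations that follow.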
+ expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint"); + expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint"); + expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat"); + expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat"); + + // Count leading and trailing zeroes, for baseline x86_64 + let c_minus_one = var("c_minus_one"); + let c_thirty_one = var("c_thirty_one"); + let c_thirty_two = var("c_thirty_two"); + let c_sixty_three = var("c_sixty_three"); + let c_sixty_four = var("c_sixty_four"); + let index1 = var("index1"); + let r2flags = var("r2flags"); + let index2 = var("index2"); + + let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq"); + let imm64_minus_one = Literal::constant(&imm.imm64, -1); + let imm64_63 = Literal::constant(&imm.imm64, 63); + expand.legalize( + def!(a = clz.I64(x)), + vec![ + def!(c_minus_one = iconst(imm64_minus_one)), + def!(c_sixty_three = iconst(imm64_63)), + def!((index1, r2flags) = x86_bsr(x)), + def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)), + def!(a = isub(c_sixty_three, index2)), + ], + ); + + let imm64_31 = Literal::constant(&imm.imm64, 31); + expand.legalize( + def!(a = clz.I32(x)), + vec![ + def!(c_minus_one = iconst(imm64_minus_one)), + def!(c_thirty_one = iconst(imm64_31)), + def!((index1, r2flags) = x86_bsr(x)), + def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)), + def!(a = isub(c_thirty_one, index2)), + ], + ); + + let imm64_64 = Literal::constant(&imm.imm64, 64); + expand.legalize( + def!(a = ctz.I64(x)), + vec![ + def!(c_sixty_four = iconst(imm64_64)), + def!((index1, r2flags) = x86_bsf(x)), + def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)), + ], + ); + + let imm64_32 = Literal::constant(&imm.imm64, 32); + expand.legalize( + def!(a = ctz.I32(x)), + vec![ + def!(c_thirty_two = iconst(imm64_32)), + def!((index1, r2flags) = x86_bsf(x)), + def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)), + ], + ); + + // Population count for baseline x86_64 + let x = var("x"); + let r = var("r"); + + let qv3 = var("qv3"); + let qv4 = var("qv4"); + let qv5 = var("qv5"); + let qv6 = var("qv6"); + let qv7 = var("qv7"); + let qv8 = var("qv8"); + let qv9 = var("qv9"); + let qv10 = var("qv10"); + let qv11 = var("qv11"); + let qv12 = var("qv12"); + let qv13 = var("qv13"); + let qv14 = var("qv14"); + let qv15 = var("qv15"); + let qc77 = var("qc77"); + #[allow(non_snake_case)] + let qc0F = var("qc0F"); + let qc01 = var("qc01"); + + let imm64_1 = Literal::constant(&imm.imm64, 1); + let imm64_4 = Literal::constant(&imm.imm64, 4); + expand.legalize( + def!(r = popcnt.I64(x)), + vec![ + def!(qv3 = ushr_imm(x, imm64_1)), + def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777_7777_7777))), + def!(qv4 = band(qv3, qc77)), + def!(qv5 = isub(x, qv4)), + def!(qv6 = ushr_imm(qv4, imm64_1)), + def!(qv7 = band(qv6, qc77)), + def!(qv8 = isub(qv5, qv7)), + def!(qv9 = ushr_imm(qv7, imm64_1)), + def!(qv10 = band(qv9, qc77)), + def!(qv11 = isub(qv8, qv10)), + def!(qv12 = ushr_imm(qv11, imm64_4)), + def!(qv13 = iadd(qv11, qv12)), + def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F_0F0F_0F0F))), + def!(qv14 = band(qv13, qc0F)), + def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101_0101_0101))), + def!(qv15 = imul(qv14, qc01)), + def!(r = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))), + ], + ); + + let lv3 = var("lv3"); + let lv4 = var("lv4"); + let lv5 = var("lv5"); + let lv6 = var("lv6"); + let lv7 = var("lv7"); + let lv8 = var("lv8"); + let lv9 = var("lv9"); + 
let lv10 = var("lv10"); + let lv11 = var("lv11"); + let lv12 = var("lv12"); + let lv13 = var("lv13"); + let lv14 = var("lv14"); + let lv15 = var("lv15"); + let lc77 = var("lc77"); + #[allow(non_snake_case)] + let lc0F = var("lc0F"); + let lc01 = var("lc01"); + + expand.legalize( + def!(r = popcnt.I32(x)), + vec![ + def!(lv3 = ushr_imm(x, imm64_1)), + def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777))), + def!(lv4 = band(lv3, lc77)), + def!(lv5 = isub(x, lv4)), + def!(lv6 = ushr_imm(lv4, imm64_1)), + def!(lv7 = band(lv6, lc77)), + def!(lv8 = isub(lv5, lv7)), + def!(lv9 = ushr_imm(lv7, imm64_1)), + def!(lv10 = band(lv9, lc77)), + def!(lv11 = isub(lv8, lv10)), + def!(lv12 = ushr_imm(lv11, imm64_4)), + def!(lv13 = iadd(lv11, lv12)), + def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F))), + def!(lv14 = band(lv13, lc0F)), + def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101))), + def!(lv15 = imul(lv14, lc01)), + def!(r = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))), + ], + ); + + expand.custom_legalize(ineg, "convert_ineg"); + expand.custom_legalize(tls_value, "expand_tls_value"); + widen.custom_legalize(ineg, "convert_ineg"); + + // To reduce compilation times, separate out large blocks of legalizations by theme. + define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx); + + expand.build_and_add_to(&mut shared.transform_groups); + let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups); + narrow_avx + .chain_with(narrow_id) + .build_and_add_to(&mut shared.transform_groups); + widen.build_and_add_to(&mut shared.transform_groups); +} + +fn define_simd( + shared: &mut SharedDefinitions, + x86_instructions: &InstructionGroup, + narrow: &mut TransformGroupBuilder, + narrow_avx: &mut TransformGroupBuilder, +) { + let insts = &shared.instructions; + let band = insts.by_name("band"); + let band_not = insts.by_name("band_not"); + let bitcast = insts.by_name("bitcast"); + let bitselect = insts.by_name("bitselect"); + let bor = insts.by_name("bor"); + let bnot = insts.by_name("bnot"); + let bxor = insts.by_name("bxor"); + let extractlane = insts.by_name("extractlane"); + let fabs = insts.by_name("fabs"); + let fcmp = insts.by_name("fcmp"); + let fcvt_from_uint = insts.by_name("fcvt_from_uint"); + let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat"); + let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat"); + let fmax = insts.by_name("fmax"); + let fmin = insts.by_name("fmin"); + let fneg = insts.by_name("fneg"); + let iadd_imm = insts.by_name("iadd_imm"); + let icmp = insts.by_name("icmp"); + let imax = insts.by_name("imax"); + let imin = insts.by_name("imin"); + let imul = insts.by_name("imul"); + let ineg = insts.by_name("ineg"); + let insertlane = insts.by_name("insertlane"); + let ishl = insts.by_name("ishl"); + let ishl_imm = insts.by_name("ishl_imm"); + let load_splat = insts.by_name("load_splat"); + let raw_bitcast = insts.by_name("raw_bitcast"); + let scalar_to_vector = insts.by_name("scalar_to_vector"); + let splat = insts.by_name("splat"); + let shuffle = insts.by_name("shuffle"); + let sshr = insts.by_name("sshr"); + let swizzle = insts.by_name("swizzle"); + let trueif = insts.by_name("trueif"); + let uadd_sat = insts.by_name("uadd_sat"); + let umax = insts.by_name("umax"); + let umin = insts.by_name("umin"); + let snarrow = insts.by_name("snarrow"); + let swiden_high = insts.by_name("swiden_high"); + let swiden_low = insts.by_name("swiden_low"); + let ushr_imm = insts.by_name("ushr_imm"); + let ushr = insts.by_name("ushr"); 
+ let uwiden_high = insts.by_name("uwiden_high"); + let uwiden_low = insts.by_name("uwiden_low"); + let vconst = insts.by_name("vconst"); + let vall_true = insts.by_name("vall_true"); + let vany_true = insts.by_name("vany_true"); + let vselect = insts.by_name("vselect"); + + let x86_palignr = x86_instructions.by_name("x86_palignr"); + let x86_pmaxs = x86_instructions.by_name("x86_pmaxs"); + let x86_pmaxu = x86_instructions.by_name("x86_pmaxu"); + let x86_pmins = x86_instructions.by_name("x86_pmins"); + let x86_pminu = x86_instructions.by_name("x86_pminu"); + let x86_pshufb = x86_instructions.by_name("x86_pshufb"); + let x86_pshufd = x86_instructions.by_name("x86_pshufd"); + let x86_psra = x86_instructions.by_name("x86_psra"); + let x86_ptest = x86_instructions.by_name("x86_ptest"); + let x86_punpckh = x86_instructions.by_name("x86_punpckh"); + let x86_punpckl = x86_instructions.by_name("x86_punpckl"); + + let imm = &shared.imm; + + // Set up variables and immediates. + let uimm8_zero = Literal::constant(&imm.uimm8, 0x00); + let uimm8_one = Literal::constant(&imm.uimm8, 0x01); + let uimm8_eight = Literal::constant(&imm.uimm8, 8); + let u128_zeroes = constant(vec![0x00; 16]); + let u128_ones = constant(vec![0xff; 16]); + let u128_seventies = constant(vec![0x70; 16]); + let a = var("a"); + let b = var("b"); + let c = var("c"); + let d = var("d"); + let e = var("e"); + let f = var("f"); + let g = var("g"); + let h = var("h"); + let x = var("x"); + let y = var("y"); + let z = var("z"); + + // Limit the SIMD vector size: eventually multiple vector sizes may be supported + // but for now only SSE-sized vectors are available. + let sse_vector_size: u64 = 128; + let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128; + + // SIMD splat: 8-bits + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { + let splat_any8x16 = splat.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(y = splat_any8x16(x)), + vec![ + // Move into the lowest 8 bits of an XMM register. + def!(a = scalar_to_vector(x)), + // Zero out a different XMM register; the shuffle mask for moving the lowest byte + // to all other byte lanes is 0x0. + def!(b = vconst(u128_zeroes)), + // PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b). + def!(y = x86_pshufb(a, b)), + ], + ); + } + + // SIMD splat: 16-bits + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) { + let splat_x16x8 = splat.bind(vector(ty, sse_vector_size)); + let raw_bitcast_any16x8_to_i32x4 = raw_bitcast + .bind(vector(I32, sse_vector_size)) + .bind(vector(ty, sse_vector_size)); + let raw_bitcast_i32x4_to_any16x8 = raw_bitcast + .bind(vector(ty, sse_vector_size)) + .bind(vector(I32, sse_vector_size)); + narrow.legalize( + def!(y = splat_x16x8(x)), + vec![ + // Move into the lowest 16 bits of an XMM register. + def!(a = scalar_to_vector(x)), + // Insert the value again but in the next lowest 16 bits. + def!(b = insertlane(a, x, uimm8_one)), + // No instruction emitted; pretend this is an I32x4 so we can use PSHUFD. + def!(c = raw_bitcast_any16x8_to_i32x4(b)), + // Broadcast the bytes in the XMM register with PSHUFD. + def!(d = x86_pshufd(c, uimm8_zero)), + // No instruction emitted; pretend this is an X16x8 again. 
+ def!(y = raw_bitcast_i32x4_to_any16x8(d)), + ], + ); + } + + // SIMD splat: 32-bits + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { + let splat_any32x4 = splat.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(y = splat_any32x4(x)), + vec![ + // Translate to an x86 MOV to get the value in an XMM register. + def!(a = scalar_to_vector(x)), + // Broadcast the bytes in the XMM register with PSHUFD. + def!(y = x86_pshufd(a, uimm8_zero)), + ], + ); + } + + // SIMD splat: 64-bits + for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) { + let splat_any64x2 = splat.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(y = splat_any64x2(x)), + vec![ + // Move into the lowest 64 bits of an XMM register. + def!(a = scalar_to_vector(x)), + // Move into the highest 64 bits of the same XMM register. + def!(y = insertlane(a, x, uimm8_one)), + ], + ); + } + + // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring + // mask indexes greater than 15 to have the same semantics as a 0 index. For the spec discussion, + // see https://github.com/WebAssembly/simd/issues/93. + { + let swizzle = swizzle.bind(vector(I8, sse_vector_size)); + narrow.legalize( + def!(a = swizzle(x, y)), + vec![ + def!(b = vconst(u128_seventies)), + def!(c = uadd_sat(y, b)), + def!(a = x86_pshufb(x, c)), + ], + ); + } + + // SIMD bnot + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let bnot = bnot.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(y = bnot(x)), + vec![def!(a = vconst(u128_ones)), def!(y = bxor(a, x))], + ); + } + + // SIMD shift right (arithmetic, i16x8 and i32x4) + for ty in &[I16, I32] { + let sshr = sshr.bind(vector(*ty, sse_vector_size)); + let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size)); + narrow.legalize( + def!(a = sshr(x, y)), + vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))], + ); + } + // SIMD shift right (arithmetic, i8x16) + { + let sshr = sshr.bind(vector(I8, sse_vector_size)); + let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size)); + let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size)); + let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size)); + narrow.legalize( + def!(z = sshr(x, y)), + vec![ + // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits. + def!(a = iadd_imm(y, uimm8_eight)), + def!(b = bitcast_i64x2(a)), + // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right. + def!(c = x86_punpckl(x, x)), + def!(d = raw_bitcast_i16x8(c)), + def!(e = x86_psra(d, b)), + // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right. + def!(f = x86_punpckh(x, x)), + def!(g = raw_bitcast_i16x8_again(f)), + def!(h = x86_psra(g, b)), + // Re-pack the vector. + def!(z = snarrow(e, h)), + ], + ); + } + // SIMD shift right (arithmetic, i64x2) + { + let sshr_vector = sshr.bind(vector(I64, sse_vector_size)); + let sshr_scalar_lane0 = sshr.bind(I64); + let sshr_scalar_lane1 = sshr.bind(I64); + narrow.legalize( + def!(z = sshr_vector(x, y)), + vec![ + // Use scalar operations to shift the first lane. + def!(a = extractlane(x, uimm8_zero)), + def!(b = sshr_scalar_lane0(a, y)), + def!(c = insertlane(x, b, uimm8_zero)), + // Do the same for the second lane. 
+ def!(d = extractlane(x, uimm8_one)), + def!(e = sshr_scalar_lane1(d, y)), + def!(z = insertlane(c, e, uimm8_one)), + ], + ); + } + + // SIMD select + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let bitselect = bitselect.bind(vector(ty, sse_vector_size)); // must bind both x/y and c + narrow.legalize( + def!(d = bitselect(c, x, y)), + vec![ + def!(a = band(x, c)), + def!(b = band_not(y, c)), + def!(d = bor(a, b)), + ], + ); + } + + // SIMD vselect; replace with bitselect if BLEND* instructions are not available. + // This works, because each lane of boolean vector is filled with zeroes or ones. + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let vselect = vselect.bind(vector(ty, sse_vector_size)); + let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(d = vselect(c, x, y)), + vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))], + ); + } + + // SIMD vany_true + let ne = Literal::enumerator_for(&imm.intcc, "ne"); + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let vany_true = vany_true.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(y = vany_true(x)), + vec![def!(a = x86_ptest(x, x)), def!(y = trueif(ne, a))], + ); + } + + // SIMD vall_true + let eq = Literal::enumerator_for(&imm.intcc, "eq"); + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let vall_true = vall_true.bind(vector(ty, sse_vector_size)); + if ty.is_int() { + // In the common case (Wasm's integer-only all_true), we do not require a + // bitcast. + narrow.legalize( + def!(y = vall_true(x)), + vec![ + def!(a = vconst(u128_zeroes)), + def!(c = icmp(eq, x, a)), + def!(d = x86_ptest(c, c)), + def!(y = trueif(eq, d)), + ], + ); + } else { + // However, to support other types we must bitcast them to an integer vector to + // use icmp. 
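Illustrative aside (standalone per-lane model, not part of the vendored file; the two functions below are hypothetical). The PTEST-based reductions above compute `vany_true` and, for integer lanes, `vall_true` entirely in the flags: PTEST x, x sets ZF exactly when x is all zeroes, so "any true" is ZF clear, while "all true" first compares every lane against zero and then PTESTs that mask. A plain-Rust model of the integer case just shown:

    fn vany_true(lanes: &[u8]) -> bool {
        // ptest(x, x); trueif ne: ZF clear means some bit of x is set.
        !lanes.iter().all(|&lane| lane == 0)
    }

    fn vall_true(lanes: &[u8]) -> bool {
        // icmp eq against an all-zero vector, then ptest the resulting mask;
        // ZF set means no lane compared equal to zero, i.e. every lane was nonzero.
        let eq_zero_mask: Vec<u8> = lanes
            .iter()
            .map(|&lane| if lane == 0 { 0xff } else { 0x00 })
            .collect();
        eq_zero_mask.iter().all(|&mask_byte| mask_byte == 0)
    }

    fn main() {
        assert!(vany_true(&[0, 0, 7, 0]));
        assert!(!vany_true(&[0, 0, 0, 0]));
        assert!(vall_true(&[1, 2, 3, 4]));
        assert!(!vall_true(&[1, 0, 3, 4]));
    }

The bitcasting variant for non-integer lane types, referred to by the comment above, follows immediately after this aside.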
+ let lane_type_as_int = LaneType::int_from_bits(ty.lane_bits() as u16); + let raw_bitcast_to_int = raw_bitcast.bind(vector(lane_type_as_int, sse_vector_size)); + narrow.legalize( + def!(y = vall_true(x)), + vec![ + def!(a = vconst(u128_zeroes)), + def!(b = raw_bitcast_to_int(x)), + def!(c = icmp(eq, b, a)), + def!(d = x86_ptest(c, c)), + def!(y = trueif(eq, d)), + ], + ); + } + } + + // SIMD icmp ne + let ne = Literal::enumerator_for(&imm.intcc, "ne"); + for ty in ValueType::all_lane_types().filter(|ty| allowed_simd_type(ty) && ty.is_int()) { + let icmp_ = icmp.bind(vector(ty, sse_vector_size)); + narrow.legalize( + def!(c = icmp_(ne, a, b)), + vec![def!(x = icmp(eq, a, b)), def!(c = bnot(x))], + ); + } + + // SIMD icmp greater-/less-than + let sgt = Literal::enumerator_for(&imm.intcc, "sgt"); + let ugt = Literal::enumerator_for(&imm.intcc, "ugt"); + let sge = Literal::enumerator_for(&imm.intcc, "sge"); + let uge = Literal::enumerator_for(&imm.intcc, "uge"); + let slt = Literal::enumerator_for(&imm.intcc, "slt"); + let ult = Literal::enumerator_for(&imm.intcc, "ult"); + let sle = Literal::enumerator_for(&imm.intcc, "sle"); + let ule = Literal::enumerator_for(&imm.intcc, "ule"); + for ty in &[I8, I16, I32] { + // greater-than + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(c = icmp_(ugt, a, b)), + vec![ + def!(x = x86_pmaxu(a, b)), + def!(y = icmp(eq, x, b)), + def!(c = bnot(y)), + ], + ); + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(c = icmp_(sge, a, b)), + vec![def!(x = x86_pmins(a, b)), def!(c = icmp(eq, x, b))], + ); + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(c = icmp_(uge, a, b)), + vec![def!(x = x86_pminu(a, b)), def!(c = icmp(eq, x, b))], + ); + + // less-than + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = icmp_(slt, a, b)), vec![def!(c = icmp(sgt, b, a))]); + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = icmp_(ult, a, b)), vec![def!(c = icmp(ugt, b, a))]); + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = icmp_(sle, a, b)), vec![def!(c = icmp(sge, b, a))]); + let icmp_ = icmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = icmp_(ule, a, b)), vec![def!(c = icmp(uge, b, a))]); + } + + // SIMD integer min/max + for ty in &[I8, I16, I32] { + let imin = imin.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = imin(a, b)), vec![def!(c = x86_pmins(a, b))]); + let umin = umin.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = umin(a, b)), vec![def!(c = x86_pminu(a, b))]); + let imax = imax.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = imax(a, b)), vec![def!(c = x86_pmaxs(a, b))]); + let umax = umax.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = umax(a, b)), vec![def!(c = x86_pmaxu(a, b))]); + } + + // SIMD fcmp greater-/less-than + let gt = Literal::enumerator_for(&imm.floatcc, "gt"); + let lt = Literal::enumerator_for(&imm.floatcc, "lt"); + let ge = Literal::enumerator_for(&imm.floatcc, "ge"); + let le = Literal::enumerator_for(&imm.floatcc, "le"); + let ugt = Literal::enumerator_for(&imm.floatcc, "ugt"); + let ult = Literal::enumerator_for(&imm.floatcc, "ult"); + let uge = Literal::enumerator_for(&imm.floatcc, "uge"); + let ule = Literal::enumerator_for(&imm.floatcc, "ule"); + for ty in &[F32, F64] { + let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = fcmp_(gt, a, b)), vec![def!(c = 
fcmp(lt, b, a))]); + let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = fcmp_(ge, a, b)), vec![def!(c = fcmp(le, b, a))]); + let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = fcmp_(ult, a, b)), vec![def!(c = fcmp(ugt, b, a))]); + let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size)); + narrow.legalize(def!(c = fcmp_(ule, a, b)), vec![def!(c = fcmp(uge, b, a))]); + } + + for ty in &[F32, F64] { + let fneg = fneg.bind(vector(*ty, sse_vector_size)); + let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16); + let uimm8_shift = Literal::constant(&imm.uimm8, lane_type_as_int.lane_bits() as i64 - 1); + let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size)); + let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = fneg(a)), + vec![ + def!(c = vconst(u128_ones)), + def!(d = ishl_imm(c, uimm8_shift)), // Create a mask of all 0s except the MSB. + def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type. + def!(b = bxor(a, e)), // Flip the MSB. + ], + ); + } + + // SIMD fabs + for ty in &[F32, F64] { + let fabs = fabs.bind(vector(*ty, sse_vector_size)); + let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16); + let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size)); + let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = fabs(a)), + vec![ + def!(c = vconst(u128_ones)), + def!(d = ushr_imm(c, uimm8_one)), // Create a mask of all 1s except the MSB. + def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type. + def!(b = band(a, e)), // Unset the MSB. + ], + ); + } + + // SIMD widen + for ty in &[I8, I16] { + let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = swiden_high(a)), + vec![ + def!(c = x86_palignr(a, a, uimm8_eight)), + def!(b = swiden_low(c)), + ], + ); + let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size)); + narrow.legalize( + def!(b = uwiden_high(a)), + vec![ + def!(c = x86_palignr(a, a, uimm8_eight)), + def!(b = uwiden_low(c)), + ], + ); + } + + narrow.custom_legalize(shuffle, "convert_shuffle"); + narrow.custom_legalize(extractlane, "convert_extractlane"); + narrow.custom_legalize(insertlane, "convert_insertlane"); + narrow.custom_legalize(ineg, "convert_ineg"); + narrow.custom_legalize(ushr, "convert_ushr"); + narrow.custom_legalize(ishl, "convert_ishl"); + narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector"); + narrow.custom_legalize(fmin, "expand_minmax_vector"); + narrow.custom_legalize(fmax, "expand_minmax_vector"); + narrow.custom_legalize(load_splat, "expand_load_splat"); + + narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); + narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector"); + narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector"); +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs new file mode 100644 index 0000000000..a272e83900 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/mod.rs @@ -0,0 +1,88 @@ +use crate::cdsl::cpu_modes::CpuMode; +use crate::cdsl::isa::TargetIsa; +use crate::cdsl::types::{ReferenceType, VectorType}; + +use crate::shared::types::Bool::B1; +use crate::shared::types::Float::{F32, F64}; +use crate::shared::types::Int::{I16, I32, I64, I8}; +use 
crate::shared::types::Reference::{R32, R64}; +use crate::shared::Definitions as SharedDefinitions; + +mod encodings; +mod instructions; +mod legalize; +mod opcodes; +mod recipes; +mod registers; +pub(crate) mod settings; + +pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { + let settings = settings::define(&shared_defs.settings); + let regs = registers::define(); + + let inst_group = instructions::define( + &mut shared_defs.all_instructions, + &shared_defs.formats, + &shared_defs.imm, + &shared_defs.entities, + ); + legalize::define(shared_defs, &inst_group); + + // CPU modes for 32-bit and 64-bit operations. + let mut x86_64 = CpuMode::new("I64"); + let mut x86_32 = CpuMode::new("I32"); + + let expand_flags = shared_defs.transform_groups.by_name("expand_flags"); + let x86_widen = shared_defs.transform_groups.by_name("x86_widen"); + let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow"); + let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx"); + let x86_expand = shared_defs.transform_groups.by_name("x86_expand"); + + x86_32.legalize_monomorphic(expand_flags); + x86_32.legalize_default(x86_narrow); + x86_32.legalize_type(B1, expand_flags); + x86_32.legalize_type(I8, x86_widen); + x86_32.legalize_type(I16, x86_widen); + x86_32.legalize_type(I32, x86_expand); + x86_32.legalize_value_type(ReferenceType(R32), x86_expand); + x86_32.legalize_type(F32, x86_expand); + x86_32.legalize_type(F64, x86_expand); + x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx); + x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); + + x86_64.legalize_monomorphic(expand_flags); + x86_64.legalize_default(x86_narrow); + x86_64.legalize_type(B1, expand_flags); + x86_64.legalize_type(I8, x86_widen); + x86_64.legalize_type(I16, x86_widen); + x86_64.legalize_type(I32, x86_expand); + x86_64.legalize_type(I64, x86_expand); + x86_64.legalize_value_type(ReferenceType(R64), x86_expand); + x86_64.legalize_type(F32, x86_expand); + x86_64.legalize_type(F64, x86_expand); + x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx); + x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); + + let recipes = recipes::define(shared_defs, &settings, ®s); + + let encodings = encodings::define(shared_defs, &settings, &inst_group, &recipes); + x86_32.set_encodings(encodings.enc32); + x86_64.set_encodings(encodings.enc64); + let encodings_predicates = encodings.inst_pred_reg.extract(); + + let recipes = encodings.recipes; + + let cpu_modes = vec![x86_64, x86_32]; + + TargetIsa::new( + "x86", + inst_group, + settings, + regs, + recipes, + cpu_modes, + encodings_predicates, + ) +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs new file mode 100644 index 0000000000..09c07c458f --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/opcodes.rs @@ -0,0 +1,721 @@ +//! Static, named definitions of instruction opcodes. + +/// Empty opcode for use as a default. +pub static EMPTY: [u8; 0] = []; + +/// Add with carry flag r{16,32,64} to r/m of the same size. +pub static ADC: [u8; 1] = [0x11]; + +/// Add r{16,32,64} to r/m of the same size. +pub static ADD: [u8; 1] = [0x01]; + +/// Add imm{16,32} to r/m{16,32,64}, possibly sign-extended. 
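For the one-byte ALU opcodes in this table (ADC, ADD, ...), the emitter later appends a ModR/M byte and, in 64-bit mode, an optional REX prefix; none of that appears in the byte arrays themselves. A hand-checked illustration (my own sketch, not Cranelift's binemit code) of how ADD = 0x01 becomes the instruction add rcx, rax:

    /// ModR/M with mod=11 (register-direct): rm in bits 0-2, reg in bits 3-5.
    fn modrm_rr(rm: u8, reg: u8) -> u8 {
        0b1100_0000 | ((reg & 7) << 3) | (rm & 7)
    }

    fn main() {
        let (rax, rcx) = (0u8, 1u8);
        // REX.W (0x48) + ADD r/m64, r64 (0x01) + ModR/M  =>  add rcx, rax
        let bytes = [0x48, 0x01, modrm_rr(rcx, rax)];
        assert_eq!(bytes, [0x48, 0x01, 0xc1]);
    }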
+pub static ADD_IMM: [u8; 1] = [0x81]; + +/// Add sign-extended imm8 to r/m{16,32,64}. +pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; + +/// Add packed double-precision floating-point values from xmm2/mem to xmm1 and store result in +/// xmm1 (SSE2). +pub static ADDPD: [u8; 3] = [0x66, 0x0f, 0x58]; + +/// Add packed single-precision floating-point values from xmm2/mem to xmm1 and store result in +/// xmm1 (SSE). +pub static ADDPS: [u8; 2] = [0x0f, 0x58]; + +/// Add the low double-precision floating-point value from xmm2/mem to xmm1 +/// and store the result in xmm1. +pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58]; + +/// Add the low single-precision floating-point value from xmm2/mem to xmm1 +/// and store the result in xmm1. +pub static ADDSS: [u8; 3] = [0xf3, 0x0f, 0x58]; + +/// r/m{16,32,64} AND register of the same size (Intel docs have a typo). +pub static AND: [u8; 1] = [0x21]; + +/// imm{16,32} AND r/m{16,32,64}, possibly sign-extended. +pub static AND_IMM: [u8; 1] = [0x81]; + +/// r/m{16,32,64} AND sign-extended imm8. +pub static AND_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; + +/// Return the bitwise logical AND NOT of packed single-precision floating-point +/// values in xmm1 and xmm2/mem. +pub static ANDNPS: [u8; 2] = [0x0f, 0x55]; + +/// Return the bitwise logical AND of packed single-precision floating-point values +/// in xmm1 and xmm2/mem. +pub static ANDPS: [u8; 2] = [0x0f, 0x54]; + +/// Bit scan forward (stores index of first encountered 1 from the front). +pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc]; + +/// Bit scan reverse (stores index of first encountered 1 from the back). +pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd]; + +/// Select packed single-precision floating-point values from xmm1 and xmm2/m128 +/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1). +pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14]; + +/// Select packed double-precision floating-point values from xmm1 and xmm2/m128 +/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1). +pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15]; + +/// Call near, relative, displacement relative to next instruction (sign-extended). +pub static CALL_RELATIVE: [u8; 1] = [0xe8]; + +/// Move r/m{16,32,64} if overflow (OF=1). +pub static CMOV_OVERFLOW: [u8; 2] = [0x0f, 0x40]; + +/// Compare imm{16,32} with r/m{16,32,64} (sign-extended if 64). +pub static CMP_IMM: [u8; 1] = [0x81]; + +/// Compare imm8 with r/m{16,32,64}. +pub static CMP_IMM8: [u8; 1] = [0x83]; + +/// Compare r{16,32,64} with r/m of the same size. +pub static CMP_REG: [u8; 1] = [0x39]; + +/// Compare packed double-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of +/// imm8 as comparison predicate (SSE2). +pub static CMPPD: [u8; 3] = [0x66, 0x0f, 0xc2]; + +/// Compare packed single-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of +/// imm8 as comparison predicate (SSE). +pub static CMPPS: [u8; 2] = [0x0f, 0xc2]; + +/// Convert four packed signed doubleword integers from xmm2/mem to four packed single-precision +/// floating-point values in xmm1 (SSE2). +pub static CVTDQ2PS: [u8; 2] = [0x0f, 0x5b]; + +/// Convert scalar double-precision floating-point value to scalar single-precision +/// floating-point value. +pub static CVTSD2SS: [u8; 3] = [0xf2, 0x0f, 0x5a]; + +/// Convert doubleword integer to scalar double-precision floating-point value. 
+pub static CVTSI2SD: [u8; 3] = [0xf2, 0x0f, 0x2a]; + +/// Convert doubleword integer to scalar single-precision floating-point value. +pub static CVTSI2SS: [u8; 3] = [0xf3, 0x0f, 0x2a]; + +/// Convert scalar single-precision floating-point value to scalar double-precision +/// float-point value. +pub static CVTSS2SD: [u8; 3] = [0xf3, 0x0f, 0x5a]; + +/// Convert four packed single-precision floating-point values from xmm2/mem to four packed signed +/// doubleword values in xmm1 using truncation (SSE2). +pub static CVTTPS2DQ: [u8; 3] = [0xf3, 0x0f, 0x5b]; + +/// Convert with truncation scalar double-precision floating-point value to signed +/// integer. +pub static CVTTSD2SI: [u8; 3] = [0xf2, 0x0f, 0x2c]; + +/// Convert with truncation scalar single-precision floating-point value to integer. +pub static CVTTSS2SI: [u8; 3] = [0xf3, 0x0f, 0x2c]; + +/// Unsigned divide for {16,32,64}-bit. +pub static DIV: [u8; 1] = [0xf7]; + +/// Divide packed double-precision floating-point values in xmm1 by packed double-precision +/// floating-point values in xmm2/mem (SSE2). +pub static DIVPD: [u8; 3] = [0x66, 0x0f, 0x5e]; + +/// Divide packed single-precision floating-point values in xmm1 by packed single-precision +/// floating-point values in xmm2/mem (SSE). +pub static DIVPS: [u8; 2] = [0x0f, 0x5e]; + +/// Divide low double-precision floating-point value in xmm1 by low double-precision +/// floating-point value in xmm2/m64. +pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e]; + +/// Divide low single-precision floating-point value in xmm1 by low single-precision +/// floating-point value in xmm2/m32. +pub static DIVSS: [u8; 3] = [0xf3, 0x0f, 0x5e]; + +/// Signed divide for {16,32,64}-bit. +pub static IDIV: [u8; 1] = [0xf7]; + +/// Signed multiply for {16,32,64}-bit, generic registers. +pub static IMUL: [u8; 2] = [0x0f, 0xaf]; + +/// Signed multiply for {16,32,64}-bit, storing into RDX:RAX. +pub static IMUL_RDX_RAX: [u8; 1] = [0xf7]; + +/// Insert scalar single-precision floating-point value. +pub static INSERTPS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x21]; + +/// Either: +/// 1. Jump near, absolute indirect, RIP = 64-bit offset from register or memory. +/// 2. Jump far, absolute indirect, address given in m16:64. +pub static JUMP_ABSOLUTE: [u8; 1] = [0xff]; + +/// Jump near, relative, RIP = RIP + 32-bit displacement sign extended to 64 bits. +pub static JUMP_NEAR_RELATIVE: [u8; 1] = [0xe9]; + +/// Jump near (rel32) if overflow (OF=1). +pub static JUMP_NEAR_IF_OVERFLOW: [u8; 2] = [0x0f, 0x80]; + +/// Jump short, relative, RIP = RIP + 8-bit displacement sign extended to 64 bits. +pub static JUMP_SHORT: [u8; 1] = [0xeb]; + +/// Jump short (rel8) if equal (ZF=1). +pub static JUMP_SHORT_IF_EQUAL: [u8; 1] = [0x74]; + +/// Jump short (rel8) if not equal (ZF=0). +pub static JUMP_SHORT_IF_NOT_EQUAL: [u8; 1] = [0x75]; + +/// Jump short (rel8) if overflow (OF=1). +pub static JUMP_SHORT_IF_OVERFLOW: [u8; 1] = [0x70]; + +/// Store effective address for m in register r{16,32,64}. +pub static LEA: [u8; 1] = [0x8d]; + +/// Count the number of leading zero bits. +pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd]; + +/// Return the maximum packed double-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE2). +pub static MAXPD: [u8; 3] = [0x66, 0x0f, 0x5f]; + +/// Return the maximum packed single-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE). +pub static MAXPS: [u8; 2] = [0x0f, 0x5f]; + +/// Return the maximum scalar double-precision floating-point value between +/// xmm2/m64 and xmm1. 
+pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f]; + +/// Return the maximum scalar single-precision floating-point value between +/// xmm2/m32 and xmm1. +pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f]; + +/// Return the minimum packed double-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE2). +pub static MINPD: [u8; 3] = [0x66, 0x0f, 0x5d]; + +/// Return the minimum packed single-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE). +pub static MINPS: [u8; 2] = [0x0f, 0x5d]; + +/// Return the minimum scalar double-precision floating-point value between +/// xmm2/m64 and xmm1. +pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d]; + +/// Return the minimum scalar single-precision floating-point value between +/// xmm2/m32 and xmm1. +pub static MINSS: [u8; 3] = [0xf3, 0x0f, 0x5d]; + +/// Move r8 to r/m8. +pub static MOV_BYTE_STORE: [u8; 1] = [0x88]; + +/// Move imm{16,32,64} to same-sized register. +pub static MOV_IMM: [u8; 1] = [0xb8]; + +/// Move imm{16,32} to r{16,32,64}, sign-extended if 64-bit target. +pub static MOV_IMM_SIGNEXTEND: [u8; 1] = [0xc7]; + +/// Move {r/m16, r/m32, r/m64} to same-sized register. +pub static MOV_LOAD: [u8; 1] = [0x8b]; + +/// Move r16 to r/m16. +pub static MOV_STORE_16: [u8; 2] = [0x66, 0x89]; + +/// Move {r16, r32, r64} to same-sized register or memory. +pub static MOV_STORE: [u8; 1] = [0x89]; + +/// Move aligned packed single-precision floating-point values from x/m to xmm (SSE). +pub static MOVAPS_LOAD: [u8; 2] = [0x0f, 0x28]; + +/// Move doubleword from r/m32 to xmm (SSE2). Quadword with REX prefix. +pub static MOVD_LOAD_XMM: [u8; 3] = [0x66, 0x0f, 0x6e]; + +/// Move doubleword from xmm to r/m32 (SSE2). Quadword with REX prefix. +pub static MOVD_STORE_XMM: [u8; 3] = [0x66, 0x0f, 0x7e]; + +/// Move packed single-precision floating-point values low to high (SSE). +pub static MOVLHPS: [u8; 2] = [0x0f, 0x16]; + +/// Move scalar double-precision floating-point value (from reg/mem to reg). +pub static MOVSD_LOAD: [u8; 3] = [0xf2, 0x0f, 0x10]; + +/// Move scalar double-precision floating-point value (from reg to reg/mem). +pub static MOVSD_STORE: [u8; 3] = [0xf2, 0x0f, 0x11]; + +/// Move scalar single-precision floating-point value (from reg to reg/mem). +pub static MOVSS_STORE: [u8; 3] = [0xf3, 0x0f, 0x11]; + +/// Move scalar single-precision floating-point-value (from reg/mem to reg). +pub static MOVSS_LOAD: [u8; 3] = [0xf3, 0x0f, 0x10]; + +/// Move byte to register with sign-extension. +pub static MOVSX_BYTE: [u8; 2] = [0x0f, 0xbe]; + +/// Move word to register with sign-extension. +pub static MOVSX_WORD: [u8; 2] = [0x0f, 0xbf]; + +/// Move doubleword to register with sign-extension. +pub static MOVSXD: [u8; 1] = [0x63]; + +/// Move unaligned packed single-precision floating-point from x/m to xmm (SSE). +pub static MOVUPS_LOAD: [u8; 2] = [0x0f, 0x10]; + +/// Move unaligned packed single-precision floating-point value from xmm to x/m (SSE). +pub static MOVUPS_STORE: [u8; 2] = [0x0f, 0x11]; + +/// Move byte to register with zero-extension. +pub static MOVZX_BYTE: [u8; 2] = [0x0f, 0xb6]; + +/// Move word to register with zero-extension. +pub static MOVZX_WORD: [u8; 2] = [0x0f, 0xb7]; + +/// Unsigned multiply for {16,32,64}-bit. +pub static MUL: [u8; 1] = [0xf7]; + +/// Multiply packed double-precision floating-point values from xmm2/mem to xmm1 and store result +/// in xmm1 (SSE2). 
+pub static MULPD: [u8; 3] = [0x66, 0x0f, 0x59]; + +/// Multiply packed single-precision floating-point values from xmm2/mem to xmm1 and store result +/// in xmm1 (SSE). +pub static MULPS: [u8; 2] = [0x0f, 0x59]; + +/// Multiply the low double-precision floating-point value in xmm2/m64 by the +/// low double-precision floating-point value in xmm1. +pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59]; + +/// Multiply the low single-precision floating-point value in xmm2/m32 by the +/// low single-precision floating-point value in xmm1. +pub static MULSS: [u8; 3] = [0xf3, 0x0f, 0x59]; + +/// Reverse each bit of r/m{16,32,64}. +pub static NOT: [u8; 1] = [0xf7]; + +/// r{16,32,64} OR register of same size. +pub static OR: [u8; 1] = [0x09]; + +/// imm{16,32} OR r/m{16,32,64}, possibly sign-extended. +pub static OR_IMM: [u8; 1] = [0x81]; + +/// r/m{16,32,64} OR sign-extended imm8. +pub static OR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; + +/// Return the bitwise logical OR of packed single-precision values in xmm and x/m (SSE). +pub static ORPS: [u8; 2] = [0x0f, 0x56]; + +/// Compute the absolute value of bytes in xmm2/m128 and store the unsigned result in xmm1 (SSSE3). +pub static PABSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x1c]; + +/// Compute the absolute value of 32-bit integers in xmm2/m128 and store the unsigned result in +/// xmm1 (SSSE3). +pub static PABSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x1e]; + +/// Compute the absolute value of 16-bit integers in xmm2/m128 and store the unsigned result in +/// xmm1 (SSSE3). +pub static PABSW: [u8; 4] = [0x66, 0x0f, 0x38, 0x1d]; + +/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte +/// integers in xmm1 using signed saturation (SSE2). +pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63]; + +/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed signed +/// word integers in xmm1 using signed saturation (SSE2). +pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b]; + +/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed unsigned byte +/// integers in xmm1 using unsigned saturation (SSE2). +pub static PACKUSWB: [u8; 3] = [0x66, 0x0f, 0x67]; + +/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 unpacked signed +/// word integers in xmm1 using unsigned saturation (SSE4.1). +pub static PACKUSDW: [u8; 4] = [0x66, 0x0f, 0x38, 0x2b]; + +/// Add packed byte integers from xmm2/m128 and xmm1 (SSE2). +pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc]; + +/// Add packed doubleword integers from xmm2/m128 and xmm1 (SSE2). +pub static PADDD: [u8; 3] = [0x66, 0x0f, 0xfe]; + +/// Add packed quadword integers from xmm2/m128 and xmm1 (SSE2). +pub static PADDQ: [u8; 3] = [0x66, 0x0f, 0xd4]; + +/// Add packed word integers from xmm2/m128 and xmm1 (SSE2). +pub static PADDW: [u8; 3] = [0x66, 0x0f, 0xfd]; + +/// Add packed signed byte integers from xmm2/m128 and xmm1 saturate the results (SSE). +pub static PADDSB: [u8; 3] = [0x66, 0x0f, 0xec]; + +/// Add packed signed word integers from xmm2/m128 and xmm1 saturate the results (SSE). +pub static PADDSW: [u8; 3] = [0x66, 0x0f, 0xed]; + +/// Add packed unsigned byte integers from xmm2/m128 and xmm1 saturate the results (SSE). +pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc]; + +/// Add packed unsigned word integers from xmm2/m128 and xmm1 saturate the results (SSE). 
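The saturating variants above (PADDSB, PADDSW, PADDUSB, PADDUSW) clamp each lane instead of wrapping, which is exactly what Rust's saturating arithmetic models for a single lane. An illustrative comparison, not taken from the patch:

    fn main() {
        // One lane of PADDUSB: unsigned bytes clamp at 255 ...
        assert_eq!(200u8.saturating_add(100), 255);
        // ... whereas plain PADDB wraps modulo 256.
        assert_eq!(200u8.wrapping_add(100), 44);
        // One lane of PADDSB: signed bytes clamp at i8::MAX / i8::MIN.
        assert_eq!(100i8.saturating_add(100), 127);
    }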
+pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd]; + +/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is +/// shifted to the right by the constant number of bytes in imm8 (SSSE3). +pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f]; + +/// Bitwise AND of xmm2/m128 and xmm1 (SSE2). +pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb]; + +/// Bitwise AND NOT of xmm2/m128 and xmm1 (SSE2). +pub static PANDN: [u8; 3] = [0x66, 0x0f, 0xdf]; + +/// Average packed unsigned byte integers from xmm2/m128 and xmm1 with rounding (SSE2). +pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0]; + +/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2). +pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3]; + +/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte +/// in XMM0 and store the values into xmm1 (SSE4.1). +pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10]; + +/// Select words from xmm1 and xmm2/m128 from mask specified in imm8 and store the values into xmm1 +/// (SSE4.1). +pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e]; + +/// Compare packed data for equal (SSE2). +pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74]; + +/// Compare packed data for equal (SSE2). +pub static PCMPEQD: [u8; 3] = [0x66, 0x0f, 0x76]; + +/// Compare packed data for equal (SSE4.1). +pub static PCMPEQQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x29]; + +/// Compare packed data for equal (SSE2). +pub static PCMPEQW: [u8; 3] = [0x66, 0x0f, 0x75]; + +/// Compare packed signed byte integers for greater than (SSE2). +pub static PCMPGTB: [u8; 3] = [0x66, 0x0f, 0x64]; + +/// Compare packed signed doubleword integers for greater than (SSE2). +pub static PCMPGTD: [u8; 3] = [0x66, 0x0f, 0x66]; + +/// Compare packed signed quadword integers for greater than (SSE4.2). +pub static PCMPGTQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x37]; + +/// Compare packed signed word integers for greater than (SSE2). +pub static PCMPGTW: [u8; 3] = [0x66, 0x0f, 0x65]; + +/// Extract doubleword or quadword, depending on REX.W (SSE4.1). +pub static PEXTR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x16]; + +/// Extract byte (SSE4.1). +pub static PEXTRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x14]; + +/// Extract word (SSE4.1). There is a 3-byte SSE2 variant that can also move to m/16. +pub static PEXTRW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x15]; + +/// Insert doubleword or quadword, depending on REX.W (SSE4.1). +pub static PINSR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x22]; + +/// Insert byte (SSE4.1). +pub static PINSRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x20]; + +/// Insert word (SSE2). +pub static PINSRW: [u8; 3] = [0x66, 0x0f, 0xc4]; + +/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed maximum values in +/// xmm1 (SSE4.1). +pub static PMAXSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x3c]; + +/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed maximum +/// values in xmm1 (SSE4.1). +pub static PMAXSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3d]; + +/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed maximum values in +/// xmm1 (SSE2). +pub static PMAXSW: [u8; 3] = [0x66, 0x0f, 0xee]; + +/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed maximum values in +/// xmm1 (SSE2). +pub static PMAXUB: [u8; 3] = [0x66, 0x0f, 0xde]; + +/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed maximum +/// values in xmm1 (SSE4.1). 
+pub static PMAXUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3f]; + +/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed maximum values in +/// xmm1 (SSE4.1). +pub static PMAXUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3e]; + +/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed minimum values in +/// xmm1 (SSE4.1). +pub static PMINSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x38]; + +/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed minimum +/// values in xmm1 (SSE4.1). +pub static PMINSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x39]; + +/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed minimum values in +/// xmm1 (SSE2). +pub static PMINSW: [u8; 3] = [0x66, 0x0f, 0xea]; + +/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed minimum values in +/// xmm1 (SSE2). +pub static PMINUB: [u8; 3] = [0x66, 0x0f, 0xda]; + +/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed minimum +/// values in xmm1 (SSE4.1). +pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b]; + +/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed minimum values in +/// xmm1 (SSE4.1). +pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a]; + +/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20]; + +/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23]; + +/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25]; + +/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30]; + +/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33]; + +/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35]; + +/// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of +/// the results in xmm1 (SSE2). +pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5]; + +/// Multiply the packed doubleword signed integers in xmm1 and xmm2/m128 and store the low 32 +/// bits of each product in xmm1 (SSE4.1). +pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40]; + +/// Multiply the packed quadword signed integers in xmm2 and xmm3/m128 and store the low 64 +/// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding. +pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40]; + +/// Multiply packed unsigned doubleword integers in xmm1 by packed unsigned doubleword integers +/// in xmm2/m128, and store the quadword results in xmm1 (SSE2). +pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4]; + +/// Pop top of stack into r{16,32,64}; increment stack pointer. +pub static POP_REG: [u8; 1] = [0x58]; + +/// Returns the count of number of bits set to 1. +pub static POPCNT: [u8; 3] = [0xf3, 0x0f, 0xb8]; + +/// Bitwise OR of xmm2/m128 and xmm1 (SSE2). 
+pub static POR: [u8; 3] = [0x66, 0x0f, 0xeb]; + +/// Shuffle bytes in xmm1 according to contents of xmm2/m128 (SSE3). +pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00]; + +/// Shuffle the doublewords in xmm2/m128 based on the encoding in imm8 and +/// store the result in xmm1 (SSE2). +pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70]; + +/// Shift words in xmm1 by imm8; the direction and sign-bit behavior is controlled by the RRR +/// digit used in the ModR/M byte (SSE2). +pub static PS_W_IMM: [u8; 3] = [0x66, 0x0f, 0x71]; + +/// Shift doublewords in xmm1 by imm8; the direction and sign-bit behavior is controlled by the RRR +/// digit used in the ModR/M byte (SSE2). +pub static PS_D_IMM: [u8; 3] = [0x66, 0x0f, 0x72]; + +/// Shift quadwords in xmm1 by imm8; the direction and sign-bit behavior is controlled by the RRR +/// digit used in the ModR/M byte (SSE2). +pub static PS_Q_IMM: [u8; 3] = [0x66, 0x0f, 0x73]; + +/// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1]; + +/// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2]; + +/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3]; + +/// Shift words in xmm1 right by xmm2/m128 while shifting in 0s (SSE2). +pub static PSRLW: [u8; 3] = [0x66, 0x0f, 0xd1]; + +/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2). +pub static PSRLD: [u8; 3] = [0x66, 0x0f, 0xd2]; + +/// Shift quadwords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2). +pub static PSRLQ: [u8; 3] = [0x66, 0x0f, 0xd3]; + +/// Shift words in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2). +pub static PSRAW: [u8; 3] = [0x66, 0x0f, 0xe1]; + +/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2). +pub static PSRAD: [u8; 3] = [0x66, 0x0f, 0xe2]; + +/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2). +pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8]; + +/// Subtract packed word integers in xmm2/m128 from packed word integers in xmm1 (SSE2). +pub static PSUBW: [u8; 3] = [0x66, 0x0f, 0xf9]; + +/// Subtract packed doubleword integers in xmm2/m128 from doubleword byte integers in xmm1 (SSE2). +pub static PSUBD: [u8; 3] = [0x66, 0x0f, 0xfa]; + +/// Subtract packed quadword integers in xmm2/m128 from xmm1 (SSE2). +pub static PSUBQ: [u8; 3] = [0x66, 0x0f, 0xfb]; + +/// Subtract packed signed byte integers in xmm2/m128 from packed signed byte integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBSB: [u8; 3] = [0x66, 0x0f, 0xe8]; + +/// Subtract packed signed word integers in xmm2/m128 from packed signed word integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBSW: [u8; 3] = [0x66, 0x0f, 0xe9]; + +/// Subtract packed unsigned byte integers in xmm2/m128 from packed unsigned byte integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBUSB: [u8; 3] = [0x66, 0x0f, 0xd8]; + +/// Subtract packed unsigned word integers in xmm2/m128 from packed unsigned word integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9]; + +/// Set ZF if xmm2/m128 AND xmm1 result is all 0s; set CF if xmm2/m128 AND NOT xmm1 result is all +/// 0s (SSE4.1). +pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17]; + +/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2). 
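For the grouped shift opcodes above (PS_W_IMM, PS_D_IMM, PS_Q_IMM), the rrr digit in the ModR/M byte selects the operation; per the standard Intel encoding, /2, /4 and /6 of 66 0F 71 are PSRLW, PSRAW and PSLLW respectively. A hand-assembled example (illustrative, not emitter code):

    /// ModR/M for the register form of a "/digit" opcode: mod=11, reg=digit, rm=register.
    fn modrm_digit(digit: u8, rm: u8) -> u8 {
        0b1100_0000 | ((digit & 7) << 3) | (rm & 7)
    }

    fn main() {
        // psllw xmm1, 3  ==  PS_W_IMM (66 0F 71) with rrr = 6 and imm8 = 3.
        let xmm1 = 1u8;
        let bytes = [0x66, 0x0f, 0x71, modrm_digit(6, xmm1), 3];
        assert_eq!(bytes, [0x66, 0x0f, 0x71, 0xf1, 0x03]);
    }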
+pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];
+
+/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69];
+
+/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6A];
+
+/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6D];
+
+/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60];
+
+/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61];
+
+/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62];
+
+/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
+pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6C];
+
+/// Push r{16,32,64}.
+pub static PUSH_REG: [u8; 1] = [0x50];
+
+/// Logical exclusive OR (SSE2).
+pub static PXOR: [u8; 3] = [0x66, 0x0f, 0xef];
+
+/// Near return to calling procedure.
+pub static RET_NEAR: [u8; 1] = [0xc3];
+
+/// General rotation opcode. Kind of rotation depends on encoding.
+pub static ROTATE_CL: [u8; 1] = [0xd3];
+
+/// General rotation opcode. Kind of rotation depends on encoding.
+pub static ROTATE_IMM8: [u8; 1] = [0xc1];
+
+/// Round scalar double-precision floating-point values.
+pub static ROUNDSD: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0b];
+
+/// Round scalar single-precision floating-point values.
+pub static ROUNDSS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0a];
+
+/// Subtract with borrow r{16,32,64} from r/m of the same size.
+pub static SBB: [u8; 1] = [0x19];
+
+/// Set byte if overflow (OF=1).
+pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90];
+
+/// Compute the square root of the packed double-precision floating-point values and store the
+/// result in xmm1 (SSE2).
+pub static SQRTPD: [u8; 3] = [0x66, 0x0f, 0x51];
+
+/// Compute the square root of the packed single-precision floating-point values and store the
+/// result in xmm1 (SSE).
+pub static SQRTPS: [u8; 2] = [0x0f, 0x51];
+
+/// Compute square root of scalar double-precision floating-point value.
+pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51];
+
+/// Compute square root of scalar single-precision floating-point value.
+pub static SQRTSS: [u8; 3] = [0xf3, 0x0f, 0x51];
+
+/// Subtract r{16,32,64} from r/m of the same size.
+pub static SUB: [u8; 1] = [0x29];
+
+/// Subtract packed double-precision floating-point values in xmm2/mem from xmm1 and store result
+/// in xmm1 (SSE2).
+pub static SUBPD: [u8; 3] = [0x66, 0x0f, 0x5c];
+
+/// Subtract packed single-precision floating-point values in xmm2/mem from xmm1 and store result
+/// in xmm1 (SSE).
+pub static SUBPS: [u8; 2] = [0x0f, 0x5c];
+
+/// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1
+/// and store the result in xmm1.
+pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c];
+
+/// Subtract the low single-precision floating-point value in xmm2/m32 from xmm1
+/// and store the result in xmm1.
+pub static SUBSS: [u8; 3] = [0xf3, 0x0f, 0x5c];
+
+/// AND r8 with r/m8; set SF, ZF, PF according to result.
+pub static TEST_BYTE_REG: [u8; 1] = [0x84];
+
+/// AND {r16, r32, r64} with r/m of the same size; set SF, ZF, PF according to result.
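"Unpack and interleave" in the PUNPCK* comments above means the destination alternates elements from the two operands, starting with its own. A small scalar model of PUNPCKLBW (illustrative only; the real instruction interleaves the low 8 bytes of two 128-bit registers):

    fn main() {
        let a: [u8; 4] = [0x01, 0x02, 0x03, 0x04]; // low bytes of the destination
        let b: [u8; 4] = [0x11, 0x12, 0x13, 0x14]; // low bytes of the source
        let mut dst = [0u8; 8];
        for i in 0..4 {
            dst[2 * i] = a[i];      // destination byte first...
            dst[2 * i + 1] = b[i];  // ...then the matching source byte
        }
        assert_eq!(dst, [0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14]);
    }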
+pub static TEST_REG: [u8; 1] = [0x85]; + +/// Count the number of trailing zero bits. +pub static TZCNT: [u8; 3] = [0xf3, 0x0f, 0xbc]; + +/// Compare low double-precision floating-point values in xmm1 and xmm2/mem64 +/// and set the EFLAGS flags accordingly. +pub static UCOMISD: [u8; 3] = [0x66, 0x0f, 0x2e]; + +/// Compare low single-precision floating-point values in xmm1 and xmm2/mem32 +/// and set the EFLAGS flags accordingly. +pub static UCOMISS: [u8; 2] = [0x0f, 0x2e]; + +/// Raise invalid opcode instruction. +pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b]; + +/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed +/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior +/// is controlled by MXCSR but can be overriden by EVEX.L'L in static rounding mode +/// (AVX512VL, AVX512F). +pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a]; + +/// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended. +pub static XOR_IMM: [u8; 1] = [0x81]; + +/// r/m{16,32,64} XOR sign-extended imm8. +pub static XOR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; + +/// r/m{16,32,64} XOR register of the same size. +pub static XOR: [u8; 1] = [0x31]; + +/// r/m8 XOR r8. +pub static XORB: [u8; 1] = [0x30]; + +/// Bitwise logical XOR of packed double-precision floating-point values. +pub static XORPD: [u8; 3] = [0x66, 0x0f, 0x57]; + +/// Bitwise logical XOR of packed single-precision floating-point values. +pub static XORPS: [u8; 2] = [0x0f, 0x57]; diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs new file mode 100644 index 0000000000..f45f8dc673 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/recipes.rs @@ -0,0 +1,3445 @@ +//! Encoding recipes for x86/x86_64. +use std::rc::Rc; + +use cranelift_codegen_shared::isa::x86::EncodingBits; + +use crate::cdsl::ast::Literal; +use crate::cdsl::formats::InstructionFormat; +use crate::cdsl::instructions::InstructionPredicate; +use crate::cdsl::recipes::{ + EncodingRecipe, EncodingRecipeBuilder, OperandConstraint, Register, Stack, +}; +use crate::cdsl::regs::IsaRegs; +use crate::cdsl::settings::SettingGroup; +use crate::shared::Definitions as SharedDefinitions; + +use crate::isa::x86::opcodes; + +/// Helper data structure to create recipes and template recipes. +/// It contains all the recipes and recipe templates that might be used in the encodings crate of +/// this same directory. +pub(crate) struct RecipeGroup<'builder> { + /// Memoized registers description, to pass it to builders later. + regs: &'builder IsaRegs, + + /// All the recipes explicitly created in this file. This is different from the final set of + /// recipes, which is definitive only once encodings have generated new recipes on the fly. + recipes: Vec<EncodingRecipe>, + + /// All the recipe templates created in this file. 
+ templates: Vec<Rc<Template<'builder>>>, +} + +impl<'builder> RecipeGroup<'builder> { + fn new(regs: &'builder IsaRegs) -> Self { + Self { + regs, + recipes: Vec::new(), + templates: Vec::new(), + } + } + fn add_recipe(&mut self, recipe: EncodingRecipeBuilder) { + self.recipes.push(recipe.build()); + } + fn add_template_recipe(&mut self, recipe: EncodingRecipeBuilder) -> Rc<Template<'builder>> { + let template = Rc::new(Template::new(recipe, self.regs)); + self.templates.push(template.clone()); + template + } + fn add_template_inferred( + &mut self, + recipe: EncodingRecipeBuilder, + infer_function: &'static str, + ) -> Rc<Template<'builder>> { + let template = + Rc::new(Template::new(recipe, self.regs).inferred_rex_compute_size(infer_function)); + self.templates.push(template.clone()); + template + } + fn add_template(&mut self, template: Template<'builder>) -> Rc<Template<'builder>> { + let template = Rc::new(template); + self.templates.push(template.clone()); + template + } + pub fn recipe(&self, name: &str) -> &EncodingRecipe { + self.recipes + .iter() + .find(|recipe| recipe.name == name) + .unwrap_or_else(|| panic!("unknown recipe name: {}. Try template?", name)) + } + pub fn template(&self, name: &str) -> &Template { + self.templates + .iter() + .find(|recipe| recipe.name() == name) + .unwrap_or_else(|| panic!("unknown template name: {}. Try recipe?", name)) + } +} + +// Opcode representation. +// +// Cranelift requires each recipe to have a single encoding size in bytes, and x86 opcodes are +// variable length, so we use separate recipes for different styles of opcodes and prefixes. The +// opcode format is indicated by the recipe name prefix. +// +// The match case below does not include the REX prefix which goes after the mandatory prefix. +// VEX/XOP and EVEX prefixes are not yet supported. Encodings using any of these prefixes are +// represented by separate recipes. +// +// The encoding bits are: +// +// 0-7: The opcode byte <op>. +// 8-9: pp, mandatory prefix: +// 00 none (Op*) +// 01 66 (Mp*) +// 10 F3 (Mp*) +// 11 F2 (Mp*) +// 10-11: mm, opcode map: +// 00 <op> (Op1/Mp1) +// 01 0F <op> (Op2/Mp2) +// 10 0F 38 <op> (Op3/Mp3) +// 11 0F 3A <op> (Op3/Mp3) +// 12-14 rrr, opcode bits for the ModR/M byte for certain opcodes. +// 15: REX.W bit (or VEX.W/E) +// +// There is some redundancy between bits 8-11 and the recipe names, but we have enough bits, and +// the pp+mm format is ready for supporting VEX prefixes. +// +// TODO Cranelift doesn't actually require recipe to have different encoding sizes anymore, so this +// could be simplified. + +/// Given a sequence of opcode bytes, compute the recipe name prefix and encoding bits. +fn decode_opcodes(op_bytes: &[u8], rrr: u16, w: u16) -> (&'static str, u16) { + let enc = EncodingBits::new(op_bytes, rrr, w); + (enc.prefix().recipe_name_prefix(), enc.bits()) +} + +/// Given a snippet of Rust code (or None), replace the `PUT_OP` macro with the +/// corresponding `put_*` function from the `binemit.rs` module. +fn replace_put_op(code: Option<String>, prefix: &str) -> Option<String> { + code.map(|code| code.replace("{{PUT_OP}}", &format!("put_{}", prefix.to_lowercase()))) +} + +/// Replaces constraints to a REX-prefixed register class by the equivalent non-REX register class. 
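The encoding-bits comment above fully determines how an opcode sequence is packed into a u16. A standalone sketch of that layout (the real EncodingBits type lives in cranelift-codegen-shared and is only mirrored here for illustration):

    /// Pack the bits per the layout above: 0-7 opcode byte, 8-9 pp (mandatory prefix),
    /// 10-11 mm (opcode map), 12-14 rrr, 15 REX.W.
    fn pack_bits(op: u8, pp: u16, mm: u16, rrr: u16, w: u16) -> u16 {
        u16::from(op) | (pp << 8) | (mm << 10) | (rrr << 12) | (w << 15)
    }

    fn main() {
        // MULSD = [0xf2, 0x0f, 0x59]: F2 prefix => pp = 0b11, single 0F escape => mm = 0b01.
        assert_eq!(pack_bits(0x59, 0b11, 0b01, 0, 0), 0x0759);
    }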
+fn replace_nonrex_constraints( + regs: &IsaRegs, + constraints: Vec<OperandConstraint>, +) -> Vec<OperandConstraint> { + constraints + .into_iter() + .map(|constraint| match constraint { + OperandConstraint::RegClass(rc_index) => { + let new_rc_index = if rc_index == regs.class_by_name("GPR") { + regs.class_by_name("GPR8") + } else if rc_index == regs.class_by_name("FPR") { + regs.class_by_name("FPR8") + } else { + rc_index + }; + OperandConstraint::RegClass(new_rc_index) + } + _ => constraint, + }) + .collect() +} + +fn replace_evex_constraints( + _: &IsaRegs, + constraints: Vec<OperandConstraint>, +) -> Vec<OperandConstraint> { + constraints + .into_iter() + .map(|constraint| match constraint { + OperandConstraint::RegClass(rc_index) => { + // FIXME(#1306) this should be able to upgrade the register class to FPR32 as in + // `replace_nonrex_constraints` above, e.g. When FPR32 is re-added, add back in the + // rc_index conversion to FPR32. In the meantime, this is effectively a no-op + // conversion--the register class stays the same. + OperandConstraint::RegClass(rc_index) + } + _ => constraint, + }) + .collect() +} + +/// Specifies how the prefix (e.g. REX) is emitted by a Recipe. +#[derive(Copy, Clone, PartialEq)] +pub enum RecipePrefixKind { + /// The REX emission behavior is not hardcoded for the Recipe + /// and may be overridden when using the Template. + Unspecified, + + /// The Recipe must hardcode the non-emission of the REX prefix. + NeverEmitRex, + + /// The Recipe must hardcode the emission of the REX prefix. + AlwaysEmitRex, + + /// The Recipe should infer the emission of the REX.RXB bits from registers, + /// and the REX.W bit from the EncodingBits. + /// + /// Because such a Recipe has a non-constant instruction size, it must have + /// a special `compute_size` handler for the inferrable-REX case. + InferRex, + + /// The Recipe must hardcode the emission of an EVEX prefix. + Evex, +} + +impl Default for RecipePrefixKind { + fn default() -> Self { + Self::Unspecified + } +} + +/// Previously called a TailRecipe in the Python meta language, this allows to create multiple +/// variants of a single base EncodingRecipe (rex prefix, specialized w/rrr bits, different +/// opcodes). It serves as a prototype of an EncodingRecipe, which is then used when actually creating +/// Encodings, in encodings.rs. This is an idiosyncrasy of the x86 meta-language, and could be +/// reconsidered later. +#[derive(Clone)] +pub(crate) struct Template<'builder> { + /// Description of registers, used in the build() method. + regs: &'builder IsaRegs, + + /// The recipe template, which is to be specialized (by copy). + recipe: EncodingRecipeBuilder, + + /// How is the REX prefix emitted? + rex_kind: RecipePrefixKind, + + /// Function for `compute_size()` when REX is inferrable. + inferred_rex_compute_size: Option<&'static str>, + + /// Other recipe to use when REX-prefixed. + when_prefixed: Option<Rc<Template<'builder>>>, + + // Parameters passed in the EncodingBits. + /// Value of the W bit (0 or 1), stored in the EncodingBits. + w_bit: u16, + /// Value of the RRR bits (between 0 and 0b111). + rrr_bits: u16, + /// Opcode bytes. 
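Since RecipePrefixKind is entirely about whether and how a REX byte is emitted, it helps to recall that prefix's layout: 0b0100_WRXB, where W selects 64-bit operand size and R/X/B extend the ModR/M and SIB register fields so that r8-r15 and xmm8-xmm15 become reachable. A small refresher using the standard x86-64 encoding (not code from this crate):

    /// Build a REX prefix byte: 0b0100_WRXB.
    fn rex(w: bool, r: bool, x: bool, b: bool) -> u8 {
        0x40 | ((w as u8) << 3) | ((r as u8) << 2) | ((x as u8) << 1) | (b as u8)
    }

    fn main() {
        assert_eq!(rex(true, false, false, false), 0x48);  // REX.W: 64-bit operand size
        assert_eq!(rex(false, false, false, true), 0x41);  // REX.B: r/m can name r8-r15
        assert_eq!(rex(false, false, false, false), 0x40); // "empty" REX still changes byte-reg names
    }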
+ op_bytes: &'static [u8], +} + +impl<'builder> Template<'builder> { + fn new(recipe: EncodingRecipeBuilder, regs: &'builder IsaRegs) -> Self { + Self { + regs, + recipe, + rex_kind: RecipePrefixKind::default(), + inferred_rex_compute_size: None, + when_prefixed: None, + w_bit: 0, + rrr_bits: 0, + op_bytes: &opcodes::EMPTY, + } + } + + fn name(&self) -> &str { + &self.recipe.name + } + fn rex_kind(self, kind: RecipePrefixKind) -> Self { + Self { + rex_kind: kind, + ..self + } + } + fn inferred_rex_compute_size(self, function: &'static str) -> Self { + Self { + inferred_rex_compute_size: Some(function), + ..self + } + } + fn when_prefixed(self, template: Rc<Template<'builder>>) -> Self { + assert!(self.when_prefixed.is_none()); + Self { + when_prefixed: Some(template), + ..self + } + } + + // Copy setters. + pub fn opcodes(&self, op_bytes: &'static [u8]) -> Self { + assert!(!op_bytes.is_empty()); + let mut copy = self.clone(); + copy.op_bytes = op_bytes; + copy + } + pub fn w(&self) -> Self { + let mut copy = self.clone(); + copy.w_bit = 1; + copy + } + pub fn rrr(&self, value: u16) -> Self { + assert!(value <= 0b111); + let mut copy = self.clone(); + copy.rrr_bits = value; + copy + } + pub fn nonrex(&self) -> Self { + assert!( + self.rex_kind != RecipePrefixKind::AlwaysEmitRex, + "Template requires REX prefix." + ); + let mut copy = self.clone(); + copy.rex_kind = RecipePrefixKind::NeverEmitRex; + copy + } + pub fn rex(&self) -> Self { + assert!( + self.rex_kind != RecipePrefixKind::NeverEmitRex, + "Template requires no REX prefix." + ); + if let Some(prefixed) = &self.when_prefixed { + let mut ret = prefixed.rex(); + // Forward specialized parameters. + ret.op_bytes = self.op_bytes; + ret.w_bit = self.w_bit; + ret.rrr_bits = self.rrr_bits; + return ret; + } + let mut copy = self.clone(); + copy.rex_kind = RecipePrefixKind::AlwaysEmitRex; + copy + } + pub fn infer_rex(&self) -> Self { + assert!( + self.rex_kind != RecipePrefixKind::NeverEmitRex, + "Template requires no REX prefix." + ); + assert!( + self.when_prefixed.is_none(), + "infer_rex used with when_prefixed()." + ); + let mut copy = self.clone(); + copy.rex_kind = RecipePrefixKind::InferRex; + copy + } + + pub fn build(mut self) -> (EncodingRecipe, u16) { + let (opcode, bits) = decode_opcodes(&self.op_bytes, self.rrr_bits, self.w_bit); + + let (recipe_name, size_addendum) = match self.rex_kind { + RecipePrefixKind::Unspecified | RecipePrefixKind::NeverEmitRex => { + // Ensure the operands are limited to non-REX constraints. + let operands_in = self.recipe.operands_in.unwrap_or_default(); + self.recipe.operands_in = Some(replace_nonrex_constraints(self.regs, operands_in)); + let operands_out = self.recipe.operands_out.unwrap_or_default(); + self.recipe.operands_out = + Some(replace_nonrex_constraints(self.regs, operands_out)); + + (opcode.into(), self.op_bytes.len() as u64) + } + RecipePrefixKind::AlwaysEmitRex => { + ("Rex".to_string() + opcode, self.op_bytes.len() as u64 + 1) + } + RecipePrefixKind::InferRex => { + assert_eq!(self.w_bit, 0, "A REX.W bit always requires a REX prefix; avoid using `infer_rex().w()` and use `rex().w()` instead."); + // Hook up the right function for inferred compute_size(). 
+ assert!( + self.inferred_rex_compute_size.is_some(), + "InferRex recipe '{}' needs an inferred_rex_compute_size function.", + &self.recipe.name + ); + self.recipe.compute_size = self.inferred_rex_compute_size; + + ("DynRex".to_string() + opcode, self.op_bytes.len() as u64) + } + RecipePrefixKind::Evex => { + // Allow the operands to expand limits to EVEX constraints. + let operands_in = self.recipe.operands_in.unwrap_or_default(); + self.recipe.operands_in = Some(replace_evex_constraints(self.regs, operands_in)); + let operands_out = self.recipe.operands_out.unwrap_or_default(); + self.recipe.operands_out = Some(replace_evex_constraints(self.regs, operands_out)); + + ("Evex".to_string() + opcode, 4 + 1) + } + }; + + self.recipe.base_size += size_addendum; + + // Branch ranges are relative to the end of the instruction. + // For InferRex, the range should be the minimum, assuming no REX. + if let Some(range) = self.recipe.branch_range.as_mut() { + range.inst_size += size_addendum; + } + + self.recipe.emit = replace_put_op(self.recipe.emit, &recipe_name); + self.recipe.name = recipe_name + &self.recipe.name; + + (self.recipe.build(), bits) + } +} + +/// Returns a predicate checking that the "cond" field of the instruction contains one of the +/// directly supported floating point condition codes. +fn supported_floatccs_predicate( + supported_cc: &[Literal], + format: &InstructionFormat, +) -> InstructionPredicate { + supported_cc + .iter() + .fold(InstructionPredicate::new(), |pred, literal| { + pred.or(InstructionPredicate::new_is_field_equal( + format, + "cond", + literal.to_rust_code(), + )) + }) +} + +/// Return an instruction predicate that checks if `iform.imm` is a valid `scale` for a SIB byte. +fn valid_scale(format: &InstructionFormat) -> InstructionPredicate { + ["1", "2", "4", "8"] + .iter() + .fold(InstructionPredicate::new(), |pred, &literal| { + pred.or(InstructionPredicate::new_is_field_equal( + format, + "imm", + literal.into(), + )) + }) +} + +pub(crate) fn define<'shared>( + shared_defs: &'shared SharedDefinitions, + settings: &'shared SettingGroup, + regs: &'shared IsaRegs, +) -> RecipeGroup<'shared> { + // The set of floating point condition codes that are directly supported. + // Other condition codes need to be reversed or expressed as two tests. + let floatcc = &shared_defs.imm.floatcc; + let supported_floatccs: Vec<Literal> = ["ord", "uno", "one", "ueq", "gt", "ge", "ult", "ule"] + .iter() + .map(|name| Literal::enumerator_for(floatcc, name)) + .collect(); + + // Register classes shorthands. + let abcd = regs.class_by_name("ABCD"); + let gpr = regs.class_by_name("GPR"); + let fpr = regs.class_by_name("FPR"); + let flag = regs.class_by_name("FLAG"); + + // Operand constraints shorthands. + let reg_rflags = Register::new(flag, regs.regunit_by_name(flag, "rflags")); + let reg_rax = Register::new(gpr, regs.regunit_by_name(gpr, "rax")); + let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx")); + let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx")); + let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15")); + let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0")); + + // Stack operand with a 32-bit signed displacement from either RBP or RSP. + let stack_gpr32 = Stack::new(gpr); + let stack_fpr32 = Stack::new(fpr); + + let formats = &shared_defs.formats; + + // Predicates shorthands. + let use_sse41 = settings.predicate_by_name("use_sse41"); + + // Definitions. 
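Putting the pieces of build() together: the opcode bytes choose an Op*/Mp* name prefix (per the encoding-bits comment earlier in this file), the RecipePrefixKind may prepend "Rex", "DynRex" or "Evex", and the declared base size grows by the opcode bytes plus any fixed prefix byte. A rough sketch of the naming rule only (the real logic is EncodingBits::prefix().recipe_name_prefix(), which is not shown in this file):

    fn name_prefix(pp: u8, mm: u8) -> String {
        let kind = if pp == 0 { "Op" } else { "Mp" };
        // mm: 00 => <op>, 01 => 0F <op>, 10 => 0F 38 <op>, 11 => 0F 3A <op>
        let opcode_bytes = match mm {
            0 => 1,
            1 => 2,
            _ => 3,
        };
        format!("{}{}", kind, opcode_bytes)
    }

    fn main() {
        // ADDPD = [0x66, 0x0f, 0x58]: pp = 01 (66 prefix), mm = 01 (0F <op>) => "Mp2".
        assert_eq!(name_prefix(0b01, 0b01), "Mp2");
        // Under AlwaysEmitRex, build() names such a recipe "RexMp2..." and adds
        // op_bytes.len() + 1 = 4 bytes to its declared base size.
    }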
+ let mut recipes = RecipeGroup::new(regs); + + // A null unary instruction that takes a GPR register. Can be used for identity copies and + // no-op conversions. + recipes.add_recipe( + EncodingRecipeBuilder::new("null", &formats.unary, 0) + .operands_in(vec![gpr]) + .operands_out(vec![0]) + .emit(""), + ); + recipes.add_recipe( + EncodingRecipeBuilder::new("null_fpr", &formats.unary, 0) + .operands_in(vec![fpr]) + .operands_out(vec![0]) + .emit(""), + ); + recipes.add_recipe( + EncodingRecipeBuilder::new("stacknull", &formats.unary, 0) + .operands_in(vec![stack_gpr32]) + .operands_out(vec![stack_gpr32]) + .emit(""), + ); + + recipes.add_recipe( + EncodingRecipeBuilder::new("get_pinned_reg", &formats.nullary, 0) + .operands_out(vec![reg_r15]) + .emit(""), + ); + // umr with a fixed register output that's r15. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("set_pinned_reg", &formats.unary, 1) + .operands_in(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + let r15 = RU::r15.into(); + {{PUT_OP}}(bits, rex2(r15, in_reg0), sink); + modrm_rr(r15, in_reg0, sink); + "#, + ), + ); + + // No-op fills, created by late-stage redundant-fill removal. + recipes.add_recipe( + EncodingRecipeBuilder::new("fillnull", &formats.unary, 0) + .operands_in(vec![stack_gpr32]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit(""), + ); + recipes.add_recipe( + EncodingRecipeBuilder::new("ffillnull", &formats.unary, 0) + .operands_in(vec![stack_gpr32]) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit(""), + ); + + recipes.add_recipe( + EncodingRecipeBuilder::new("debugtrap", &formats.nullary, 1).emit("sink.put1(0xcc);"), + ); + + // XX opcode, no ModR/M. + recipes.add_template_recipe(EncodingRecipeBuilder::new("trap", &formats.trap, 0).emit( + r#" + sink.trap(code, func.srclocs[inst]); + {{PUT_OP}}(bits, BASE_REX, sink); + "#, + )); + + // Macro: conditional jump over a ud2. + recipes.add_recipe( + EncodingRecipeBuilder::new("trapif", &formats.int_cond_trap, 4) + .operands_in(vec![reg_rflags]) + .clobbers_flags(false) + .emit( + r#" + // Jump over a 2-byte ud2. + sink.put1(0x70 | (icc2opc(cond.inverse()) as u8)); + sink.put1(2); + // ud2. + sink.trap(code, func.srclocs[inst]); + sink.put1(0x0f); + sink.put1(0x0b); + "#, + ), + ); + + recipes.add_recipe( + EncodingRecipeBuilder::new("trapff", &formats.float_cond_trap, 4) + .operands_in(vec![reg_rflags]) + .clobbers_flags(false) + .inst_predicate(supported_floatccs_predicate( + &supported_floatccs, + &*formats.float_cond_trap, + )) + .emit( + r#" + // Jump over a 2-byte ud2. + sink.put1(0x70 | (fcc2opc(cond.inverse()) as u8)); + sink.put1(2); + // ud2. + sink.trap(code, func.srclocs[inst]); + sink.put1(0x0f); + sink.put1(0x0b); + "#, + ), + ); + + // XX /r + recipes.add_template_inferred( + EncodingRecipeBuilder::new("rr", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![0]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + // XX /r with operands swapped. (RM form). + recipes.add_template_inferred( + EncodingRecipeBuilder::new("rrx", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![0]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + // XX /r with FPR ins and outs. A form. 
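The trapif recipe above is worth a byte-level sanity check: it guards a 2-byte ud2 with a short jump on the inverted condition, which is why its declared size is 4. Assuming icc2opc() returns the standard x86 condition nibble (it is defined in binemit.rs, not in this file), trapping on "equal" emits jne +2 followed by ud2:

    fn main() {
        // 0x70 | nibble is the Jcc rel8 family; NE is nibble 0x5, so this is JNE +2.
        let jcc_ne = 0x70u8 | 0x5;
        // Short jump over the trap, then the 2-byte ud2 (0F 0B).
        let bytes = [jcc_ne, 0x02, 0x0f, 0x0b];
        assert_eq!(bytes, [0x75, 0x02, 0x0f, 0x0b]);
        assert_eq!(bytes.len(), 4); // matches the recipe's declared size
    }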
+ recipes.add_template_inferred( + EncodingRecipeBuilder::new("fa", &formats.binary, 1) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + // XX /r with FPR ins and outs. A form with input operands swapped. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fax", &formats.binary, 1) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![1]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + // The operand order does not matter for calculating whether a REX prefix is needed. + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + // XX /r with FPR ins and outs. A form with a byte immediate. + { + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fa_ib", &formats.ternary_imm8, 2) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + &*formats.ternary_imm8, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + } + + // XX /n for a unary operation with extension bits. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("ur", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![0]) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + + // XX /r, but for a unary operator with separate input/output register, like + // copies. MR form, preserving flags. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("umr", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_outreg0"), + ); + + // Same as umr, but with FPR -> GPR registers. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("rfumr", &formats.unary, 1) + .operands_in(vec![fpr]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); + "#, + ), + ); + + // Same as umr, but with the source register specified directly. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("umr_reg_to_ssa", &formats.copy_to_ssa, 1) + // No operands_in to mention, because a source register is specified directly. + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, src), sink); + modrm_rr(out_reg0, src, sink); + "#, + ), + ); + + // XX /r, but for a unary operator with separate input/output register. + // RM form. Clobbers FLAGS. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("urm", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + ); + + // XX /r. Same as urm, but doesn't clobber FLAGS. 
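The size_with_inferred_rex_for_* hooks used by these templates exist because a DynRex recipe's length is only known after register allocation: the REX byte is emitted only when one of the chosen registers is r8-r15 or xmm8-xmm15 (REX.W is excluded for inferred templates, as the assert in build() enforces). A deliberately simplified sketch of that idea (the real helpers live in binemit.rs and are recipe-specific):

    fn size_with_inferred_rex(base_size: u64, hw_regs: &[u8]) -> u64 {
        // One extra byte iff any operand register needs REX.R/X/B (hardware index 8-15).
        let needs_rex = hw_regs.iter().any(|&r| r >= 8);
        base_size + u64::from(needs_rex)
    }

    fn main() {
        assert_eq!(size_with_inferred_rex(2, &[0, 1]), 2); // e.g. rax, rcx: no REX needed
        assert_eq!(size_with_inferred_rex(2, &[0, 9]), 3); // r9 forces the prefix
    }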
+ let urm_noflags = recipes.add_template_recipe( + EncodingRecipeBuilder::new("urm_noflags", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + ); + + // XX /r. Same as urm_noflags, but input limited to ABCD. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("urm_noflags_abcd", &formats.unary, 1) + .operands_in(vec![abcd]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + regs, + ) + .when_prefixed(urm_noflags), + ); + + // XX /r, RM form, FPR -> FPR. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("furm", &formats.unary, 1) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_outreg0", + ); + + // Same as furm, but with the source register specified directly. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("furm_reg_to_ssa", &formats.copy_to_ssa, 1) + // No operands_in to mention, because a source register is specified directly. + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(src, out_reg0), sink); + modrm_rr(src, out_reg0, sink); + "#, + ), + ); + + // XX /r, RM form, GPR -> FPR. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("frurm", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_outreg0", + ); + + // XX /r, RM form, FPR -> GPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("rfurm", &formats.unary, 1) + .operands_in(vec![fpr]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + "#, + ), + ); + + // XX /r, RMI form for one of the roundXX SSE 4.1 instructions. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("furmi_rnd", &formats.unary, 2) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .isa_predicate(use_sse41) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + sink.put1(match opcode { + Opcode::Nearest => 0b00, + Opcode::Floor => 0b01, + Opcode::Ceil => 0b10, + Opcode::Trunc => 0b11, + x => panic!("{} unexpected for furmi_rnd", opcode), + }); + "#, + ), + ); + + // XX /r, for regmove instructions. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("rmov", &formats.reg_move, 1) + .operands_in(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(dst, src), sink); + modrm_rr(dst, src, sink); + "#, + ), + ); + + // XX /r, for regmove instructions (FPR version, RM encoded). + recipes.add_template_recipe( + EncodingRecipeBuilder::new("frmov", &formats.reg_move, 1) + .operands_in(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(src, dst), sink); + modrm_rr(src, dst, sink); + "#, + ), + ); + + // XX /n with one arg in %rcx, for shifts. 
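// Illustrative aside (not from the vendored file): x86 variable-count shifts take
// their count implicitly in %cl, which is why the `rc` recipe below pins its second
// input to %rcx. A hypothetical byte-level sketch of the 64-bit shift-by-CL group:
fn shift_rm64_by_cl_sketch(ext: u8, rm: u8) -> [u8; 3] {
    // REX.W + D3 /ext, where /4 = shl, /5 = shr and /7 = sar; no immediate follows
    // because the count comes from cl.
    [
        0x48 | ((rm >> 3) & 1),
        0xd3,
        0b11_000_000 | ((ext & 7) << 3) | (rm & 7),
    ]
}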
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("rc", &formats.binary, 1) + .operands_in(vec![ + OperandConstraint::RegClass(gpr), + OperandConstraint::FixedReg(reg_rcx), + ]) + .operands_out(vec![0]) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + "#, + ), + ); + + // XX /n for division: inputs in %rax, %rdx, r. Outputs in %rax, %rdx. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("div", &formats.ternary, 1) + .operands_in(vec![ + OperandConstraint::FixedReg(reg_rax), + OperandConstraint::FixedReg(reg_rdx), + OperandConstraint::RegClass(gpr), + ]) + .operands_out(vec![reg_rax, reg_rdx]) + .emit( + r#" + sink.trap(TrapCode::IntegerDivisionByZero, func.srclocs[inst]); + {{PUT_OP}}(bits, rex1(in_reg2), sink); + modrm_r_bits(in_reg2, bits, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg2"), + ); + + // XX /n for {s,u}mulx: inputs in %rax, r. Outputs in %rdx(hi):%rax(lo) + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("mulx", &formats.binary, 1) + .operands_in(vec![ + OperandConstraint::FixedReg(reg_rax), + OperandConstraint::RegClass(gpr), + ]) + .operands_out(vec![ + OperandConstraint::FixedReg(reg_rax), + OperandConstraint::FixedReg(reg_rdx), + ]) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg1), sink); + modrm_r_bits(in_reg1, bits, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"), + ); + + // XX /r for BLEND* instructions + recipes.add_template_inferred( + EncodingRecipeBuilder::new("blend", &formats.ternary, 1) + .operands_in(vec![ + OperandConstraint::FixedReg(reg_xmm0), + OperandConstraint::RegClass(fpr), + OperandConstraint::RegClass(fpr), + ]) + .operands_out(vec![2]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink); + modrm_rr(in_reg1, in_reg2, sink); + "#, + ), + "size_with_inferred_rex_for_inreg1_inreg2", + ); + + // XX /n ib with 8-bit immediate sign-extended. + { + recipes.add_template_inferred( + EncodingRecipeBuilder::new("r_ib", &formats.binary_imm64, 2) + .operands_in(vec![gpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.binary_imm64, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + "size_with_inferred_rex_for_inreg0", + ); + + recipes.add_template_inferred( + EncodingRecipeBuilder::new("f_ib", &formats.binary_imm64, 2) + .operands_in(vec![fpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.binary_imm64, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + "size_with_inferred_rex_for_inreg0", + ); + + // XX /n id with 32-bit immediate sign-extended. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("r_id", &formats.binary_imm64, 5) + .operands_in(vec![gpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.binary_imm64, + "imm", + 32, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + } + + // XX /r ib with 8-bit unsigned immediate (e.g. 
for pshufd) + { + recipes.add_template_inferred( + EncodingRecipeBuilder::new("r_ib_unsigned_fpr", &formats.binary_imm8, 2) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + &*formats.binary_imm8, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + "size_with_inferred_rex_for_inreg0_outreg0", + ); + } + + // XX /r ib with 8-bit unsigned immediate (e.g. for extractlane) + { + recipes.add_template_inferred( + EncodingRecipeBuilder::new("r_ib_unsigned_gpr", &formats.binary_imm8, 2) + .operands_in(vec![fpr]) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + &*formats.binary_imm8, "imm", 8, 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, in_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), "size_with_inferred_rex_for_inreg0_outreg0" + ); + } + + // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane) + { + recipes.add_template_inferred( + EncodingRecipeBuilder::new("r_ib_unsigned_r", &formats.ternary_imm8, 2) + .operands_in(vec![fpr, gpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + &*formats.ternary_imm8, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + } + + { + // XX /n id with 32-bit immediate sign-extended. UnaryImm version. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("u_id", &formats.unary_imm, 5) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.unary_imm, + "imm", + 32, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(out_reg0), sink); + modrm_r_bits(out_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + "#, + ), + ); + } + + // XX+rd id unary with 32-bit immediate. Note no recipe predicate. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pu_id", &formats.unary_imm, 4) + .operands_out(vec![gpr]) + .emit( + r#" + // The destination register is encoded in the low bits of the opcode. + // No ModR/M. + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + "#, + ), + ); + + // XX+rd id unary with bool immediate. Note no recipe predicate. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pu_id_bool", &formats.unary_bool, 4) + .operands_out(vec![gpr]) + .emit( + r#" + // The destination register is encoded in the low bits of the opcode. + // No ModR/M. + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + let imm: u32 = if imm { 1 } else { 0 }; + sink.put4(imm); + "#, + ), + ); + + // XX+rd id nullary with 0 as 32-bit immediate. Note no recipe predicate. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pu_id_ref", &formats.nullary, 4) + .operands_out(vec![gpr]) + .emit( + r#" + // The destination register is encoded in the low bits of the opcode. + // No ModR/M. + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.put4(0); + "#, + ), + ); + + // XX+rd iq unary with 64-bit immediate. 
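// Illustrative aside (not from the vendored file): the `new_is_signed_int(.., 8, 0)`
// and `(.., 32, 0)` predicates above are assumed to gate the short `*_ib` / `*_id`
// immediate forms on whether the value survives a sign-extending round trip; only
// values that fail both checks need the full 64-bit form that follows.
fn fits_simm8_sketch(imm: i64) -> bool {
    imm == i64::from(imm as i8) // eligible for an 8-bit, sign-extended immediate
}
fn fits_simm32_sketch(imm: i64) -> bool {
    imm == i64::from(imm as i32) // eligible for a 32-bit, sign-extended immediate
}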
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("pu_iq", &formats.unary_imm, 8) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + let imm: i64 = imm.into(); + sink.put8(imm as u64); + "#, + ), + ); + + // XX+rd id unary with zero immediate. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("u_id_z", &formats.unary_imm, 1) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink); + modrm_rr(out_reg0, out_reg0, sink); + "#, + ), + ); + + // XX /n Unary with floating point 32-bit immediate equal to zero. + { + recipes.add_template_recipe( + EncodingRecipeBuilder::new("f32imm_z", &formats.unary_ieee32, 1) + .operands_out(vec![fpr]) + .inst_predicate(InstructionPredicate::new_is_zero_32bit_float( + &*formats.unary_ieee32, + "imm", + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink); + modrm_rr(out_reg0, out_reg0, sink); + "#, + ), + ); + } + + // XX /n Unary with floating point 64-bit immediate equal to zero. + { + recipes.add_template_recipe( + EncodingRecipeBuilder::new("f64imm_z", &formats.unary_ieee64, 1) + .operands_out(vec![fpr]) + .inst_predicate(InstructionPredicate::new_is_zero_64bit_float( + &*formats.unary_ieee64, + "imm", + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink); + modrm_rr(out_reg0, out_reg0, sink); + "#, + ), + ); + } + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pushq", &formats.unary, 0) + .operands_in(vec![gpr]) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + {{PUT_OP}}(bits | (in_reg0 & 7), rex1(in_reg0), sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("popq", &formats.nullary, 0) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + "#, + ), + ); + + // XX /r, for regmove instructions. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("copysp", &formats.copy_special, 1) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(dst, src), sink); + modrm_rr(dst, src, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("adjustsp", &formats.unary, 1) + .operands_in(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(RU::rsp.into(), in_reg0), sink); + modrm_rr(RU::rsp.into(), in_reg0, sink); + "#, + ), + ); + + { + recipes.add_template_recipe( + EncodingRecipeBuilder::new("adjustsp_ib", &formats.unary_imm, 2) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.unary_imm, + "imm", + 8, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(RU::rsp.into()), sink); + modrm_r_bits(RU::rsp.into(), bits, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("adjustsp_id", &formats.unary_imm, 5) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.unary_imm, + "imm", + 32, + 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex1(RU::rsp.into()), sink); + modrm_r_bits(RU::rsp.into(), bits, sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + "#, + ), + ); + } + + recipes.add_recipe( + EncodingRecipeBuilder::new("dummy_sarg_t", &formats.nullary, 0) + .operands_out(vec![Stack::new(gpr)]) + .emit(""), + ); + + // XX+rd id with Abs4 function relocation. 
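// Illustrative aside (not from the vendored file): the "XX+rd" recipes fold the
// destination register into the opcode byte itself instead of emitting a ModR/M
// byte; `bits | (out_reg0 & 7)` supplies the low three bits and REX.B the fourth.
// A hypothetical rendering of the classic example, mov r64, imm64 (REX.W B8+rd io):
fn mov_r64_imm64_sketch(reg: u8, imm: u64) -> Vec<u8> {
    let mut bytes = vec![0x48 | ((reg >> 3) & 1), 0xb8 | (reg & 7)];
    bytes.extend_from_slice(&imm.to_le_bytes()); // 8-byte little-endian immediate
    bytes
}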
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("fnaddr4", &formats.func_addr, 4) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs4, + &func.dfg.ext_funcs[func_ref].name, + 0); + sink.put4(0); + "#, + ), + ); + + // XX+rd iq with Abs8 function relocation. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fnaddr8", &formats.func_addr, 8) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs8, + &func.dfg.ext_funcs[func_ref].name, + 0); + sink.put8(0); + "#, + ), + ); + + // Similar to fnaddr4, but writes !0 (this is used by BaldrMonkey). + recipes.add_template_recipe( + EncodingRecipeBuilder::new("allones_fnaddr4", &formats.func_addr, 4) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs4, + &func.dfg.ext_funcs[func_ref].name, + 0); + // Write the immediate as `!0` for the benefit of BaldrMonkey. + sink.put4(!0); + "#, + ), + ); + + // Similar to fnaddr8, but writes !0 (this is used by BaldrMonkey). + recipes.add_template_recipe( + EncodingRecipeBuilder::new("allones_fnaddr8", &formats.func_addr, 8) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs8, + &func.dfg.ext_funcs[func_ref].name, + 0); + // Write the immediate as `!0` for the benefit of BaldrMonkey. + sink.put8(!0); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pcrel_fnaddr8", &formats.func_addr, 5) + .operands_out(vec![gpr]) + // rex2 gets passed 0 for r/m register because the upper bit of + // r/m doesn't get decoded when in rip-relative addressing mode. + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_riprel(out_reg0, sink); + // The addend adjusts for the difference between the end of the + // instruction and the beginning of the immediate field. + sink.reloc_external(func.srclocs[inst], + Reloc::X86PCRel4, + &func.dfg.ext_funcs[func_ref].name, + -4); + sink.put4(0); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("got_fnaddr8", &formats.func_addr, 5) + .operands_out(vec![gpr]) + // rex2 gets passed 0 for r/m register because the upper bit of + // r/m doesn't get decoded when in rip-relative addressing mode. + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_riprel(out_reg0, sink); + // The addend adjusts for the difference between the end of the + // instruction and the beginning of the immediate field. + sink.reloc_external(func.srclocs[inst], + Reloc::X86GOTPCRel4, + &func.dfg.ext_funcs[func_ref].name, + -4); + sink.put4(0); + "#, + ), + ); + + // XX+rd id with Abs4 globalsym relocation. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("gvaddr4", &formats.unary_global_value, 4) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs4, + &func.global_values[global_value].symbol_name(), + 0); + sink.put4(0); + "#, + ), + ); + + // XX+rd iq with Abs8 globalsym relocation. 
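// Illustrative aside (not from the vendored file): the pc-relative recipes above,
// and the global-value variants below, pass an addend of -4 because the CPU adds
// the 32-bit field to the address of the *next* instruction (four bytes past the
// field), while an `S + A - P`-style relocation is resolved against the field
// address P itself. Assuming that convention:
fn pcrel4_field_value_sketch(symbol: i64, field_addr: i64) -> i32 {
    let addend: i64 = -4;
    // stored value = S + A - P = symbol - (field_addr + 4), exactly the
    // displacement the hardware expects relative to the end of the instruction.
    (symbol + addend - field_addr) as i32
}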
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("gvaddr8", &formats.unary_global_value, 8) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits | (out_reg0 & 7), rex1(out_reg0), sink); + sink.reloc_external(func.srclocs[inst], + Reloc::Abs8, + &func.global_values[global_value].symbol_name(), + 0); + sink.put8(0); + "#, + ), + ); + + // XX+rd iq with PCRel4 globalsym relocation. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("pcrel_gvaddr8", &formats.unary_global_value, 5) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_rm(5, out_reg0, sink); + // The addend adjusts for the difference between the end of the + // instruction and the beginning of the immediate field. + sink.reloc_external(func.srclocs[inst], + Reloc::X86PCRel4, + &func.global_values[global_value].symbol_name(), + -4); + sink.put4(0); + "#, + ), + ); + + // XX+rd iq with Abs8 globalsym relocation. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("got_gvaddr8", &formats.unary_global_value, 5) + .operands_out(vec![gpr]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_rm(5, out_reg0, sink); + // The addend adjusts for the difference between the end of the + // instruction and the beginning of the immediate field. + sink.reloc_external(func.srclocs[inst], + Reloc::X86GOTPCRel4, + &func.global_values[global_value].symbol_name(), + -4); + sink.put4(0); + "#, + ), + ); + + // Stack addresses. + // + // TODO Alternative forms for 8-bit immediates, when applicable. + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("spaddr_id", &formats.stack_load, 6) + .operands_out(vec![gpr]) + .emit( + r#" + let sp = StackRef::sp(stack_slot, &func.stack_slots); + let base = stk_base(sp.base); + {{PUT_OP}}(bits, rex2(base, out_reg0), sink); + modrm_sib_disp32(out_reg0, sink); + sib_noindex(base, sink); + let imm : i32 = offset.into(); + sink.put4(sp.offset.checked_add(imm).unwrap() as u32); + "#, + ), + ); + + // Constant addresses. + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("const_addr", &formats.unary_const, 5) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_riprel(out_reg0, sink); + const_disp4(constant_handle, func, sink); + "#, + ), + ); + + // Store recipes. + + { + // Simple stores. + + // A predicate asking if the offset is zero. + let has_no_offset = + InstructionPredicate::new_is_field_equal(&*formats.store, "offset", "0".into()); + + // XX /r register-indirect store with no offset. + let st = recipes.add_template_recipe( + EncodingRecipeBuilder::new("st", &formats.store, 1) + .operands_in(vec![gpr, gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else if needs_offset(in_reg1) { + modrm_disp8(in_reg1, in_reg0, sink); + sink.put1(0); + } else { + modrm_rm(in_reg1, in_reg0, sink); + } + "#, + ), + ); + + // XX /r register-indirect store with no offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. 
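// Illustrative aside (not from the vendored file): the `needs_sib_byte` /
// `needs_offset` branches in the store (and later load) recipes mirror two x86
// addressing quirks: a base of rsp/r12 can only be expressed through a SIB byte,
// and rbp/r13 cannot use the displacement-free mod=00 form (that bit pattern means
// disp32 or RIP-relative), so a zero 8-bit displacement is emitted instead.
// Hypothetical predicates over hardware register numbers:
fn needs_sib_byte_sketch(base: u8) -> bool {
    base & 7 == 4 // rsp or r12: r/m = 100 always introduces a SIB byte
}
fn needs_offset_sketch(base: u8) -> bool {
    base & 7 == 5 // rbp or r13: mod = 00 with r/m = 101 would mean disp32/rip-rel
}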
+ recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("st_abcd", &formats.store, 1) + .operands_in(vec![abcd, gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else if needs_offset(in_reg1) { + modrm_disp8(in_reg1, in_reg0, sink); + sink.put1(0); + } else { + modrm_rm(in_reg1, in_reg0, sink); + } + "#, + ), + regs, + ) + .when_prefixed(st), + ); + + // XX /r register-indirect store of FPR with no offset. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fst", &formats.store, 1) + .operands_in(vec![fpr, gpr]) + .inst_predicate(has_no_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_or_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else if needs_offset(in_reg1) { + modrm_disp8(in_reg1, in_reg0, sink); + sink.put1(0); + } else { + modrm_rm(in_reg1, in_reg0, sink); + } + "#, + ), + "size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1", + ); + + let has_small_offset = + InstructionPredicate::new_is_signed_int(&*formats.store, "offset", 8, 0); + + // XX /r register-indirect store with 8-bit offset. + let st_disp8 = recipes.add_template_recipe( + EncodingRecipeBuilder::new("stDisp8", &formats.store, 2) + .operands_in(vec![gpr, gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp8(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + // XX /r register-indirect store with 8-bit offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("stDisp8_abcd", &formats.store, 2) + .operands_in(vec![abcd, gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp8(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + regs, + ) + .when_prefixed(st_disp8), + ); + + // XX /r register-indirect store with 8-bit offset of FPR. 
+ recipes.add_template_inferred( + EncodingRecipeBuilder::new("fstDisp8", &formats.store, 2) + .operands_in(vec![fpr, gpr]) + .inst_predicate(has_small_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp8(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + "size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1", + ); + + // XX /r register-indirect store with 32-bit offset. + let st_disp32 = recipes.add_template_recipe( + EncodingRecipeBuilder::new("stDisp32", &formats.store, 5) + .operands_in(vec![gpr, gpr]) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp32(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp32(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + + // XX /r register-indirect store with 32-bit offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("stDisp32_abcd", &formats.store, 5) + .operands_in(vec![abcd, gpr]) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp32(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp32(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + regs, + ) + .when_prefixed(st_disp32), + ); + + // XX /r register-indirect store with 32-bit offset of FPR. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fstDisp32", &formats.store, 5) + .operands_in(vec![fpr, gpr]) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + if needs_sib_byte(in_reg1) { + modrm_sib_disp32(in_reg0, sink); + sib_noindex(in_reg1, sink); + } else { + modrm_disp32(in_reg1, in_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + "size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1", + ); + } + + { + // Complex stores. + + // A predicate asking if the offset is zero. + let has_no_offset = + InstructionPredicate::new_is_field_equal(&*formats.store_complex, "offset", "0".into()); + + // XX /r register-indirect store with index and no offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndex", &formats.store_complex, 2) + .operands_in(vec![gpr, gpr, gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + // The else branch always inserts an SIB byte. 
+ if needs_offset(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + sink.put1(0); + } else { + modrm_sib(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + } + "#, + ), + ); + + // XX /r register-indirect store with index and no offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndex_abcd", &formats.store_complex, 2) + .operands_in(vec![abcd, gpr, gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + // The else branch always inserts an SIB byte. + if needs_offset(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + sink.put1(0); + } else { + modrm_sib(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + } + "#, + ), + ); + + // XX /r register-indirect store with index and no offset of FPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fstWithIndex", &formats.store_complex, 2) + .operands_in(vec![fpr, gpr, gpr]) + .inst_predicate(has_no_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_offset_for_inreg_1") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + // The else branch always inserts an SIB byte. + if needs_offset(in_reg1) { + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + sink.put1(0); + } else { + modrm_sib(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + } + "#, + ), + ); + + let has_small_offset = + InstructionPredicate::new_is_signed_int(&*formats.store_complex, "offset", 8, 0); + + // XX /r register-indirect store with index and 8-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndexDisp8", &formats.store_complex, 3) + .operands_in(vec![gpr, gpr, gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + // XX /r register-indirect store with index and 8-bit offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndexDisp8_abcd", &formats.store_complex, 3) + .operands_in(vec![abcd, gpr, gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + // XX /r register-indirect store with index and 8-bit offset of FPR. 
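// Illustrative aside (not from the vendored file): the indexed forms build their
// address byte with `sib(scale, index, base)`; the standard SIB layout assumed here
// puts log2(scale) in bits 7-6, the index register in bits 5-3 and the base in bits
// 2-0, with the upper register bits carried by REX.X / REX.B.
fn sib_byte_sketch(scale_log2: u8, index: u8, base: u8) -> u8 {
    ((scale_log2 & 3) << 6) | ((index & 7) << 3) | (base & 7)
}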
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("fstWithIndexDisp8", &formats.store_complex, 3) + .operands_in(vec![fpr, gpr, gpr]) + .inst_predicate(has_small_offset) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp8(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + let has_big_offset = + InstructionPredicate::new_is_signed_int(&*formats.store_complex, "offset", 32, 0); + + // XX /r register-indirect store with index and 32-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndexDisp32", &formats.store_complex, 6) + .operands_in(vec![gpr, gpr, gpr]) + .inst_predicate(has_big_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp32(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + + // XX /r register-indirect store with index and 32-bit offset. + // Only ABCD allowed for stored value. This is for byte stores with no REX. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("stWithIndexDisp32_abcd", &formats.store_complex, 6) + .operands_in(vec![abcd, gpr, gpr]) + .inst_predicate(has_big_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp32(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + + // XX /r register-indirect store with index and 32-bit offset of FPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fstWithIndexDisp32", &formats.store_complex, 6) + .operands_in(vec![fpr, gpr, gpr]) + .inst_predicate(has_big_offset) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg1, in_reg0, in_reg2), sink); + modrm_sib_disp32(in_reg0, sink); + sib(0, in_reg2, in_reg1, sink); + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + } + + // Unary spill with SIB and 32-bit displacement. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("spillSib32", &formats.unary, 6) + .operands_in(vec![gpr]) + .operands_out(vec![stack_gpr32]) + .clobbers_flags(false) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + let base = stk_base(out_stk0.base); + {{PUT_OP}}(bits, rex2(base, in_reg0), sink); + modrm_sib_disp32(in_reg0, sink); + sib_noindex(base, sink); + sink.put4(out_stk0.offset as u32); + "#, + ), + ); + + // Like spillSib32, but targeting an FPR rather than a GPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fspillSib32", &formats.unary, 6) + .operands_in(vec![fpr]) + .operands_out(vec![stack_fpr32]) + .clobbers_flags(false) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + let base = stk_base(out_stk0.base); + {{PUT_OP}}(bits, rex2(base, in_reg0), sink); + modrm_sib_disp32(in_reg0, sink); + sib_noindex(base, sink); + sink.put4(out_stk0.offset as u32); + "#, + ), + ); + + // Regspill using RSP-relative addressing. 
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("regspill32", &formats.reg_spill, 6) + .operands_in(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + let dst = StackRef::sp(dst, &func.stack_slots); + let base = stk_base(dst.base); + {{PUT_OP}}(bits, rex2(base, src), sink); + modrm_sib_disp32(src, sink); + sib_noindex(base, sink); + sink.put4(dst.offset as u32); + "#, + ), + ); + + // Like regspill32, but targeting an FPR rather than a GPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fregspill32", &formats.reg_spill, 6) + .operands_in(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + let dst = StackRef::sp(dst, &func.stack_slots); + let base = stk_base(dst.base); + {{PUT_OP}}(bits, rex2(base, src), sink); + modrm_sib_disp32(src, sink); + sib_noindex(base, sink); + sink.put4(dst.offset as u32); + "#, + ), + ); + + // Load recipes. + + { + // Simple loads. + + // A predicate asking if the offset is zero. + let has_no_offset = + InstructionPredicate::new_is_field_equal(&*formats.load, "offset", "0".into()); + + // XX /r load with no offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ld", &formats.load, 1) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_or_offset_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else if needs_offset(in_reg0) { + modrm_disp8(in_reg0, out_reg0, sink); + sink.put1(0); + } else { + modrm_rm(in_reg0, out_reg0, sink); + } + "#, + ), + ); + + // XX /r float load with no offset. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fld", &formats.load, 1) + .operands_in(vec![gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_no_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_or_offset_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else if needs_offset(in_reg0) { + modrm_disp8(in_reg0, out_reg0, sink); + sink.put1(0); + } else { + modrm_rm(in_reg0, out_reg0, sink); + } + "#, + ), + "size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0", + ); + + let has_small_offset = + InstructionPredicate::new_is_signed_int(&*formats.load, "offset", 8, 0); + + // XX /r load with 8-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ldDisp8", &formats.load, 2) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib_disp8(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else { + modrm_disp8(in_reg0, out_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + // XX /r float load with 8-bit offset. 
+ recipes.add_template_inferred( + EncodingRecipeBuilder::new("fldDisp8", &formats.load, 2) + .operands_in(vec![gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_small_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib_disp8(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else { + modrm_disp8(in_reg0, out_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0", + ); + + let has_big_offset = + InstructionPredicate::new_is_signed_int(&*formats.load, "offset", 32, 0); + + // XX /r load with 32-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ldDisp32", &formats.load, 5) + .operands_in(vec![gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_big_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib_disp32(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else { + modrm_disp32(in_reg0, out_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + + // XX /r float load with 32-bit offset. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fldDisp32", &formats.load, 5) + .operands_in(vec![gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_big_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_sib_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + if needs_sib_byte(in_reg0) { + modrm_sib_disp32(out_reg0, sink); + sib_noindex(in_reg0, sink); + } else { + modrm_disp32(in_reg0, out_reg0, sink); + } + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0", + ); + } + + { + // Complex loads. + + // A predicate asking if the offset is zero. + let has_no_offset = + InstructionPredicate::new_is_field_equal(&*formats.load_complex, "offset", "0".into()); + + // XX /r load with index and no offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ldWithIndex", &formats.load_complex, 2) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_no_offset.clone()) + .clobbers_flags(false) + .compute_size("size_plus_maybe_offset_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + // The else branch always inserts an SIB byte. + if needs_offset(in_reg0) { + modrm_sib_disp8(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + sink.put1(0); + } else { + modrm_sib(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + } + "#, + ), + ); + + // XX /r float load with index and no offset. 
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("fldWithIndex", &formats.load_complex, 2) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_no_offset) + .clobbers_flags(false) + .compute_size("size_plus_maybe_offset_for_inreg_0") + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + // The else branch always inserts an SIB byte. + if needs_offset(in_reg0) { + modrm_sib_disp8(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + sink.put1(0); + } else { + modrm_sib(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + } + "#, + ), + ); + + let has_small_offset = + InstructionPredicate::new_is_signed_int(&*formats.load_complex, "offset", 8, 0); + + // XX /r load with index and 8-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ldWithIndexDisp8", &formats.load_complex, 3) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_small_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + modrm_sib_disp8(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + // XX /r float load with 8-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fldWithIndexDisp8", &formats.load_complex, 3) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_small_offset) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + modrm_sib_disp8(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + let offset: i32 = offset.into(); + sink.put1(offset as u8); + "#, + ), + ); + + let has_big_offset = + InstructionPredicate::new_is_signed_int(&*formats.load_complex, "offset", 32, 0); + + // XX /r load with index and 32-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ldWithIndexDisp32", &formats.load_complex, 6) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .inst_predicate(has_big_offset.clone()) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + modrm_sib_disp32(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + + // XX /r float load with index and 32-bit offset. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fldWithIndexDisp32", &formats.load_complex, 6) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![fpr]) + .inst_predicate(has_big_offset) + .clobbers_flags(false) + .emit( + r#" + if !flags.notrap() { + sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]); + } + {{PUT_OP}}(bits, rex3(in_reg0, out_reg0, in_reg1), sink); + modrm_sib_disp32(out_reg0, sink); + sib(0, in_reg1, in_reg0, sink); + let offset: i32 = offset.into(); + sink.put4(offset as u32); + "#, + ), + ); + } + + // Unary fill with SIB and 32-bit displacement. 
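// Illustrative aside (not from the vendored file): the spill recipes above and the
// fill recipes below address their slot as [stack base + disp32] via
// `modrm_sib_disp32` plus `sib_noindex`. In raw x86 terms that is mod=10 with
// r/m=100 (a SIB byte follows) and an index field of 100 meaning "no index":
fn rsp_disp32_operand_sketch(reg: u8, disp: i32) -> Vec<u8> {
    let modrm = 0b10_000_100 | ((reg & 7) << 3); // mod=10 (disp32), r/m=100 (SIB)
    let sib = 0b00_100_100; // scale=1, no index, base=rsp
    let mut bytes = vec![modrm, sib];
    bytes.extend_from_slice(&disp.to_le_bytes()); // 4-byte little-endian displacement
    bytes
}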
+ recipes.add_template_recipe( + EncodingRecipeBuilder::new("fillSib32", &formats.unary, 6) + .operands_in(vec![stack_gpr32]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + let base = stk_base(in_stk0.base); + {{PUT_OP}}(bits, rex2(base, out_reg0), sink); + modrm_sib_disp32(out_reg0, sink); + sib_noindex(base, sink); + sink.put4(in_stk0.offset as u32); + "#, + ), + ); + + // Like fillSib32, but targeting an FPR rather than a GPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ffillSib32", &formats.unary, 6) + .operands_in(vec![stack_fpr32]) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + let base = stk_base(in_stk0.base); + {{PUT_OP}}(bits, rex2(base, out_reg0), sink); + modrm_sib_disp32(out_reg0, sink); + sib_noindex(base, sink); + sink.put4(in_stk0.offset as u32); + "#, + ), + ); + + // Regfill with RSP-relative 32-bit displacement. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("regfill32", &formats.reg_fill, 6) + .operands_in(vec![stack_gpr32]) + .clobbers_flags(false) + .emit( + r#" + let src = StackRef::sp(src, &func.stack_slots); + let base = stk_base(src.base); + {{PUT_OP}}(bits, rex2(base, dst), sink); + modrm_sib_disp32(dst, sink); + sib_noindex(base, sink); + sink.put4(src.offset as u32); + "#, + ), + ); + + // Like regfill32, but targeting an FPR rather than a GPR. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fregfill32", &formats.reg_fill, 6) + .operands_in(vec![stack_fpr32]) + .clobbers_flags(false) + .emit( + r#" + let src = StackRef::sp(src, &func.stack_slots); + let base = stk_base(src.base); + {{PUT_OP}}(bits, rex2(base, dst), sink); + modrm_sib_disp32(dst, sink); + sib_noindex(base, sink); + sink.put4(src.offset as u32); + "#, + ), + ); + + // Call/return. + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("call_id", &formats.call, 4).emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + {{PUT_OP}}(bits, BASE_REX, sink); + // The addend adjusts for the difference between the end of the + // instruction and the beginning of the immediate field. + sink.reloc_external(func.srclocs[inst], + Reloc::X86CallPCRel4, + &func.dfg.ext_funcs[func_ref].name, + -4); + sink.put4(0); + sink.add_call_site(opcode, func.srclocs[inst]); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("call_plt_id", &formats.call, 4).emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + {{PUT_OP}}(bits, BASE_REX, sink); + sink.reloc_external(func.srclocs[inst], + Reloc::X86CallPLTRel4, + &func.dfg.ext_funcs[func_ref].name, + -4); + sink.put4(0); + sink.add_call_site(opcode, func.srclocs[inst]); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("call_r", &formats.call_indirect, 1) + .operands_in(vec![gpr]) + .emit( + r#" + sink.trap(TrapCode::StackOverflow, func.srclocs[inst]); + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + sink.add_call_site(opcode, func.srclocs[inst]); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("ret", &formats.multiary, 0) + .emit("{{PUT_OP}}(bits, BASE_REX, sink);"), + ); + + // Branches. 
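// Illustrative aside (not from the vendored file): the branch recipes below come in
// a short rel8 flavour ("jmpb", "brib", ...) and a near rel32 flavour ("jmpd",
// "brid", ...); the short form only works when the target lies within -128..=127
// bytes of the end of the instruction, which is what the `branch_range` annotations
// are assumed to capture for relaxation. A minimal reach check:
fn fits_rel8_sketch(end_of_branch: i64, target: i64) -> bool {
    (-128..=127).contains(&(target - end_of_branch))
}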
+ + recipes.add_template_recipe( + EncodingRecipeBuilder::new("jmpb", &formats.jump, 1) + .branch_range((1, 8)) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, BASE_REX, sink); + disp1(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("jmpd", &formats.jump, 4) + .branch_range((4, 32)) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, BASE_REX, sink); + disp4(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("brib", &formats.branch_int, 1) + .operands_in(vec![reg_rflags]) + .branch_range((1, 8)) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits | icc2opc(cond), BASE_REX, sink); + disp1(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("brid", &formats.branch_int, 4) + .operands_in(vec![reg_rflags]) + .branch_range((4, 32)) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits | icc2opc(cond), BASE_REX, sink); + disp4(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("brfb", &formats.branch_float, 1) + .operands_in(vec![reg_rflags]) + .branch_range((1, 8)) + .clobbers_flags(false) + .inst_predicate(supported_floatccs_predicate( + &supported_floatccs, + &*formats.branch_float, + )) + .emit( + r#" + {{PUT_OP}}(bits | fcc2opc(cond), BASE_REX, sink); + disp1(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("brfd", &formats.branch_float, 4) + .operands_in(vec![reg_rflags]) + .branch_range((4, 32)) + .clobbers_flags(false) + .inst_predicate(supported_floatccs_predicate( + &supported_floatccs, + &*formats.branch_float, + )) + .emit( + r#" + {{PUT_OP}}(bits | fcc2opc(cond), BASE_REX, sink); + disp4(destination, func, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("indirect_jmp", &formats.indirect_jump, 1) + .operands_in(vec![gpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("jt_entry", &formats.branch_table_entry, 2) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![gpr]) + .clobbers_flags(false) + .inst_predicate(valid_scale(&*formats.branch_table_entry)) + .compute_size("size_plus_maybe_offset_for_inreg_1") + .emit( + r#" + {{PUT_OP}}(bits, rex3(in_reg1, out_reg0, in_reg0), sink); + if needs_offset(in_reg1) { + modrm_sib_disp8(out_reg0, sink); + sib(imm.trailing_zeros() as u8, in_reg0, in_reg1, sink); + sink.put1(0); + } else { + modrm_sib(out_reg0, sink); + sib(imm.trailing_zeros() as u8, in_reg0, in_reg1, sink); + } + "#, + ), + ); + + recipes.add_template_inferred( + EncodingRecipeBuilder::new("vconst", &formats.unary_const, 5) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(0, out_reg0), sink); + modrm_riprel(out_reg0, sink); + const_disp4(constant_handle, func, sink); + "#, + ), + "size_with_inferred_rex_for_outreg0", + ); + + recipes.add_template_inferred( + EncodingRecipeBuilder::new("vconst_optimized", &formats.unary_const, 1) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink); + modrm_rr(out_reg0, out_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_outreg0", + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("jt_base", &formats.branch_table_base, 5) + 
.operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(0, out_reg0), sink);
+ modrm_riprel(out_reg0, sink);
+
+ // No reloc is needed here as the jump table is emitted directly after
+ // the function body.
+ jt_disp4(table, func, sink);
+ "#,
+ ),
+ );
+
+ // Test flags and set a register.
+ //
+ // These setCC instructions only set the low 8 bits, and they can only write ABCD registers
+ // without a REX prefix.
+ //
+ // Other instruction encodings accepting `b1` inputs have the same constraints and only look at
+ // the low 8 bits of the input register.
+
+ let seti = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("seti", &formats.int_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("seti_abcd", &formats.int_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![abcd])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(seti),
+ );
+
+ let setf = recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("setf", &formats.float_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![gpr])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .rex_kind(RecipePrefixKind::AlwaysEmitRex),
+ );
+
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("setf_abcd", &formats.float_cond, 1)
+ .operands_in(vec![reg_rflags])
+ .operands_out(vec![abcd])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | fcc2opc(cond), rex1(out_reg0), sink);
+ modrm_r_bits(out_reg0, bits, sink);
+ "#,
+ ),
+ regs,
+ )
+ .when_prefixed(setf),
+ );
+
+ // Conditional move (a.k.a. integer select)
+ // (maybe-REX.W) 0F 4x modrm(r,r)
+ // 1 byte, modrm(r,r), is after the opcode
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("cmov", &formats.int_select, 1)
+ .operands_in(vec![
+ OperandConstraint::FixedReg(reg_rflags),
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::RegClass(gpr),
+ ])
+ .operands_out(vec![2])
+ .clobbers_flags(false)
+ .emit(
+ r#"
+ {{PUT_OP}}(bits | icc2opc(cond), rex2(in_reg1, in_reg2), sink);
+ modrm_rr(in_reg1, in_reg2, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_cmov"),
+ );
+
+ // Bit scan forwards and reverse
+ recipes.add_template(
+ Template::new(
+ EncodingRecipeBuilder::new("bsf_and_bsr", &formats.unary, 1)
+ .operands_in(vec![gpr])
+ .operands_out(vec![
+ OperandConstraint::RegClass(gpr),
+ OperandConstraint::FixedReg(reg_rflags),
+ ])
+ .emit(
+ r#"
+ {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink);
+ modrm_rr(in_reg0, out_reg0, sink);
+ "#,
+ ),
+ regs,
+ )
+ .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_outreg0"),
+ );
+
+ // Arithmetic with flag I/O.
+
+ // XX /r, MR form. Add two GPR registers and set carry flag.
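// Illustrative aside (not from the vendored file): the rout/rin/rio recipes below
// thread the hardware carry flag through an extra rflags operand; the underlying
// add/adc pair behaves like this pure sketch of a 64-bit add-with-carry step.
fn add_with_carry_sketch(a: u64, b: u64, carry_in: bool) -> (u64, bool) {
    let (t, c1) = a.overflowing_add(b);
    let (sum, c2) = t.overflowing_add(carry_in as u64);
    (sum, c1 || c2) // carry out, as left in CF by adc
}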
+ recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rout", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![ + OperandConstraint::TiedInput(0), + OperandConstraint::FixedReg(reg_rflags), + ]) + .clobbers_flags(true) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ); + + // XX /r, MR form. Add two GPR registers and get carry flag. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rin", &formats.ternary, 1) + .operands_in(vec![ + OperandConstraint::RegClass(gpr), + OperandConstraint::RegClass(gpr), + OperandConstraint::FixedReg(reg_rflags), + ]) + .operands_out(vec![0]) + .clobbers_flags(true) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ); + + // XX /r, MR form. Add two GPR registers with carry flag. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rio", &formats.ternary, 1) + .operands_in(vec![ + OperandConstraint::RegClass(gpr), + OperandConstraint::RegClass(gpr), + OperandConstraint::FixedReg(reg_rflags), + ]) + .operands_out(vec![ + OperandConstraint::TiedInput(0), + OperandConstraint::FixedReg(reg_rflags), + ]) + .clobbers_flags(true) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ); + + // Compare and set flags. + + // XX /r, MR form. Compare two GPR registers and set flags. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rcmp", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![reg_rflags]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ); + + // Same as rcmp, but second operand is the stack pointer. + recipes.add_template_recipe( + EncodingRecipeBuilder::new("rcmp_sp", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![reg_rflags]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, RU::rsp.into()), sink); + modrm_rr(in_reg0, RU::rsp.into(), sink); + "#, + ), + ); + + // XX /r, RM form. Compare two FPR registers and set flags. + recipes.add_template_inferred( + EncodingRecipeBuilder::new("fcmp", &formats.binary, 1) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![reg_rflags]) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + { + let has_small_offset = + InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 8, 0); + + // XX /n, MI form with imm8. 
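// Illustrative aside (not from the vendored file): the imm8 / imm32 compare recipes
// below are "MI" forms, i.e. the 0x83 / 0x81 opcode group in which the ModR/M reg
// field is reused as an opcode extension (/7 selects cmp). A hypothetical rendering
// of cmp r/m64, imm8:
fn cmp_rm64_imm8_sketch(rm: u8, imm: i8) -> [u8; 4] {
    [
        0x48 | ((rm >> 3) & 1),  // REX.W (plus REX.B for r8-r15)
        0x83,                    // group-1 opcode, 8-bit immediate form
        0b11_111_000 | (rm & 7), // mod=11, reg=/7 (cmp), r/m=register
        imm as u8,               // sign-extended by the CPU to 64 bits
    ]
}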
+ recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rcmp_ib", &formats.binary_imm64, 2) + .operands_in(vec![gpr]) + .operands_out(vec![reg_rflags]) + .inst_predicate(has_small_offset) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + + let has_big_offset = + InstructionPredicate::new_is_signed_int(&*formats.binary_imm64, "imm", 32, 0); + + // XX /n, MI form with imm32. + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("rcmp_id", &formats.binary_imm64, 5) + .operands_in(vec![gpr]) + .operands_out(vec![reg_rflags]) + .inst_predicate(has_big_offset) + .emit( + r#" + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + } + + // Test-and-branch. + // + // This recipe represents the macro fusion of a test and a conditional branch. + // This serves two purposes: + // + // 1. Guarantee that the test and branch get scheduled next to each other so + // macro fusion is guaranteed to be possible. + // 2. Hide the status flags from Cranelift which doesn't currently model flags. + // + // The encoding bits affect both the test and the branch instruction: + // + // Bits 0-7 are the Jcc opcode. + // Bits 8-15 control the test instruction which always has opcode byte 0x85. + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("tjccb", &formats.branch, 1 + 2) + .operands_in(vec![gpr]) + .branch_range((3, 8)) + .emit( + r#" + // test r, r. + {{PUT_OP}}((bits & 0xff00) | 0x85, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. + sink.put1(bits as u8); + disp1(destination, func, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("tjccd", &formats.branch, 1 + 6) + .operands_in(vec![gpr]) + .branch_range((7, 32)) + .emit( + r#" + // test r, r. + {{PUT_OP}}((bits & 0xff00) | 0x85, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. + sink.put1(0x0f); + sink.put1(bits as u8); + disp4(destination, func, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + + // 8-bit test-and-branch. + + let t8jccb = recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("t8jccb", &formats.branch, 1 + 2) + .operands_in(vec![gpr]) + .branch_range((3, 8)) + .emit( + r#" + // test8 r, r. + {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. + sink.put1(bits as u8); + disp1(destination, func, sink); + "#, + ), + regs, + ) + .rex_kind(RecipePrefixKind::AlwaysEmitRex), + ); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("t8jccb_abcd", &formats.branch, 1 + 2) + .operands_in(vec![abcd]) + .branch_range((3, 8)) + .emit( + r#" + // test8 r, r. + {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. 
+ sink.put1(bits as u8); + disp1(destination, func, sink); + "#, + ), + regs, + ) + .when_prefixed(t8jccb), + ); + + let t8jccd = recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("t8jccd", &formats.branch, 1 + 6) + .operands_in(vec![gpr]) + .branch_range((7, 32)) + .emit( + r#" + // test8 r, r. + {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. + sink.put1(0x0f); + sink.put1(bits as u8); + disp4(destination, func, sink); + "#, + ), + regs, + ) + .rex_kind(RecipePrefixKind::AlwaysEmitRex), + ); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("t8jccd_abcd", &formats.branch, 1 + 6) + .operands_in(vec![abcd]) + .branch_range((7, 32)) + .emit( + r#" + // test8 r, r. + {{PUT_OP}}((bits & 0xff00) | 0x84, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Jcc instruction. + sink.put1(0x0f); + sink.put1(bits as u8); + disp4(destination, func, sink); + "#, + ), + regs, + ) + .when_prefixed(t8jccd), + ); + + // Worst-case test-and-branch recipe for brz.b1 and brnz.b1 in 32-bit mode. + // The register allocator can't handle a branch instruction with constrained + // operands like the t8jccd_abcd above. This variant can accept the b1 operand in + // any register, but it is larger because it uses a 32-bit test instruction with + // a 0xff immediate. + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("t8jccd_long", &formats.branch, 5 + 6) + .operands_in(vec![gpr]) + .branch_range((11, 32)) + .emit( + r#" + // test32 r, 0xff. + {{PUT_OP}}((bits & 0xff00) | 0xf7, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + sink.put4(0xff); + // Jcc instruction. + sink.put1(0x0f); + sink.put1(bits as u8); + disp4(destination, func, sink); + "#, + ), + ); + + // Comparison that produces a `b1` result in a GPR. + // + // This is a macro of a `cmp` instruction followed by a `setCC` instruction. + // + // TODO This is not a great solution because: + // + // - The cmp+setcc combination is not recognized by the CPU's macro fusion. + // - The 64-bit encoding has issues with REX prefixes. The `cmp` and `setCC` + // instructions may need a REX independently. + // - Modeling CPU flags in the type system would be better. + // + // Since the `setCC` instructions only write an 8-bit register, we use that as + // our `b1` representation: a `b1` value is represented as a GPR where the low 8 + // bits are known to be 0 or 1. The high bits are undefined. + // + // This bandaid macro doesn't support a REX prefix for the final `setCC` + // instruction, so it is limited to the `ABCD` register class for booleans. + // The omission of a `when_prefixed` alternative is deliberate here. + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("icscc", &formats.int_compare, 1 + 3) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![abcd]) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); + modrm_rr(in_reg0, in_reg1, sink); + // `setCC` instruction, no REX. + let setcc = 0x90 | icc2opc(cond); + sink.put1(0x0f); + sink.put1(setcc as u8); + modrm_rr(out_reg0, 0, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ); + + recipes.add_template_inferred( + EncodingRecipeBuilder::new("icscc_fpr", &formats.int_compare, 1) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .emit( + r#" + // Comparison instruction.
+ {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + + { + let is_small_imm = + InstructionPredicate::new_is_signed_int(&*formats.int_compare_imm, "imm", 8, 0); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("icscc_ib", &formats.int_compare_imm, 2 + 3) + .operands_in(vec![gpr]) + .operands_out(vec![abcd]) + .inst_predicate(is_small_imm) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put1(imm as u8); + // `setCC` instruction, no REX. + let setcc = 0x90 | icc2opc(cond); + sink.put1(0x0f); + sink.put1(setcc as u8); + modrm_rr(out_reg0, 0, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + + let is_big_imm = + InstructionPredicate::new_is_signed_int(&*formats.int_compare_imm, "imm", 32, 0); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("icscc_id", &formats.int_compare_imm, 5 + 3) + .operands_in(vec![gpr]) + .operands_out(vec![abcd]) + .inst_predicate(is_big_imm) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + let imm: i64 = imm.into(); + sink.put4(imm as u32); + // `setCC` instruction, no REX. + let setcc = 0x90 | icc2opc(cond); + sink.put1(0x0f); + sink.put1(setcc as u8); + modrm_rr(out_reg0, 0, sink); + "#, + ), + regs, + ) + .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ); + } + + // Build a FloatCompare instruction predicate from the supported condition codes. + // + // Same as `icscc` above, but for floating point comparisons. + // + // The ucomiss/ucomisd instructions set the FLAGS bits ZF/PF/CF like this: + // + // ZPC OSA + // UN 111 000 + // GT 000 000 + // LT 001 000 + // EQ 100 000 + // + // Not all floating point condition codes are supported. + // The omission of a `when_prefixed` alternative is deliberate here. + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fcscc", &formats.float_compare, 1 + 3) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![abcd]) + .inst_predicate(supported_floatccs_predicate( + &supported_floatccs, + &*formats.float_compare, + )) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + // `setCC` instruction, no REX.
+ use crate::ir::condcodes::FloatCC::*; + let setcc = match cond { + Ordered => 0x9b, // EQ|LT|GT => setnp (P=0) + Unordered => 0x9a, // UN => setp (P=1) + OrderedNotEqual => 0x95, // LT|GT => setne (Z=0), + UnorderedOrEqual => 0x94, // UN|EQ => sete (Z=1) + GreaterThan => 0x97, // GT => seta (C=0&Z=0) + GreaterThanOrEqual => 0x93, // GT|EQ => setae (C=0) + UnorderedOrLessThan => 0x92, // UN|LT => setb (C=1) + UnorderedOrLessThanOrEqual => 0x96, // UN|LT|EQ => setbe (Z=1|C=1) + Equal | // EQ + NotEqual | // UN|LT|GT + LessThan | // LT + LessThanOrEqual | // LT|EQ + UnorderedOrGreaterThan | // UN|GT + UnorderedOrGreaterThanOrEqual // UN|GT|EQ + => panic!("{} not supported by fcscc", cond), + }; + sink.put1(0x0f); + sink.put1(setcc); + modrm_rr(out_reg0, 0, sink); + "#, + ), + ); + + { + let supported_floatccs: Vec<Literal> = ["eq", "lt", "le", "uno", "ne", "uge", "ugt", "ord"] + .iter() + .map(|name| Literal::enumerator_for(floatcc, name)) + .collect(); + recipes.add_template_inferred( + EncodingRecipeBuilder::new("pfcmp", &formats.float_compare, 2) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .inst_predicate(supported_floatccs_predicate( + &supported_floatccs[..], + &*formats.float_compare, + )) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + // Add immediate byte indicating what type of comparison. + use crate::ir::condcodes::FloatCC::*; + let imm = match cond { + Equal => 0x00, + LessThan => 0x01, + LessThanOrEqual => 0x02, + Unordered => 0x03, + NotEqual => 0x04, + UnorderedOrGreaterThanOrEqual => 0x05, + UnorderedOrGreaterThan => 0x06, + Ordered => 0x07, + _ => panic!("{} not supported by pfcmp", cond), + }; + sink.put1(imm); + "#, + ), + "size_with_inferred_rex_for_inreg0_inreg1", + ); + } + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("is_zero", &formats.unary, 2 + 2) + .operands_in(vec![gpr]) + .operands_out(vec![abcd]) + .emit( + r#" + // Test instruction. + {{PUT_OP}}(bits, rex2(in_reg0, in_reg0), sink); + modrm_rr(in_reg0, in_reg0, sink); + // Check ZF = 1 flag to see if register holds 0. + sink.put1(0x0f); + sink.put1(0x94); + modrm_rr(out_reg0, 0, sink); + "#, + ), + ); + + recipes.add_template_recipe( + EncodingRecipeBuilder::new("is_invalid", &formats.unary, 2 + 3) + .operands_in(vec![gpr]) + .operands_out(vec![abcd]) + .emit( + r#" + // Comparison instruction. + {{PUT_OP}}(bits, rex1(in_reg0), sink); + modrm_r_bits(in_reg0, bits, sink); + sink.put1(0xff); + // `setCC` instruction, no REX. + use crate::ir::condcodes::IntCC::*; + let setcc = 0x90 | icc2opc(Equal); + sink.put1(0x0f); + sink.put1(setcc as u8); + modrm_rr(out_reg0, 0, sink); + "#, + ), + ); + + recipes.add_recipe( + EncodingRecipeBuilder::new("safepoint", &formats.multiary, 0).emit( + r#" + sink.add_stack_map(args, func, isa); + "#, + ), + ); + + // Both `elf_tls_get_addr` and `macho_tls_get_addr` require all caller-saved registers to be spilled. + // This is currently special cased in `regalloc/spilling.rs` in the `visit_inst` function. + + recipes.add_recipe( + EncodingRecipeBuilder::new("elf_tls_get_addr", &formats.unary_global_value, 16) + // FIXME Correct encoding for non rax registers + .operands_out(vec![reg_rax]) + .emit( + r#" + // output %rax + // clobbers %rdi + + // Those data16 prefixes are necessary to pad to 16 bytes. 
+ + // data16 lea gv@tlsgd(%rip),%rdi + sink.put1(0x66); // data16 + sink.put1(0b01001000); // rex.w + const LEA: u8 = 0x8d; + sink.put1(LEA); // lea + modrm_riprel(0b111/*out_reg0*/, sink); // 0x3d + sink.reloc_external(func.srclocs[inst], + Reloc::ElfX86_64TlsGd, + &func.global_values[global_value].symbol_name(), + -4); + sink.put4(0); + + // data16 data16 callq __tls_get_addr-4 + sink.put1(0x66); // data16 + sink.put1(0x66); // data16 + sink.put1(0b01001000); // rex.w + sink.put1(0xe8); // call + sink.reloc_external(func.srclocs[inst], + Reloc::X86CallPLTRel4, + &ExternalName::LibCall(LibCall::ElfTlsGetAddr), + -4); + sink.put4(0); + "#, + ), + ); + + recipes.add_recipe( + EncodingRecipeBuilder::new("macho_tls_get_addr", &formats.unary_global_value, 9) + // FIXME Correct encoding for non rax registers + .operands_out(vec![reg_rax]) + .emit( + r#" + // output %rax + // clobbers %rdi + + // movq gv@tlv(%rip), %rdi + sink.put1(0x48); // rex + sink.put1(0x8b); // mov + modrm_riprel(0b111/*out_reg0*/, sink); // 0x3d + sink.reloc_external(func.srclocs[inst], + Reloc::MachOX86_64Tlv, + &func.global_values[global_value].symbol_name(), + -4); + sink.put4(0); + + // callq *(%rdi) + sink.put1(0xff); + sink.put1(0x17); + "#, + ), + ); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("evex_reg_vvvv_rm_128", &formats.binary, 1) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![fpr]) + .emit( + r#" + // instruction encoding operands: reg (op1, w), vvvv (op2, r), rm (op3, r) + // this maps to: out_reg0, in_reg0, in_reg1 + let context = EvexContext::Other { length: EvexVectorLength::V128 }; + let masking = EvexMasking::None; + put_evex(bits, out_reg0, in_reg0, in_reg1, context, masking, sink); // params: reg, vvvv, rm + modrm_rr(in_reg1, out_reg0, sink); // params: rm, reg + "#, + ), + regs).rex_kind(RecipePrefixKind::Evex) + ); + + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .emit( + r#" + // instruction encoding operands: reg (op1, w), rm (op2, r) + // this maps to: out_reg0, in_reg0 + let context = EvexContext::Other { length: EvexVectorLength::V128 }; + let masking = EvexMasking::None; + put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm + modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg + "#, + ), + regs).rex_kind(RecipePrefixKind::Evex) + ); + + recipes +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs new file mode 100644 index 0000000000..85a8965f89 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/registers.rs @@ -0,0 +1,43 @@ +use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder}; + +pub(crate) fn define() -> IsaRegs { + let mut regs = IsaRegsBuilder::new(); + + let builder = RegBankBuilder::new("FloatRegs", "xmm") + .units(16) + .track_pressure(true); + let float_regs = regs.add_bank(builder); + + let builder = RegBankBuilder::new("IntRegs", "r") + .units(16) + .names(vec!["rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"]) + .track_pressure(true) + .pinned_reg(15); + let int_regs = regs.add_bank(builder); + + let builder = RegBankBuilder::new("FlagRegs", "") + .units(1) + .names(vec!["rflags"]) + .track_pressure(false); + let flag_reg = regs.add_bank(builder); + + let builder = RegClassBuilder::new_toplevel("GPR", int_regs); + let gpr = regs.add_class(builder); 
+ + let builder = RegClassBuilder::new_toplevel("FPR", float_regs); + let fpr = regs.add_class(builder); + + let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg); + regs.add_class(builder); + + let builder = RegClassBuilder::subclass_of("GPR8", gpr, 0, 8); + let gpr8 = regs.add_class(builder); + + let builder = RegClassBuilder::subclass_of("ABCD", gpr8, 0, 4); + regs.add_class(builder); + + let builder = RegClassBuilder::subclass_of("FPR8", fpr, 0, 8); + regs.add_class(builder); + + regs.build() +} diff --git a/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs b/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs new file mode 100644 index 0000000000..dddd69abb3 --- /dev/null +++ b/third_party/rust/cranelift-codegen-meta/src/isa/x86/settings.rs @@ -0,0 +1,135 @@ +use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder}; + +pub(crate) fn define(shared: &SettingGroup) -> SettingGroup { + let mut settings = SettingGroupBuilder::new("x86"); + + // CPUID.01H:ECX + let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false); + let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false); + let has_sse41 = settings.add_bool("has_sse41", "SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]", false); + let has_sse42 = settings.add_bool("has_sse42", "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]", false); + let has_avx = settings.add_bool("has_avx", "AVX: CPUID.01H:ECX.AVX[bit 28]", false); + let has_avx2 = settings.add_bool("has_avx2", "AVX2: CPUID.07H:EBX.AVX2[bit 5]", false); + let has_avx512dq = settings.add_bool( + "has_avx512dq", + "AVX512DQ: CPUID.07H:EBX.AVX512DQ[bit 17]", + false, + ); + let has_avx512vl = settings.add_bool( + "has_avx512vl", + "AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]", + false, + ); + let has_avx512f = settings.add_bool( + "has_avx512f", + "AVX512F: CPUID.07H:EBX.AVX512F[bit 16]", + false, + ); + let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false); + + // CPUID.(EAX=07H, ECX=0H):EBX + let has_bmi1 = settings.add_bool( + "has_bmi1", + "BMI1: CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]", + false, + ); + let has_bmi2 = settings.add_bool( + "has_bmi2", + "BMI2: CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]", + false, + ); + + // CPUID.EAX=80000001H:ECX + let has_lzcnt = settings.add_bool( + "has_lzcnt", + "LZCNT: CPUID.EAX=80000001H:ECX.LZCNT[bit 5]", + false, + ); + + let shared_enable_simd = shared.get_bool("enable_simd"); + + settings.add_predicate("use_ssse3", predicate!(has_ssse3)); + settings.add_predicate("use_sse41", predicate!(has_sse41)); + settings.add_predicate("use_sse42", predicate!(has_sse41 && has_sse42)); + + settings.add_predicate( + "use_ssse3_simd", + predicate!(shared_enable_simd && has_ssse3), + ); + settings.add_predicate( + "use_sse41_simd", + predicate!(shared_enable_simd && has_sse41), + ); + settings.add_predicate( + "use_sse42_simd", + predicate!(shared_enable_simd && has_sse41 && has_sse42), + ); + + settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx)); + settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2)); + settings.add_predicate( + "use_avx512dq_simd", + predicate!(shared_enable_simd && has_avx512dq), + ); + settings.add_predicate( + "use_avx512vl_simd", + predicate!(shared_enable_simd && has_avx512vl), + ); + settings.add_predicate( + "use_avx512f_simd", + predicate!(shared_enable_simd && has_avx512f), + ); + + settings.add_predicate("use_popcnt", 
predicate!(has_popcnt && has_sse42)); + settings.add_predicate("use_bmi1", predicate!(has_bmi1)); + settings.add_predicate("use_lzcnt", predicate!(has_lzcnt)); + + // Some shared boolean values are used in x86 instruction predicates, so we need to group them + // in the same TargetIsa, for compatibility with code generated by meta-python. + // TODO Once all the meta generation code has been migrated from Python to Rust, we can put them + // back in the shared SettingGroup, and use them in x86 instruction predicates. + + let is_pic = shared.get_bool("is_pic"); + let emit_all_ones_funcaddrs = shared.get_bool("emit_all_ones_funcaddrs"); + settings.add_predicate("is_pic", predicate!(is_pic)); + settings.add_predicate("not_is_pic", predicate!(!is_pic)); + settings.add_predicate( + "all_ones_funcaddrs_and_not_is_pic", + predicate!(emit_all_ones_funcaddrs && !is_pic), + ); + settings.add_predicate( + "not_all_ones_funcaddrs_and_not_is_pic", + predicate!(!emit_all_ones_funcaddrs && !is_pic), + ); + + // Presets corresponding to x86 CPUs. + + settings.add_preset("baseline", preset!()); + let nehalem = settings.add_preset( + "nehalem", + preset!(has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt), + ); + let haswell = settings.add_preset( + "haswell", + preset!(nehalem && has_bmi1 && has_bmi2 && has_lzcnt), + ); + let broadwell = settings.add_preset("broadwell", preset!(haswell)); + let skylake = settings.add_preset("skylake", preset!(broadwell)); + let cannonlake = settings.add_preset("cannonlake", preset!(skylake)); + settings.add_preset("icelake", preset!(cannonlake)); + settings.add_preset( + "znver1", + preset!( + has_sse3 + && has_ssse3 + && has_sse41 + && has_sse42 + && has_popcnt + && has_bmi1 + && has_bmi2 + && has_lzcnt + ), + ); + + settings.build() +}
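Aside (not part of the vendored file): the preset chain above simply unions boolean flags, so `broadwell`, `skylake`, `cannonlake`, and `icelake` enable exactly the set accumulated by `nehalem` and `haswell`. A minimal standalone Rust sketch of that composition, using the flag names from this settings.rs but a hypothetical `preset` helper in place of the generated preset machinery:

use std::collections::HashSet;

// Hypothetical stand-in for the generated preset handling: a preset is the
// union of the flags it lists plus everything inherited from earlier presets.
fn preset(parents: &[&HashSet<&'static str>], extra: &[&'static str]) -> HashSet<&'static str> {
    let mut flags: HashSet<&'static str> = extra.iter().copied().collect();
    for parent in parents {
        flags.extend(parent.iter().copied());
    }
    flags
}

fn main() {
    let nehalem = preset(
        &[],
        &["has_sse3", "has_ssse3", "has_sse41", "has_sse42", "has_popcnt"],
    );
    let haswell = preset(&[&nehalem], &["has_bmi1", "has_bmi2", "has_lzcnt"]);
    let broadwell = preset(&[&haswell], &[]);
    let skylake = preset(&[&broadwell], &[]);

    // The later presets in this file add no flags of their own; they only chain.
    assert_eq!(haswell, skylake);
    assert!(skylake.contains("has_popcnt")); // inherited via nehalem
    println!("skylake enables {} feature flags", skylake.len());
}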