Diffstat (limited to 'third_party/rust/cranelift-codegen/src/isa')
59 files changed, 49674 insertions, 0 deletions
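For orientation before the diff body: the register/stack assignment rule that `compute_arg_locs` in aarch64/abi.rs implements for the regular (non-Baldrdash) calling convention can be summarized by the small standalone sketch below. The names `ArgClass`, `ArgLoc`, and `assign_args` are hypothetical and simplified, not part of the crate; the real code works on `ir::AbiParam`/`ABIArg` and additionally handles the Baldrdash register and TLS slots, argument extension and purpose, the shared 16-register cap, and the 128 MB stack-area limit (`STACK_ARG_RET_SIZE_LIMIT`).

#[derive(Clone, Copy, Debug, PartialEq)]
enum ArgClass {
    Int, // candidates for x0..x7
    Vec, // candidates for v0..v7
}

#[derive(Debug, PartialEq)]
enum ArgLoc {
    IntReg(u8), // xN
    VecReg(u8), // vN
    Stack(i64), // byte offset into the stack-arg area
}

// Assign locations for (class, size-in-bytes) parameters: eight registers per
// class, then naturally aligned stack slots of at least 8 bytes, with the
// whole stack-arg area rounded up to 16-byte alignment at the end.
fn assign_args(params: &[(ArgClass, u64)]) -> (Vec<ArgLoc>, i64) {
    let (mut next_x, mut next_v, mut next_stack) = (0u8, 0u8, 0u64);
    let mut locs = Vec::new();
    for &(class, size) in params {
        match class {
            ArgClass::Int if next_x < 8 => {
                locs.push(ArgLoc::IntReg(next_x));
                next_x += 1;
            }
            ArgClass::Vec if next_v < 8 => {
                locs.push(ArgLoc::VecReg(next_v));
                next_v += 1;
            }
            _ => {
                // Minimum 8-byte slot, rounded to a power of two so the
                // "align to own size" step below is well defined.
                let size = size.max(8).next_power_of_two();
                next_stack = (next_stack + size - 1) & !(size - 1);
                locs.push(ArgLoc::Stack(next_stack as i64));
                next_stack += size;
            }
        }
    }
    // Keep 16-byte alignment for the whole stack-arg area.
    (locs, ((next_stack + 15) & !15) as i64)
}

fn main() {
    // Nine I64 arguments plus one F64: the first eight integers take x0..x7,
    // the ninth spills to an 8-byte stack slot, and the F64 still fits in v0.
    let mut params = vec![(ArgClass::Int, 8u64); 9];
    params.push((ArgClass::Vec, 8));
    let (locs, stack_size) = assign_args(&params);
    assert_eq!(locs[8], ArgLoc::Stack(0));
    assert_eq!(locs[9], ArgLoc::VecReg(0));
    assert_eq!(stack_size, 16); // 8 bytes of stack args, rounded up to 16
    println!("{:?}, stack-arg area = {} bytes", locs, stack_size);
}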
diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs new file mode 100644 index 0000000000..dfb7db4dbf --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/abi.rs @@ -0,0 +1,850 @@ +//! Implementation of a standard AArch64 ABI. + +use crate::ir; +use crate::ir::types; +use crate::ir::types::*; +use crate::ir::MemFlags; +use crate::isa; +use crate::isa::aarch64::{inst::EmitState, inst::*}; +use crate::machinst::*; +use crate::settings; +use crate::{CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use regalloc::{RealReg, Reg, RegClass, Set, Writable}; +use smallvec::SmallVec; + +// We use a generic implementation that factors out AArch64 and x64 ABI commonalities, because +// these ABIs are very similar. + +/// Support for the AArch64 ABI from the callee side (within a function body). +pub(crate) type AArch64ABICallee = ABICalleeImpl<AArch64MachineDeps>; + +/// Support for the AArch64 ABI from the caller side (at a callsite). +pub(crate) type AArch64ABICaller = ABICallerImpl<AArch64MachineDeps>; + +// Spidermonkey specific ABI convention. + +/// This is SpiderMonkey's `WasmTableCallSigReg`. +static BALDRDASH_SIG_REG: u8 = 10; + +/// This is SpiderMonkey's `WasmTlsReg`. +static BALDRDASH_TLS_REG: u8 = 23; + +/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0; +/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8; + +// These two lists represent the registers the JIT may *not* use at any point in generated code. +// +// So these are callee-preserved from the JIT's point of view, and every register not in this list +// has to be caller-preserved by definition. +// +// Keep these lists in sync with the NonAllocatableMask set in Spidermonkey's +// Architecture-arm64.cpp. + +// Indexed by physical register number. +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_GPR: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ true /* x16 / ip1 */, true /* x17 / ip2 */, true /* x18 / TLS */, false, + /* 20 = */ false, false, false, false, + /* 24 = */ false, false, false, false, + // There should be 28, the pseudo stack pointer in this list, however the wasm stubs trash it + // gladly right now. + /* 28 = */ false, false, true /* x30 = FP */, false /* x31 = SP */ +]; + +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_FPU: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ false, false, false, false, false, false, false, false, + /* 24 = */ false, false, false, false, false, false, false, true /* v31 / d31 */ +]; + +/// This is the limit for the size of argument and return-value areas on the +/// stack. We place a reasonable limit here to avoid integer overflow issues +/// with 32-bit arithmetic: for now, 128 MB. +static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024; + +/// Try to fill a Baldrdash register, returning it if it was found. +fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Option<ABIArg> { + if call_conv.extends_baldrdash() { + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext => { + // This is SpiderMonkey's `WasmTlsReg`. 
+ Some(ABIArg::Reg( + xreg(BALDRDASH_TLS_REG).to_real_reg(), + ir::types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::SignatureId => { + // This is SpiderMonkey's `WasmTableCallSigReg`. + Some(ABIArg::Reg( + xreg(BALDRDASH_SIG_REG).to_real_reg(), + ir::types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::CalleeTLS => { + // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLEE_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + &ir::ArgumentPurpose::CallerTLS => { + // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLER_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + _ => None, + } + } else { + None + } +} + +impl Into<AMode> for StackAMode { + fn into(self) -> AMode { + match self { + StackAMode::FPOffset(off, ty) => AMode::FPOffset(off, ty), + StackAMode::NominalSPOffset(off, ty) => AMode::NominalSPOffset(off, ty), + StackAMode::SPOffset(off, ty) => AMode::SPOffset(off, ty), + } + } +} + +// Returns the size of stack space needed to store the +// `int_reg` and `vec_reg`. +fn saved_reg_stack_size( + int_reg: &[Writable<RealReg>], + vec_reg: &[Writable<RealReg>], +) -> (usize, usize) { + // Round up to multiple of 2, to keep 16-byte stack alignment. + let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8; + let vec_save_bytes = vec_reg.len() * 16; + (int_save_bytes, vec_save_bytes) +} + +/// AArch64-specific ABI behavior. This struct just serves as an implementation +/// point for the trait; it is never actually instantiated. +pub(crate) struct AArch64MachineDeps; + +impl ABIMachineSpec for AArch64MachineDeps { + type I = Inst; + + fn word_bits() -> u32 { + 64 + } + + /// Return required stack alignment in bytes. + fn stack_align(_call_conv: isa::CallConv) -> u32 { + 16 + } + + fn compute_arg_locs( + call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> { + let is_baldrdash = call_conv.extends_baldrdash(); + let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020; + + // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4. + let mut next_xreg = 0; + let mut next_vreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + + if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls { + // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and + // caller TLS-register values, respectively. + next_stack = 16; + } + + // Note on return values: on the regular non-baldrdash ABI, we may return values in 8 + // registers for V128 and I64 registers independently of the number of register values + // returned in the other class. That is, we can return values in up to 8 integer and 8 + // vector registers at once. + // In Baldrdash, we can only use one register for return value for all the register + // classes. That is, we can't return values in both one integer and one vector register; + // only one return value may be in a register. 
+ + let (max_per_class_reg_vals, mut remaining_reg_vals) = match (args_or_rets, is_baldrdash) { + (ArgsOrRets::Args, _) => (8, 16), // x0-x7 and v0-v7 + (ArgsOrRets::Rets, false) => (8, 16), // x0-x7 and v0-v7 + (ArgsOrRets::Rets, true) => (1, 1), // x0 or v0, but not both + }; + + for i in 0..params.len() { + // Process returns backward, according to the SpiderMonkey ABI (which we + // adopt internally if `is_baldrdash` is set). + let param = match (args_or_rets, is_baldrdash) { + (ArgsOrRets::Args, _) => ¶ms[i], + (ArgsOrRets::Rets, false) => ¶ms[i], + (ArgsOrRets::Rets, true) => ¶ms[params.len() - 1 - i], + }; + + // Validate "purpose". + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::StackLimit + | &ir::ArgumentPurpose::SignatureId + | &ir::ArgumentPurpose::CallerTLS + | &ir::ArgumentPurpose::CalleeTLS => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + + assert!( + legal_type_for_machine(param.value_type), + "Invalid type for AArch64: {:?}", + param.value_type + ); + let rc = Inst::rc_for_type(param.value_type).unwrap(); + + let next_reg = match rc { + RegClass::I64 => &mut next_xreg, + RegClass::V128 => &mut next_vreg, + _ => panic!("Invalid register class: {:?}", rc), + }; + + if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { + assert!(rc == RegClass::I64); + ret.push(param); + } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 { + let reg = match rc { + RegClass::I64 => xreg(*next_reg), + RegClass::V128 => vreg(*next_reg), + _ => unreachable!(), + }; + ret.push(ABIArg::Reg( + reg.to_real_reg(), + param.value_type, + param.extension, + param.purpose, + )); + *next_reg += 1; + remaining_reg_vals -= 1; + } else { + // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte + // stack alignment happens separately after all args.) + let size = (ty_bits(param.value_type) / 8) as u64; + let size = std::cmp::max(size, 8); + // Align. + debug_assert!(size.is_power_of_two()); + next_stack = (next_stack + size - 1) & !(size - 1); + ret.push(ABIArg::Stack( + next_stack as i64, + param.value_type, + param.extension, + param.purpose, + )); + next_stack += size; + } + } + + if args_or_rets == ArgsOrRets::Rets && is_baldrdash { + ret.reverse(); + } + + let extra_arg = if add_ret_area_ptr { + debug_assert!(args_or_rets == ArgsOrRets::Args); + if next_xreg < max_per_class_reg_vals && remaining_reg_vals > 0 { + ret.push(ABIArg::Reg( + xreg(next_xreg).to_real_reg(), + I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + } else { + ret.push(ABIArg::Stack( + next_stack as i64, + I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + next_stack += 8; + } + Some(ret.len() - 1) + } else { + None + }; + + next_stack = (next_stack + 15) & !15; + + // To avoid overflow issues, limit the arg/return size to something + // reasonable -- here, 128 MB. + if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((ret, next_stack as i64, extra_arg)) + } + + fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 { + if call_conv.extends_baldrdash() { + let num_words = flags.baldrdash_prologue_words() as i64; + debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words"); + debug_assert_eq!(num_words % 2, 0, "stack must be 16-aligned"); + num_words * 8 + } else { + 16 // frame pointer + return address. 
+ } + } + + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Inst { + Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted()) + } + + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted()) + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_move(to_reg, from_reg, ty) + } + + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Inst { + assert!(from_bits < to_bits); + Inst::Extend { + rd: to_reg, + rn: from_reg, + signed, + from_bits, + to_bits, + } + } + + fn gen_ret() -> Inst { + Inst::Ret + } + + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> { + let imm = imm as u64; + let mut insts = SmallVec::new(); + if let Some(imm12) = Imm12::maybe_from_u64(imm) { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: into_reg, + rn: from_reg, + imm12, + }); + } else { + let scratch2 = writable_tmp2_reg(); + assert_ne!(scratch2.to_reg(), from_reg); + insts.extend(Inst::load_constant(scratch2, imm.into())); + insts.push(Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: into_reg, + rn: from_reg, + rm: scratch2.to_reg(), + extendop: ExtendOp::UXTX, + }); + } + insts + } + + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> { + let mut insts = SmallVec::new(); + insts.push(Inst::AluRRRExtend { + alu_op: ALUOp::SubS64, + rd: writable_zero_reg(), + rn: stack_reg(), + rm: limit_reg, + extendop: ExtendOp::UXTX, + }); + insts.push(Inst::TrapIf { + trap_code: ir::TrapCode::StackOverflow, + // Here `Lo` == "less than" when interpreting the two + // operands as unsigned integers. + kind: CondBrKind::Cond(Cond::Lo), + }); + insts + } + + fn gen_epilogue_placeholder() -> Inst { + Inst::EpiloguePlaceholder + } + + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst { + let mem = mem.into(); + Inst::LoadAddr { rd: into_reg, mem } + } + + fn get_stacklimit_reg() -> Reg { + spilltmp_reg() + } + + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64, ty); + Inst::gen_load(into_reg, mem, ty, MemFlags::trusted()) + } + + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64, ty); + Inst::gen_store(mem, from_reg, ty, MemFlags::trusted()) + } + + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> { + if amount == 0 { + return SmallVec::new(); + } + + let (amount, is_sub) = if amount > 0 { + (amount as u64, false) + } else { + (-amount as u64, true) + }; + + let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 }; + + let mut ret = SmallVec::new(); + if let Some(imm12) = Imm12::maybe_from_u64(amount) { + let adj_inst = Inst::AluRRImm12 { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }; + ret.push(adj_inst); + } else { + let tmp = writable_spilltmp_reg(); + let const_inst = Inst::load_constant(tmp, amount); + let adj_inst = Inst::AluRRRExtend { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, + }; + ret.extend(const_inst); + ret.push(adj_inst); + } + ret + } + + fn gen_nominal_sp_adj(offset: i32) -> Inst { + Inst::VirtualSPOffsetAdj { + offset: offset as i64, + } + } + + fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> { + let mut insts = SmallVec::new(); + // stp 
fp (x29), lr (x30), [sp, #-16]! + insts.push(Inst::StoreP64 { + rt: fp_reg(), + rt2: link_reg(), + mem: PairAMode::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + // mov fp (x29), sp. This uses the ADDI rd, rs, 0 form of `MOV` because + // the usual encoding (`ORR`) does not work with SP. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_fp_reg(), + rn: stack_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + insts + } + + fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> { + let mut insts = SmallVec::new(); + + // MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. + // MOV to SP is an alias of ADD. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_stack_reg(), + rn: fp_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + insts.push(Inst::LoadP64 { + rt: writable_fp_reg(), + rt2: writable_link_reg(), + mem: PairAMode::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + + insts + } + + // Returns stack bytes used as well as instructions. Does not adjust + // nominal SP offset; abi_impl generic code will do that. + fn gen_clobber_save( + call_conv: isa::CallConv, + _: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> (u64, SmallVec<[Inst; 16]>) { + let mut insts = SmallVec::new(); + let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers); + + let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec); + let total_save_bytes = (vec_save_bytes + int_save_bytes) as i32; + insts.extend(Self::gen_sp_reg_adjust( + -(total_save_bytes + fixed_frame_storage_size as i32), + )); + + for (i, reg_pair) in clobbered_int.chunks(2).enumerate() { + let (r1, r2) = if reg_pair.len() == 2 { + // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg + (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg()) + } else { + (reg_pair[0].to_reg().to_reg(), zero_reg()) + }; + + debug_assert!(r1.get_class() == RegClass::I64); + debug_assert!(r2.get_class() == RegClass::I64); + + // stp r1, r2, [sp, #(i * #16)] + insts.push(Inst::StoreP64 { + rt: r1, + rt2: r2, + mem: PairAMode::SignedOffset( + stack_reg(), + SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + } + + let vec_offset = int_save_bytes; + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuStore128 { + rd: reg.to_reg().to_reg(), + mem: AMode::Unscaled( + stack_reg(), + SImm9::maybe_from_i64((vec_offset + (i * 16)) as i64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + } + + (total_save_bytes as u64, insts) + } + + fn gen_clobber_restore( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + _fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> SmallVec<[Inst; 16]> { + let mut insts = SmallVec::new(); + let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers); + + let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec); + for (i, reg_pair) in clobbered_int.chunks(2).enumerate() { + let (r1, r2) = if reg_pair.len() == 2 { + ( + reg_pair[0].map(|r| r.to_reg()), + reg_pair[1].map(|r| r.to_reg()), + ) + } else { + (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg()) + 
}; + + debug_assert!(r1.to_reg().get_class() == RegClass::I64); + debug_assert!(r2.to_reg().get_class() == RegClass::I64); + + // ldp r1, r2, [sp, #(i * 16)] + insts.push(Inst::LoadP64 { + rt: r1, + rt2: r2, + mem: PairAMode::SignedOffset( + stack_reg(), + SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + } + + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuLoad128 { + rd: Writable::from_reg(reg.to_reg().to_reg()), + mem: AMode::Unscaled( + stack_reg(), + SImm9::maybe_from_i64(((i * 16) + int_save_bytes) as i64).unwrap(), + ), + flags: MemFlags::trusted(), + }); + } + + // For non-baldrdash calling conventions, the frame pointer + // will be moved into the stack pointer in the epilogue, so we + // can skip restoring the stack pointer value with this `add`. + if call_conv.extends_baldrdash() { + let total_save_bytes = (int_save_bytes + vec_save_bytes) as i32; + insts.extend(Self::gen_sp_reg_adjust(total_save_bytes)); + } + + // If this is Baldrdash-2020, restore the callee (i.e., our) TLS + // register. We may have allocated it for something else and clobbered + // it, but the ABI expects us to leave the TLS register unchanged. + if call_conv == isa::CallConv::Baldrdash2020 { + let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags); + insts.push(Inst::gen_load( + writable_xreg(BALDRDASH_TLS_REG), + AMode::UnsignedOffset(fp_reg(), UImm12Scaled::maybe_from_i64(off, I64).unwrap()), + I64, + MemFlags::trusted(), + )); + } + + insts + } + + fn gen_call( + dest: &CallDest, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: ir::Opcode, + tmp: Writable<Reg>, + callee_conv: isa::CallConv, + caller_conv: isa::CallConv, + ) -> SmallVec<[(InstIsSafepoint, Inst); 2]> { + let mut insts = SmallVec::new(); + match &dest { + &CallDest::ExtName(ref name, RelocDistance::Near) => insts.push(( + InstIsSafepoint::Yes, + Inst::Call { + info: Box::new(CallInfo { + dest: name.clone(), + uses, + defs, + opcode, + caller_callconv: caller_conv, + callee_callconv: callee_conv, + }), + }, + )), + &CallDest::ExtName(ref name, RelocDistance::Far) => { + insts.push(( + InstIsSafepoint::No, + Inst::LoadExtName { + rd: tmp, + name: Box::new(name.clone()), + offset: 0, + }, + )); + insts.push(( + InstIsSafepoint::Yes, + Inst::CallInd { + info: Box::new(CallIndInfo { + rn: tmp.to_reg(), + uses, + defs, + opcode, + caller_callconv: caller_conv, + callee_callconv: callee_conv, + }), + }, + )); + } + &CallDest::Reg(reg) => insts.push(( + InstIsSafepoint::Yes, + Inst::CallInd { + info: Box::new(CallIndInfo { + rn: *reg, + uses, + defs, + opcode, + caller_callconv: caller_conv, + callee_callconv: callee_conv, + }), + }, + )), + } + + insts + } + + fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 { + // We allocate in terms of 8-byte slots. + match (rc, ty) { + (RegClass::I64, _) => 1, + (RegClass::V128, F32) | (RegClass::V128, F64) => 1, + (RegClass::V128, _) => 2, + _ => panic!("Unexpected register class!"), + } + } + + /// Get the current virtual-SP offset from an instruction-emission state. + fn get_virtual_sp_offset_from_state(s: &EmitState) -> i64 { + s.virtual_sp_offset + } + + /// Get the nominal-SP-to-FP offset from an instruction-emission state. 
+ fn get_nominal_sp_to_fp(s: &EmitState) -> i64 { + s.nominal_sp_to_fp + } + + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> { + let mut caller_saved = Vec::new(); + for i in 0..29 { + let x = writable_xreg(i); + if is_reg_clobbered_by_call(call_conv_of_callee, x.to_reg().to_real_reg()) { + caller_saved.push(x); + } + } + for i in 0..32 { + let v = writable_vreg(i); + if is_reg_clobbered_by_call(call_conv_of_callee, v.to_reg().to_real_reg()) { + caller_saved.push(v); + } + } + caller_saved + } +} + +/// Is this type supposed to be seen on this machine? E.g. references of the +/// wrong width are invalid. +fn legal_type_for_machine(ty: Type) -> bool { + match ty { + R32 => false, + _ => true, + } +} + +/// Is the given register saved in the prologue if clobbered, i.e., is it a +/// callee-save? +fn is_reg_saved_in_prologue(call_conv: isa::CallConv, r: RealReg) -> bool { + if call_conv.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + return BALDRDASH_JIT_CALLEE_SAVED_GPR[enc]; + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + return BALDRDASH_JIT_CALLEE_SAVED_FPU[enc]; + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x19 - x28 inclusive are callee-saves. + r.get_hw_encoding() >= 19 && r.get_hw_encoding() <= 28 + } + RegClass::V128 => { + // v8 - v15 inclusive are callee-saves. + r.get_hw_encoding() >= 8 && r.get_hw_encoding() <= 15 + } + _ => panic!("Unexpected RegClass"), + } +} + +/// Return the set of all integer and vector registers that must be saved in the +/// prologue and restored in the epilogue, given the set of all registers +/// written by the function's body. +fn get_regs_saved_in_prologue( + call_conv: isa::CallConv, + regs: &Set<Writable<RealReg>>, +) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) { + let mut int_saves = vec![]; + let mut vec_saves = vec![]; + for ® in regs.iter() { + if is_reg_saved_in_prologue(call_conv, reg.to_reg()) { + match reg.to_reg().get_class() { + RegClass::I64 => int_saves.push(reg), + RegClass::V128 => vec_saves.push(reg), + _ => panic!("Unexpected RegClass"), + } + } + } + // Sort registers for deterministic code output. We can do an unstable sort because the + // registers will be unique (there are no dups). + int_saves.sort_unstable_by_key(|r| r.to_reg().get_index()); + vec_saves.sort_unstable_by_key(|r| r.to_reg().get_index()); + (int_saves, vec_saves) +} + +fn is_reg_clobbered_by_call(call_conv_of_callee: isa::CallConv, r: RealReg) -> bool { + if call_conv_of_callee.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x0 - x17 inclusive are caller-saves. + r.get_hw_encoding() <= 17 + } + RegClass::V128 => { + // v0 - v7 inclusive and v16 - v31 inclusive are caller-saves. The + // upper 64 bits of v8 - v15 inclusive are also caller-saves. 
+ // However, because we cannot currently represent partial registers + // to regalloc.rs, we indicate here that every vector register is + // caller-save. Because this function is used at *callsites*, + // approximating in this direction (save more than necessary) is + // conservative and thus safe. + // + // Note that we set the 'not included in clobber set' flag in the + // regalloc.rs API when a call instruction's callee has the same ABI + // as the caller (the current function body); this is safe (anything + // clobbered by callee can be clobbered by caller as well) and + // avoids unnecessary saves of v8-v15 in the prologue even though we + // include them as defs here. + true + } + _ => panic!("Unexpected RegClass"), + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs new file mode 100644 index 0000000000..7bd181c86b --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/args.rs @@ -0,0 +1,728 @@ +//! AArch64 ISA definitions: instruction arguments. + +// Some variants are never constructed, but we still want them as options in the future. +#![allow(dead_code)] + +use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8}; +use crate::ir::Type; +use crate::isa::aarch64::inst::*; +use crate::machinst::{ty_bits, MachLabel}; + +use regalloc::{PrettyPrint, RealRegUniverse, Reg, Writable}; + +use core::convert::Into; +use std::string::String; + +//============================================================================= +// Instruction sub-components: shift and extend descriptors + +/// A shift operator for a register or immediate. +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum ShiftOp { + LSL = 0b00, + LSR = 0b01, + ASR = 0b10, + ROR = 0b11, +} + +impl ShiftOp { + /// Get the encoding of this shift op. + pub fn bits(self) -> u8 { + self as u8 + } +} + +/// A shift operator amount. +#[derive(Clone, Copy, Debug)] +pub struct ShiftOpShiftImm(u8); + +impl ShiftOpShiftImm { + /// Maximum shift for shifted-register operands. + pub const MAX_SHIFT: u64 = 63; + + /// Create a new shiftop shift amount, if possible. + pub fn maybe_from_shift(shift: u64) -> Option<ShiftOpShiftImm> { + if shift <= Self::MAX_SHIFT { + Some(ShiftOpShiftImm(shift as u8)) + } else { + None + } + } + + /// Return the shift amount. + pub fn value(self) -> u8 { + self.0 + } + + /// Mask down to a given number of bits. + pub fn mask(self, bits: u8) -> ShiftOpShiftImm { + ShiftOpShiftImm(self.0 & (bits - 1)) + } +} + +/// A shift operator with an amount, guaranteed to be within range. +#[derive(Clone, Debug)] +pub struct ShiftOpAndAmt { + op: ShiftOp, + shift: ShiftOpShiftImm, +} + +impl ShiftOpAndAmt { + pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt { + ShiftOpAndAmt { op, shift } + } + + /// Get the shift op. + pub fn op(&self) -> ShiftOp { + self.op + } + + /// Get the shift amount. + pub fn amt(&self) -> ShiftOpShiftImm { + self.shift + } +} + +/// An extend operator for a register. +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum ExtendOp { + UXTB = 0b000, + UXTH = 0b001, + UXTW = 0b010, + UXTX = 0b011, + SXTB = 0b100, + SXTH = 0b101, + SXTW = 0b110, + SXTX = 0b111, +} + +impl ExtendOp { + /// Encoding of this op. 
+ pub fn bits(self) -> u8 { + self as u8 + } +} + +//============================================================================= +// Instruction sub-components (memory addresses): definitions + +/// A reference to some memory address. +#[derive(Clone, Debug)] +pub enum MemLabel { + /// An address in the code, a constant pool or jumptable, with relative + /// offset from this instruction. This form must be used at emission time; + /// see `memlabel_finalize()` for how other forms are lowered to this one. + PCRel(i32), +} + +/// An addressing mode specified for a load/store operation. +#[derive(Clone, Debug)] +pub enum AMode { + // + // Real ARM64 addressing modes: + // + /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. + PostIndexed(Writable<Reg>, SImm9), + /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. + PreIndexed(Writable<Reg>, SImm9), + + // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to + // what the ISA calls the "register offset" addressing mode. We split out + // several options here for more ergonomic codegen. + /// Register plus register offset. + RegReg(Reg, Reg), + + /// Register plus register offset, scaled by type's size. + RegScaled(Reg, Reg, Type), + + /// Register plus register offset, scaled by type's size, with index sign- or zero-extended + /// first. + RegScaledExtended(Reg, Reg, Type, ExtendOp), + + /// Register plus register offset, with index sign- or zero-extended first. + RegExtended(Reg, Reg, ExtendOp), + + /// Unscaled signed 9-bit immediate offset from reg. + Unscaled(Reg, SImm9), + + /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg. + UnsignedOffset(Reg, UImm12Scaled), + + // + // virtual addressing modes that are lowered at emission time: + // + /// Reference to a "label": e.g., a symbol. + Label(MemLabel), + + /// Arbitrary offset from a register. Converted to generation of large + /// offsets with multiple instructions as necessary during code emission. + RegOffset(Reg, i64, Type), + + /// Offset from the stack pointer. + SPOffset(i64, Type), + + /// Offset from the frame pointer. + FPOffset(i64, Type), + + /// Offset from the "nominal stack pointer", which is where the real SP is + /// just after stack and spill slots are allocated in the function prologue. + /// At emission time, this is converted to `SPOffset` with a fixup added to + /// the offset constant. The fixup is a running value that is tracked as + /// emission iterates through instructions in linear order, and can be + /// adjusted up and down with [Inst::VirtualSPOffsetAdj]. + /// + /// The standard ABI is in charge of handling this (by emitting the + /// adjustment meta-instructions). It maintains the invariant that "nominal + /// SP" is where the actual SP is after the function prologue and before + /// clobber pushes. See the diagram in the documentation for + /// [crate::isa::aarch64::abi](the ABI module) for more details. + NominalSPOffset(i64, Type), +} + +impl AMode { + /// Memory reference using an address in a register. + pub fn reg(reg: Reg) -> AMode { + // Use UnsignedOffset rather than Unscaled to use ldr rather than ldur. + // This also does not use PostIndexed / PreIndexed as they update the register. + AMode::UnsignedOffset(reg, UImm12Scaled::zero(I64)) + } + + /// Memory reference using the sum of two registers as an address. 
+ pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> AMode { + AMode::RegReg(reg1, reg2) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. + pub fn reg_plus_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> AMode { + AMode::RegScaled(reg1, reg2, ty) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address, with `reg2` sign- or + /// zero-extended as per `op`. + pub fn reg_plus_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> AMode { + AMode::RegScaledExtended(reg1, reg2, ty, op) + } + + /// Memory reference to a label: a global function or value, or data in the constant pool. + pub fn label(label: MemLabel) -> AMode { + AMode::Label(label) + } +} + +/// A memory argument to a load/store-pair. +#[derive(Clone, Debug)] +pub enum PairAMode { + SignedOffset(Reg, SImm7Scaled), + PreIndexed(Writable<Reg>, SImm7Scaled), + PostIndexed(Writable<Reg>, SImm7Scaled), +} + +//============================================================================= +// Instruction sub-components (conditions, branches and branch targets): +// definitions + +/// Condition for conditional branches. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum Cond { + Eq = 0, + Ne = 1, + Hs = 2, + Lo = 3, + Mi = 4, + Pl = 5, + Vs = 6, + Vc = 7, + Hi = 8, + Ls = 9, + Ge = 10, + Lt = 11, + Gt = 12, + Le = 13, + Al = 14, + Nv = 15, +} + +impl Cond { + /// Return the inverted condition. + pub fn invert(self) -> Cond { + match self { + Cond::Eq => Cond::Ne, + Cond::Ne => Cond::Eq, + + Cond::Hs => Cond::Lo, + Cond::Lo => Cond::Hs, + + Cond::Mi => Cond::Pl, + Cond::Pl => Cond::Mi, + + Cond::Vs => Cond::Vc, + Cond::Vc => Cond::Vs, + + Cond::Hi => Cond::Ls, + Cond::Ls => Cond::Hi, + + Cond::Ge => Cond::Lt, + Cond::Lt => Cond::Ge, + + Cond::Gt => Cond::Le, + Cond::Le => Cond::Gt, + + Cond::Al => Cond::Nv, + Cond::Nv => Cond::Al, + } + } + + /// Return the machine encoding of this condition. + pub fn bits(self) -> u32 { + self as u32 + } +} + +/// The kind of conditional branch: the common-case-optimized "reg-is-zero" / +/// "reg-is-nonzero" variants, or the generic one that tests the machine +/// condition codes. +#[derive(Clone, Copy, Debug)] +pub enum CondBrKind { + /// Condition: given register is zero. + Zero(Reg), + /// Condition: given register is nonzero. + NotZero(Reg), + /// Condition: the given condition-code test is true. + Cond(Cond), +} + +impl CondBrKind { + /// Return the inverted branch condition. + pub fn invert(self) -> CondBrKind { + match self { + CondBrKind::Zero(reg) => CondBrKind::NotZero(reg), + CondBrKind::NotZero(reg) => CondBrKind::Zero(reg), + CondBrKind::Cond(c) => CondBrKind::Cond(c.invert()), + } + } +} + +/// A branch target. Either unresolved (basic-block index) or resolved (offset +/// from end of current instruction). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BranchTarget { + /// An unresolved reference to a Label, as passed into + /// `lower_branch_group()`. + Label(MachLabel), + /// A fixed PC offset. + ResolvedOffset(i32), +} + +impl BranchTarget { + /// Return the target's label, if it is a label-based target. + pub fn as_label(self) -> Option<MachLabel> { + match self { + BranchTarget::Label(l) => Some(l), + _ => None, + } + } + + /// Return the target's offset, if specified, or zero if label-based. 
+ pub fn as_offset19_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + }; + assert!(off <= 0x3ffff); + assert!(off >= -0x40000); + (off as u32) & 0x7ffff + } + + /// Return the target's offset, if specified, or zero if label-based. + pub fn as_offset26_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + }; + assert!(off <= 0x1ffffff); + assert!(off >= -0x2000000); + (off as u32) & 0x3ffffff + } +} + +impl PrettyPrint for ShiftOpAndAmt { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?} {}", self.op(), self.amt().value()) + } +} + +impl PrettyPrint for ExtendOp { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?}", self) + } +} + +impl PrettyPrint for MemLabel { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &MemLabel::PCRel(off) => format!("pc+{}", off), + } + } +} + +fn shift_for_type(ty: Type) -> usize { + match ty.bytes() { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + _ => panic!("unknown type: {}", ty), + } +} + +impl PrettyPrint for AMode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &AMode::Unscaled(reg, simm9) => { + if simm9.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm9.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &AMode::UnsignedOffset(reg, uimm12) => { + if uimm12.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), uimm12.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &AMode::RegReg(r1, r2) => { + format!("[{}, {}]", r1.show_rru(mb_rru), r2.show_rru(mb_rru),) + } + &AMode::RegScaled(r1, r2, ty) => { + let shift = shift_for_type(ty); + format!( + "[{}, {}, LSL #{}]", + r1.show_rru(mb_rru), + r2.show_rru(mb_rru), + shift, + ) + } + &AMode::RegScaledExtended(r1, r2, ty, op) => { + let shift = shift_for_type(ty); + let size = match op { + ExtendOp::SXTW | ExtendOp::UXTW => OperandSize::Size32, + _ => OperandSize::Size64, + }; + let op = op.show_rru(mb_rru); + format!( + "[{}, {}, {} #{}]", + r1.show_rru(mb_rru), + show_ireg_sized(r2, mb_rru, size), + op, + shift + ) + } + &AMode::RegExtended(r1, r2, op) => { + let size = match op { + ExtendOp::SXTW | ExtendOp::UXTW => OperandSize::Size32, + _ => OperandSize::Size64, + }; + let op = op.show_rru(mb_rru); + format!( + "[{}, {}, {}]", + r1.show_rru(mb_rru), + show_ireg_sized(r2, mb_rru, size), + op, + ) + } + &AMode::Label(ref label) => label.show_rru(mb_rru), + &AMode::PreIndexed(r, simm9) => format!( + "[{}, {}]!", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) + ), + &AMode::PostIndexed(r, simm9) => format!( + "[{}], {}", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) + ), + // Eliminated by `mem_finalize()`. + &AMode::SPOffset(..) + | &AMode::FPOffset(..) + | &AMode::NominalSPOffset(..) + | &AMode::RegOffset(..) 
=> { + panic!("Unexpected pseudo mem-arg mode (stack-offset or generic reg-offset)!") + } + } + } +} + +impl PrettyPrint for PairAMode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &PairAMode::SignedOffset(reg, simm7) => { + if simm7.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm7.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &PairAMode::PreIndexed(reg, simm7) => format!( + "[{}, {}]!", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + &PairAMode::PostIndexed(reg, simm7) => format!( + "[{}], {}", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + } + } +} + +impl PrettyPrint for Cond { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = format!("{:?}", self); + s.make_ascii_lowercase(); + s + } +} + +impl PrettyPrint for BranchTarget { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &BranchTarget::Label(label) => format!("label{:?}", label.get()), + &BranchTarget::ResolvedOffset(off) => format!("{}", off), + } + } +} + +/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and +/// 64-bit variants of many instructions (and integer registers). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperandSize { + Size32, + Size64, +} + +impl OperandSize { + /// 32-bit case? + pub fn is32(self) -> bool { + self == OperandSize::Size32 + } + /// 64-bit case? + pub fn is64(self) -> bool { + self == OperandSize::Size64 + } + /// Convert from an `is32` boolean flag to an `OperandSize`. + pub fn from_is32(is32: bool) -> OperandSize { + if is32 { + OperandSize::Size32 + } else { + OperandSize::Size64 + } + } + /// Convert from a needed width to the smallest size that fits. + pub fn from_bits<I: Into<usize>>(bits: I) -> OperandSize { + let bits: usize = bits.into(); + assert!(bits <= 64); + if bits <= 32 { + OperandSize::Size32 + } else { + OperandSize::Size64 + } + } + + /// Convert from an integer type into the smallest size that fits. + pub fn from_ty(ty: Type) -> OperandSize { + Self::from_bits(ty_bits(ty)) + } + + /// Convert to I32, I64, or I128. + pub fn to_ty(self) -> Type { + match self { + OperandSize::Size32 => I32, + OperandSize::Size64 => I64, + } + } + + pub fn sf_bit(&self) -> u32 { + match self { + OperandSize::Size32 => 0, + OperandSize::Size64 => 1, + } + } +} + +/// Type used to communicate the size of a scalar SIMD & FP operand. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ScalarSize { + Size8, + Size16, + Size32, + Size64, + Size128, +} + +impl ScalarSize { + /// Convert from a needed width to the smallest size that fits. + pub fn from_bits<I: Into<usize>>(bits: I) -> ScalarSize { + match bits.into().next_power_of_two() { + 8 => ScalarSize::Size8, + 16 => ScalarSize::Size16, + 32 => ScalarSize::Size32, + 64 => ScalarSize::Size64, + 128 => ScalarSize::Size128, + w => panic!("Unexpected type width: {}", w), + } + } + + /// Convert to an integer operand size. + pub fn operand_size(&self) -> OperandSize { + match self { + ScalarSize::Size32 => OperandSize::Size32, + ScalarSize::Size64 => OperandSize::Size64, + _ => panic!("Unexpected operand_size request for: {:?}", self), + } + } + + /// Convert from a type into the smallest size that fits. + pub fn from_ty(ty: Type) -> ScalarSize { + Self::from_bits(ty_bits(ty)) + } + + /// Return the encoding bits that are used by some scalar FP instructions + /// for a particular operand size. 
+ pub fn ftype(&self) -> u32 { + match self { + ScalarSize::Size16 => 0b11, + ScalarSize::Size32 => 0b00, + ScalarSize::Size64 => 0b01, + _ => panic!("Unexpected scalar FP operand size: {:?}", self), + } + } +} + +/// Type used to communicate the size of a vector operand. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VectorSize { + Size8x8, + Size8x16, + Size16x4, + Size16x8, + Size32x2, + Size32x4, + Size64x2, +} + +impl VectorSize { + /// Get the vector operand size with the given scalar size as lane size. + pub fn from_lane_size(size: ScalarSize, is_128bit: bool) -> VectorSize { + match (size, is_128bit) { + (ScalarSize::Size8, false) => VectorSize::Size8x8, + (ScalarSize::Size8, true) => VectorSize::Size8x16, + (ScalarSize::Size16, false) => VectorSize::Size16x4, + (ScalarSize::Size16, true) => VectorSize::Size16x8, + (ScalarSize::Size32, false) => VectorSize::Size32x2, + (ScalarSize::Size32, true) => VectorSize::Size32x4, + (ScalarSize::Size64, true) => VectorSize::Size64x2, + _ => panic!("Unexpected scalar FP operand size: {:?}", size), + } + } + + /// Convert from a type into a vector operand size. + pub fn from_ty(ty: Type) -> VectorSize { + match ty { + B8X16 => VectorSize::Size8x16, + B16X8 => VectorSize::Size16x8, + B32X4 => VectorSize::Size32x4, + B64X2 => VectorSize::Size64x2, + F32X2 => VectorSize::Size32x2, + F32X4 => VectorSize::Size32x4, + F64X2 => VectorSize::Size64x2, + I8X8 => VectorSize::Size8x8, + I8X16 => VectorSize::Size8x16, + I16X4 => VectorSize::Size16x4, + I16X8 => VectorSize::Size16x8, + I32X2 => VectorSize::Size32x2, + I32X4 => VectorSize::Size32x4, + I64X2 => VectorSize::Size64x2, + _ => unimplemented!("Unsupported type: {}", ty), + } + } + + /// Get the integer operand size that corresponds to a lane of a vector with a certain size. + pub fn operand_size(&self) -> OperandSize { + match self { + VectorSize::Size64x2 => OperandSize::Size64, + _ => OperandSize::Size32, + } + } + + /// Get the scalar operand size that corresponds to a lane of a vector with a certain size. + pub fn lane_size(&self) -> ScalarSize { + match self { + VectorSize::Size8x8 => ScalarSize::Size8, + VectorSize::Size8x16 => ScalarSize::Size8, + VectorSize::Size16x4 => ScalarSize::Size16, + VectorSize::Size16x8 => ScalarSize::Size16, + VectorSize::Size32x2 => ScalarSize::Size32, + VectorSize::Size32x4 => ScalarSize::Size32, + VectorSize::Size64x2 => ScalarSize::Size64, + } + } + + pub fn is_128bits(&self) -> bool { + match self { + VectorSize::Size8x8 => false, + VectorSize::Size8x16 => true, + VectorSize::Size16x4 => false, + VectorSize::Size16x8 => true, + VectorSize::Size32x2 => false, + VectorSize::Size32x4 => true, + VectorSize::Size64x2 => true, + } + } + + /// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting + /// size would exceed 128 bits, then the number of lanes is also halved, so as to + /// ensure that the result size is at most 128 bits. + pub fn widen(&self) -> VectorSize { + match self { + VectorSize::Size8x8 => VectorSize::Size16x8, + VectorSize::Size8x16 => VectorSize::Size16x8, + VectorSize::Size16x4 => VectorSize::Size32x4, + VectorSize::Size16x8 => VectorSize::Size32x4, + VectorSize::Size32x2 => VectorSize::Size64x2, + VectorSize::Size32x4 => VectorSize::Size64x2, + VectorSize::Size64x2 => unreachable!(), + } + } + + /// Produces a `VectorSize` that has the same lane width, but half as many lanes. 
+ pub fn halve(&self) -> VectorSize { + match self { + VectorSize::Size8x16 => VectorSize::Size8x8, + VectorSize::Size16x8 => VectorSize::Size16x4, + VectorSize::Size32x4 => VectorSize::Size32x2, + _ => *self, + } + } + + /// Return the encoding bits that are used by some SIMD instructions + /// for a particular operand size. + pub fn enc_size(&self) -> (u32, u32) { + let q = self.is_128bits() as u32; + let size = match self.lane_size() { + ScalarSize::Size8 => 0b00, + ScalarSize::Size16 => 0b01, + ScalarSize::Size32 => 0b10, + ScalarSize::Size64 => 0b11, + _ => unreachable!(), + }; + + (q, size) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs new file mode 100644 index 0000000000..5d0270dade --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit.rs @@ -0,0 +1,2359 @@ +//! AArch64 ISA: binary code emission. + +use crate::binemit::{CodeOffset, Reloc, StackMap}; +use crate::ir::constant::ConstantData; +use crate::ir::types::*; +use crate::ir::{MemFlags, TrapCode}; +use crate::isa::aarch64::inst::*; +use crate::machinst::ty_bits; + +use regalloc::{Reg, RegClass, Writable}; + +use core::convert::TryFrom; +use log::debug; + +/// Memory label/reference finalization: convert a MemLabel to a PC-relative +/// offset, possibly emitting relocation(s) as necessary. +pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 { + match label { + &MemLabel::PCRel(rel) => rel, + } +} + +/// Memory addressing mode finalization: convert "special" modes (e.g., +/// generic arbitrary stack offset) into real addressing modes, possibly by +/// emitting some helper instructions that come immediately before the use +/// of this amode. +pub fn mem_finalize( + insn_off: CodeOffset, + mem: &AMode, + state: &EmitState, +) -> (SmallVec<[Inst; 4]>, AMode) { + match mem { + &AMode::RegOffset(_, off, ty) + | &AMode::SPOffset(off, ty) + | &AMode::FPOffset(off, ty) + | &AMode::NominalSPOffset(off, ty) => { + let basereg = match mem { + &AMode::RegOffset(reg, _, _) => reg, + &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => stack_reg(), + &AMode::FPOffset(..) => fp_reg(), + _ => unreachable!(), + }; + let adj = match mem { + &AMode::NominalSPOffset(..) => { + debug!( + "mem_finalize: nominal SP offset {} + adj {} -> {}", + off, + state.virtual_sp_offset, + off + state.virtual_sp_offset + ); + state.virtual_sp_offset + } + _ => 0, + }; + let off = off + adj; + + if let Some(simm9) = SImm9::maybe_from_i64(off) { + let mem = AMode::Unscaled(basereg, simm9); + (smallvec![], mem) + } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(off, ty) { + let mem = AMode::UnsignedOffset(basereg, uimm12s); + (smallvec![], mem) + } else { + let tmp = writable_spilltmp_reg(); + let mut const_insts = Inst::load_constant(tmp, off as u64); + // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form + // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP + // is a valid base (for SPOffset) which we must handle here. + // Also, SP needs to be the first arg, not second. 
+ let add_inst = Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: tmp, + rn: basereg, + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, + }; + const_insts.push(add_inst); + (const_insts, AMode::reg(tmp.to_reg())) + } + } + + &AMode::Label(ref label) => { + let off = memlabel_finalize(insn_off, label); + (smallvec![], AMode::Label(MemLabel::PCRel(off))) + } + + _ => (smallvec![], mem.clone()), + } +} + +/// Helper: get a ConstantData from a u64. +pub fn u64_constant(bits: u64) -> ConstantData { + let data = bits.to_le_bytes(); + ConstantData::from(&data[..]) +} + +//============================================================================= +// Instructions and subcomponents: emission + +fn machreg_to_gpr(m: Reg) -> u32 { + assert_eq!(m.get_class(), RegClass::I64); + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn machreg_to_vec(m: Reg) -> u32 { + assert_eq!(m.get_class(), RegClass::V128); + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn machreg_to_gpr_or_vec(m: Reg) -> u32 { + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn enc_arith_rrr(bits_31_21: u32, bits_15_10: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 { + (bits_31_21 << 21) + | (bits_15_10 << 10) + | machreg_to_gpr(rd.to_reg()) + | (machreg_to_gpr(rn) << 5) + | (machreg_to_gpr(rm) << 16) +} + +fn enc_arith_rr_imm12( + bits_31_24: u32, + immshift: u32, + imm12: u32, + rn: Reg, + rd: Writable<Reg>, +) -> u32 { + (bits_31_24 << 24) + | (immshift << 22) + | (imm12 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rr_imml(bits_31_23: u32, imm_bits: u32, rn: Reg, rd: Writable<Reg>) -> u32 { + (bits_31_23 << 23) | (imm_bits << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rrrr(top11: u32, rm: Reg, bit15: u32, ra: Reg, rn: Reg, rd: Writable<Reg>) -> u32 { + (top11 << 21) + | (machreg_to_gpr(rm) << 16) + | (bit15 << 15) + | (machreg_to_gpr(ra) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_jump26(op_31_26: u32, off_26_0: u32) -> u32 { + assert!(off_26_0 < (1 << 26)); + (op_31_26 << 26) | off_26_0 +} + +fn enc_cmpbr(op_31_24: u32, off_18_0: u32, reg: Reg) -> u32 { + assert!(off_18_0 < (1 << 19)); + (op_31_24 << 24) | (off_18_0 << 5) | machreg_to_gpr(reg) +} + +fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 { + assert!(off_18_0 < (1 << 19)); + assert!(cond < (1 << 4)); + (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond +} + +fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 { + match kind { + CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg), + CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg), + CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()), + } +} + +const MOVE_WIDE_FIXED: u32 = 0x12800000; + +#[repr(u32)] +enum MoveWideOpcode { + MOVN = 0b00, + MOVZ = 0b10, + MOVK = 0b11, +} + +fn enc_move_wide( + op: MoveWideOpcode, + rd: Writable<Reg>, + imm: MoveWideConst, + size: OperandSize, +) -> u32 { + assert!(imm.shift <= 0b11); + MOVE_WIDE_FIXED + | size.sf_bit() << 31 + | (op as u32) << 29 + | u32::from(imm.shift) << 21 + | u32::from(imm.bits) << 5 + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_ldst_pair(op_31_22: u32, simm7: SImm7Scaled, rn: Reg, rt: Reg, rt2: Reg) -> u32 { + (op_31_22 << 22) + | (simm7.bits() << 15) + | (machreg_to_gpr(rt2) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + +fn 
enc_ldst_simm9(op_31_22: u32, simm9: SImm9, op_11_10: u32, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (simm9.bits() << 12) + | (op_11_10 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_uimm12(op_31_22: u32, uimm12: UImm12Scaled, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (0b1 << 24) + | (uimm12.bits() << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_reg( + op_31_22: u32, + rn: Reg, + rm: Reg, + s_bit: bool, + extendop: Option<ExtendOp>, + rd: Reg, +) -> u32 { + let s_bit = if s_bit { 1 } else { 0 }; + let extend_bits = match extendop { + Some(ExtendOp::UXTW) => 0b010, + Some(ExtendOp::SXTW) => 0b110, + Some(ExtendOp::SXTX) => 0b111, + None => 0b011, // LSL + _ => panic!("bad extend mode for ld/st AMode"), + }; + (op_31_22 << 22) + | (1 << 21) + | (machreg_to_gpr(rm) << 16) + | (extend_bits << 13) + | (s_bit << 12) + | (0b10 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 { + (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 { + debug_assert_eq!(q & 0b1, q); + debug_assert_eq!(size & 0b11, size); + 0b0_0_0011010_10_00000_110_0_00_00000_00000 + | q << 30 + | size << 10 + | machreg_to_gpr(rn) << 5 + | machreg_to_vec(rt.to_reg()) +} + +fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -> u32 { + (top11 << 21) + | (machreg_to_vec(rm) << 16) + | (bit15_10 << 10) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 { + (0b01011010110 << 21) + | size << 31 + | opcode2 << 16 + | opcode1 << 10 + | machreg_to_gpr(rn) << 5 + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_br(rn: Reg) -> u32 { + 0b1101011_0000_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5) +} + +fn enc_adr(off: i32, rd: Writable<Reg>) -> u32 { + let off = u32::try_from(off).unwrap(); + let immlo = off & 3; + let immhi = (off >> 2) & ((1 << 19) - 1); + (0b00010000 << 24) | (immlo << 29) | (immhi << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_csel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond) -> u32 { + 0b100_11010100_00000_0000_00_00000_00000 + | (machreg_to_gpr(rm) << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) + | (cond.bits() << 12) +} + +fn enc_fcsel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond, size: ScalarSize) -> u32 { + 0b000_11110_00_1_00000_0000_11_00000_00000 + | (size.ftype() << 22) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) + | (cond.bits() << 12) +} + +fn enc_cset(rd: Writable<Reg>, cond: Cond) -> u32 { + 0b100_11010100_11111_0000_01_11111_00000 + | machreg_to_gpr(rd.to_reg()) + | (cond.invert().bits() << 12) +} + +fn enc_ccmp_imm(size: OperandSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) -> u32 { + 0b0_1_1_11010010_00000_0000_10_00000_0_0000 + | size.sf_bit() << 31 + | imm.bits() << 16 + | cond.bits() << 12 + | machreg_to_gpr(rn) << 5 + | nzcv.bits() +} + +fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 { + 0b00001110_101_00000_00011_1_00000_00000 + | ((is_16b as u32) << 30) + | machreg_to_vec(rd.to_reg()) + | (machreg_to_vec(rn) << 16) + | (machreg_to_vec(rn) << 5) +} + +fn enc_fpurr(top22: u32, 
rd: Writable<Reg>, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +fn enc_fpurrr(top22: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 { + (top22 << 10) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_fpurrrr(top17: u32, rd: Writable<Reg>, rn: Reg, rm: Reg, ra: Reg) -> u32 { + (top17 << 15) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(ra) << 10) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_fcmp(size: ScalarSize, rn: Reg, rm: Reg) -> u32 { + 0b000_11110_00_1_00000_00_1000_00000_00000 + | (size.ftype() << 22) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(rn) << 5) +} + +fn enc_fputoint(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + (top16 << 16) | (machreg_to_vec(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_inttofpu(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + (top16 << 16) | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + debug_assert_eq!(qu & 0b11, qu); + debug_assert_eq!(size & 0b11, size); + debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16); + let bits = 0b0_00_01110_00_10000_00000_10_00000_00000; + bits | qu << 29 + | size << 22 + | bits_12_16 << 12 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()) +} + +fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 { + debug_assert_eq!(q & 0b1, q); + debug_assert_eq!(u & 0b1, u); + debug_assert_eq!(size & 0b11, size); + debug_assert_eq!(opcode & 0b11111, opcode); + 0b0_0_0_01110_00_11000_0_0000_10_00000_00000 + | q << 30 + | u << 29 + | size << 22 + | opcode << 12 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()) +} + +fn enc_tbl(is_extension: bool, len: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 { + debug_assert_eq!(len & 0b11, len); + 0b0_1_001110_000_00000_0_00_0_00_00000_00000 + | (machreg_to_vec(rm) << 16) + | len << 13 + | (is_extension as u32) << 12 + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_dmb_ish() -> u32 { + 0xD5033BBF +} + +fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00001000_01011111_01111100_00000000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt.to_reg()) +} + +fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00001000_00000000_01111100_00000000 + | (sz << 30) + | (machreg_to_gpr(rs.to_reg()) << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + +fn enc_asimd_mod_imm(rd: Writable<Reg>, q_op: u32, cmode: u32, imm: u8) -> u32 { + let abc = (imm >> 5) as u32; + let defgh = (imm & 0b11111) as u32; + + debug_assert_eq!(cmode & 0b1111, cmode); + debug_assert_eq!(q_op & 0b11, q_op); + + 0b0_0_0_0111100000_000_0000_01_00000_00000 + | (q_op << 29) + | (abc << 16) + | (cmode << 12) + | (defgh << 5) + | machreg_to_vec(rd.to_reg()) +} + +/// State carried between emissions of a sequence of instructions. 
+#[derive(Default, Clone, Debug)] +pub struct EmitState { + /// Addend to convert nominal-SP offsets to real-SP offsets at the current + /// program point. + pub(crate) virtual_sp_offset: i64, + /// Offset of FP from nominal-SP. + pub(crate) nominal_sp_to_fp: i64, + /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`. + stack_map: Option<StackMap>, + /// Current source-code location corresponding to instruction to be emitted. + cur_srcloc: SourceLoc, +} + +impl MachInstEmitState<Inst> for EmitState { + fn new(abi: &dyn ABICallee<I = Inst>) -> Self { + EmitState { + virtual_sp_offset: 0, + nominal_sp_to_fp: abi.frame_size() as i64, + stack_map: None, + cur_srcloc: SourceLoc::default(), + } + } + + fn pre_safepoint(&mut self, stack_map: StackMap) { + self.stack_map = Some(stack_map); + } + + fn pre_sourceloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } +} + +impl EmitState { + fn take_stack_map(&mut self) -> Option<StackMap> { + self.stack_map.take() + } + + fn clear_post_insn(&mut self) { + self.stack_map = None; + } + + fn cur_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } +} + +/// Constant state used during function compilation. +pub struct EmitInfo(settings::Flags); + +impl EmitInfo { + pub(crate) fn new(flags: settings::Flags) -> Self { + Self(flags) + } +} + +impl MachInstEmitInfo for EmitInfo { + fn flags(&self) -> &settings::Flags { + &self.0 + } +} + +impl MachInstEmit for Inst { + type State = EmitState; + type Info = EmitInfo; + type UnwindInfo = super::unwind::AArch64UnwindInfo; + + fn emit(&self, sink: &mut MachBuffer<Inst>, emit_info: &Self::Info, state: &mut EmitState) { + // N.B.: we *must* not exceed the "worst-case size" used to compute + // where to insert islands, except when islands are explicitly triggered + // (with an `EmitIsland`). We check this in debug builds. This is `mut` + // to allow disabling the check for `JTSequence`, which is always + // emitted following an `EmitIsland`. 
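+        // `sink` accumulates the machine-code bytes along with label uses,
+        // relocations, traps and call sites; `emit_info` carries the compilation
+        // flags; `state` carries the per-function data described on `EmitState`.
+        // `start_off` and the `end_off` taken at the bottom of this function
+        // bracket this instruction's bytes for the `worst_case_size` check.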
+ let mut start_off = sink.cur_offset(); + + match self { + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let top11 = match alu_op { + ALUOp::Add32 => 0b00001011_000, + ALUOp::Add64 => 0b10001011_000, + ALUOp::Sub32 => 0b01001011_000, + ALUOp::Sub64 => 0b11001011_000, + ALUOp::Orr32 => 0b00101010_000, + ALUOp::Orr64 => 0b10101010_000, + ALUOp::And32 => 0b00001010_000, + ALUOp::And64 => 0b10001010_000, + ALUOp::Eor32 => 0b01001010_000, + ALUOp::Eor64 => 0b11001010_000, + ALUOp::OrrNot32 => 0b00101010_001, + ALUOp::OrrNot64 => 0b10101010_001, + ALUOp::AndNot32 => 0b00001010_001, + ALUOp::AndNot64 => 0b10001010_001, + ALUOp::EorNot32 => 0b01001010_001, + ALUOp::EorNot64 => 0b11001010_001, + ALUOp::AddS32 => 0b00101011_000, + ALUOp::AddS64 => 0b10101011_000, + ALUOp::SubS32 => 0b01101011_000, + ALUOp::SubS64 => 0b11101011_000, + ALUOp::SDiv64 => 0b10011010_110, + ALUOp::UDiv64 => 0b10011010_110, + ALUOp::RotR32 | ALUOp::Lsr32 | ALUOp::Asr32 | ALUOp::Lsl32 => 0b00011010_110, + ALUOp::RotR64 | ALUOp::Lsr64 | ALUOp::Asr64 | ALUOp::Lsl64 => 0b10011010_110, + ALUOp::SMulH => 0b10011011_010, + ALUOp::UMulH => 0b10011011_110, + }; + let bit15_10 = match alu_op { + ALUOp::SDiv64 => 0b000011, + ALUOp::UDiv64 => 0b000010, + ALUOp::RotR32 | ALUOp::RotR64 => 0b001011, + ALUOp::Lsr32 | ALUOp::Lsr64 => 0b001001, + ALUOp::Asr32 | ALUOp::Asr64 => 0b001010, + ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000, + ALUOp::SMulH | ALUOp::UMulH => 0b011111, + _ => 0b000000, + }; + debug_assert_ne!(writable_stack_reg(), rd); + // The stack pointer is the zero register in this context, so this might be an + // indication that something is wrong. + debug_assert_ne!(stack_reg(), rn); + debug_assert_ne!(stack_reg(), rm); + sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm)); + } + &Inst::AluRRRR { + alu_op, + rd, + rm, + rn, + ra, + } => { + let (top11, bit15) = match alu_op { + ALUOp3::MAdd32 => (0b0_00_11011_000, 0), + ALUOp3::MSub32 => (0b0_00_11011_000, 1), + ALUOp3::MAdd64 => (0b1_00_11011_000, 0), + ALUOp3::MSub64 => (0b1_00_11011_000, 1), + }; + sink.put4(enc_arith_rrrr(top11, rm, bit15, ra, rn, rd)); + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let top8 = match alu_op { + ALUOp::Add32 => 0b000_10001, + ALUOp::Add64 => 0b100_10001, + ALUOp::Sub32 => 0b010_10001, + ALUOp::Sub64 => 0b110_10001, + ALUOp::AddS32 => 0b001_10001, + ALUOp::AddS64 => 0b101_10001, + ALUOp::SubS32 => 0b011_10001, + ALUOp::SubS64 => 0b111_10001, + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4(enc_arith_rr_imm12( + top8, + imm12.shift_bits(), + imm12.imm_bits(), + rn, + rd, + )); + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (top9, inv) = match alu_op { + ALUOp::Orr32 => (0b001_100100, false), + ALUOp::Orr64 => (0b101_100100, false), + ALUOp::And32 => (0b000_100100, false), + ALUOp::And64 => (0b100_100100, false), + ALUOp::Eor32 => (0b010_100100, false), + ALUOp::Eor64 => (0b110_100100, false), + ALUOp::OrrNot32 => (0b001_100100, true), + ALUOp::OrrNot64 => (0b101_100100, true), + ALUOp::AndNot32 => (0b000_100100, true), + ALUOp::AndNot64 => (0b100_100100, true), + ALUOp::EorNot32 => (0b010_100100, true), + ALUOp::EorNot64 => (0b110_100100, true), + _ => unimplemented!("{:?}", alu_op), + }; + let imml = if inv { imml.invert() } else { imml.clone() }; + sink.put4(enc_arith_rr_imml(top9, imml.enc_bits(), rn, rd)); + } + + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let amt = immshift.value(); + let (top10, immr, imms) = match alu_op { + ALUOp::RotR32 => 
(0b0001001110, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::Lsr32 => (0b0101001100, u32::from(amt), 0b011111), + ALUOp::Lsr64 => (0b1101001101, u32::from(amt), 0b111111), + ALUOp::Asr32 => (0b0001001100, u32::from(amt), 0b011111), + ALUOp::Asr64 => (0b1001001101, u32::from(amt), 0b111111), + ALUOp::Lsl32 => ( + 0b0101001100, + u32::from((32 - amt) % 32), + u32::from(31 - amt), + ), + ALUOp::Lsl64 => ( + 0b1101001101, + u32::from((64 - amt) % 64), + u32::from(63 - amt), + ), + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4( + (top10 << 22) + | (immr << 16) + | (imms << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let top11: u32 = match alu_op { + ALUOp::Add32 => 0b000_01011000, + ALUOp::Add64 => 0b100_01011000, + ALUOp::AddS32 => 0b001_01011000, + ALUOp::AddS64 => 0b101_01011000, + ALUOp::Sub32 => 0b010_01011000, + ALUOp::Sub64 => 0b110_01011000, + ALUOp::SubS32 => 0b011_01011000, + ALUOp::SubS64 => 0b111_01011000, + ALUOp::Orr32 => 0b001_01010000, + ALUOp::Orr64 => 0b101_01010000, + ALUOp::And32 => 0b000_01010000, + ALUOp::And64 => 0b100_01010000, + ALUOp::Eor32 => 0b010_01010000, + ALUOp::Eor64 => 0b110_01010000, + ALUOp::OrrNot32 => 0b001_01010001, + ALUOp::OrrNot64 => 0b101_01010001, + ALUOp::EorNot32 => 0b010_01010001, + ALUOp::EorNot64 => 0b110_01010001, + ALUOp::AndNot32 => 0b000_01010001, + ALUOp::AndNot64 => 0b100_01010001, + _ => unimplemented!("{:?}", alu_op), + }; + let top11 = top11 | (u32::from(shiftop.op().bits()) << 1); + let bits_15_10 = u32::from(shiftop.amt().value()); + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + extendop, + } => { + let top11: u32 = match alu_op { + ALUOp::Add32 => 0b00001011001, + ALUOp::Add64 => 0b10001011001, + ALUOp::Sub32 => 0b01001011001, + ALUOp::Sub64 => 0b11001011001, + ALUOp::AddS32 => 0b00101011001, + ALUOp::AddS64 => 0b10101011001, + ALUOp::SubS32 => 0b01101011001, + ALUOp::SubS64 => 0b11101011001, + _ => unimplemented!("{:?}", alu_op), + }; + let bits_15_10 = u32::from(extendop.bits()) << 3; + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::BitRR { op, rd, rn, .. } => { + let size = if op.operand_size().is32() { 0b0 } else { 0b1 }; + let (op1, op2) = match op { + BitOp::RBit32 | BitOp::RBit64 => (0b00000, 0b000000), + BitOp::Clz32 | BitOp::Clz64 => (0b00000, 0b000100), + BitOp::Cls32 | BitOp::Cls64 => (0b00000, 0b000101), + }; + sink.put4(enc_bit_rr(size, op1, op2, rn, rd)) + } + + &Inst::ULoad8 { rd, ref mem, flags } + | &Inst::SLoad8 { rd, ref mem, flags } + | &Inst::ULoad16 { rd, ref mem, flags } + | &Inst::SLoad16 { rd, ref mem, flags } + | &Inst::ULoad32 { rd, ref mem, flags } + | &Inst::SLoad32 { rd, ref mem, flags } + | &Inst::ULoad64 { + rd, ref mem, flags, .. + } + | &Inst::FpuLoad32 { rd, ref mem, flags } + | &Inst::FpuLoad64 { rd, ref mem, flags } + | &Inst::FpuLoad128 { rd, ref mem, flags } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); + + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + + // ldst encoding helpers take Reg, not Writable<Reg>. + let rd = rd.to_reg(); + + // This is the base opcode (top 10 bits) for the "unscaled + // immediate" form (Unscaled). Other addressing modes will OR in + // other values for bits 24/25 (bits 1/2 of this constant). 
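+                // For example, `ULoad64` below uses the base opcode 0b1111100001;
+                // the `UnsignedOffset` case then routes through `enc_ldst_uimm12`,
+                // which ORs in bit 24 to select the scaled unsigned-immediate form,
+                // while `Unscaled`, `PreIndexed` and `PostIndexed` keep the base
+                // opcode and differ only in the bits-11..10 field passed to
+                // `enc_ldst_simm9`.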
+ let (op, bits) = match self { + &Inst::ULoad8 { .. } => (0b0011100001, 8), + &Inst::SLoad8 { .. } => (0b0011100010, 8), + &Inst::ULoad16 { .. } => (0b0111100001, 16), + &Inst::SLoad16 { .. } => (0b0111100010, 16), + &Inst::ULoad32 { .. } => (0b1011100001, 32), + &Inst::SLoad32 { .. } => (0b1011100010, 32), + &Inst::ULoad64 { .. } => (0b1111100001, 64), + &Inst::FpuLoad32 { .. } => (0b1011110001, 32), + &Inst::FpuLoad64 { .. } => (0b1111110001, 64), + &Inst::FpuLoad128 { .. } => (0b0011110011, 128), + _ => unreachable!(), + }; + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + match &mem { + &AMode::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &AMode::UnsignedOffset(reg, uimm12scaled) => { + if uimm12scaled.value() != 0 { + assert_eq!(bits, ty_bits(uimm12scaled.scale_ty())); + } + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &AMode::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &AMode::RegScaled(r1, r2, ty) | &AMode::RegScaledExtended(r1, r2, ty, _) => { + assert_eq!(bits, ty_bits(ty)); + let extendop = match &mem { + &AMode::RegScaled(..) => None, + &AMode::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &AMode::RegExtended(r1, r2, extendop) => { + sink.put4(enc_ldst_reg( + op, + r1, + r2, + /* scaled = */ false, + Some(extendop), + rd, + )); + } + &AMode::Label(ref label) => { + let offset = match label { + // cast i32 to u32 (two's-complement) + &MemLabel::PCRel(off) => off as u32, + } / 4; + assert!(offset < (1 << 19)); + match self { + &Inst::ULoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011000, offset, rd)); + } + &Inst::SLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b10011000, offset, rd)); + } + &Inst::FpuLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011100, offset, rd)); + } + &Inst::ULoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011000, offset, rd)); + } + &Inst::FpuLoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011100, offset, rd)); + } + &Inst::FpuLoad128 { .. } => { + sink.put4(enc_ldst_imm19(0b10011100, offset, rd)); + } + _ => panic!("Unspported size for LDR from constant pool!"), + } + } + &AMode::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &AMode::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &AMode::SPOffset(..) | &AMode::FPOffset(..) | &AMode::NominalSPOffset(..) => { + panic!("Should not see stack-offset here!") + } + &AMode::RegOffset(..) => panic!("SHould not see generic reg-offset here!"), + } + } + + &Inst::Store8 { rd, ref mem, flags } + | &Inst::Store16 { rd, ref mem, flags } + | &Inst::Store32 { rd, ref mem, flags } + | &Inst::Store64 { rd, ref mem, flags } + | &Inst::FpuStore32 { rd, ref mem, flags } + | &Inst::FpuStore64 { rd, ref mem, flags } + | &Inst::FpuStore128 { rd, ref mem, flags } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); + + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + + let (op, bits) = match self { + &Inst::Store8 { .. } => (0b0011100000, 8), + &Inst::Store16 { .. } => (0b0111100000, 16), + &Inst::Store32 { .. 
} => (0b1011100000, 32), + &Inst::Store64 { .. } => (0b1111100000, 64), + &Inst::FpuStore32 { .. } => (0b1011110000, 32), + &Inst::FpuStore64 { .. } => (0b1111110000, 64), + &Inst::FpuStore128 { .. } => (0b0011110010, 128), + _ => unreachable!(), + }; + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + match &mem { + &AMode::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &AMode::UnsignedOffset(reg, uimm12scaled) => { + if uimm12scaled.value() != 0 { + assert_eq!(bits, ty_bits(uimm12scaled.scale_ty())); + } + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &AMode::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &AMode::RegScaled(r1, r2, _ty) | &AMode::RegScaledExtended(r1, r2, _ty, _) => { + let extendop = match &mem { + &AMode::RegScaled(..) => None, + &AMode::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &AMode::RegExtended(r1, r2, extendop) => { + sink.put4(enc_ldst_reg( + op, + r1, + r2, + /* scaled = */ false, + Some(extendop), + rd, + )); + } + &AMode::Label(..) => { + panic!("Store to a MemLabel not implemented!"); + } + &AMode::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &AMode::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &AMode::SPOffset(..) | &AMode::FPOffset(..) | &AMode::NominalSPOffset(..) => { + panic!("Should not see stack-offset here!") + } + &AMode::RegOffset(..) => panic!("SHould not see generic reg-offset here!"), + } + } + + &Inst::StoreP64 { + rt, + rt2, + ref mem, + flags, + } => { + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + match mem { + &PairAMode::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100100, simm7, reg, rt, rt2)); + } + &PairAMode::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100110, simm7, reg.to_reg(), rt, rt2)); + } + &PairAMode::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100010, simm7, reg.to_reg(), rt, rt2)); + } + } + } + &Inst::LoadP64 { + rt, + rt2, + ref mem, + flags, + } => { + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. 
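+                // The recorded code offset lets a memory fault at this instruction
+                // be reported as a `HeapOutOfBounds` trap rather than a crash.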
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + let rt = rt.to_reg(); + let rt2 = rt2.to_reg(); + match mem { + &PairAMode::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100101, simm7, reg, rt, rt2)); + } + &PairAMode::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100111, simm7, reg.to_reg(), rt, rt2)); + } + &PairAMode::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100011, simm7, reg.to_reg(), rt, rt2)); + } + } + } + &Inst::Mov64 { rd, rm } => { + assert!(rd.to_reg().get_class() == rm.get_class()); + assert!(rm.get_class() == RegClass::I64); + + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(rd.to_reg() != stack_reg()); + + if rm == stack_reg() { + // We can't use ORR here, so use an `add rd, sp, #0` instead. + let imm12 = Imm12::maybe_from_u64(0).unwrap(); + sink.put4(enc_arith_rr_imm12( + 0b100_10001, + imm12.shift_bits(), + imm12.imm_bits(), + rm, + rd, + )); + } else { + // Encoded as ORR rd, rm, zero. + sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm)); + } + } + &Inst::Mov32 { rd, rm } => { + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(machreg_to_gpr(rd.to_reg()) != 31); + // Encoded as ORR rd, rm, zero. + sink.put4(enc_arith_rrr(0b00101010_000, 0b000_000, rd, zero_reg(), rm)); + } + &Inst::MovZ { rd, imm, size } => { + sink.put4(enc_move_wide(MoveWideOpcode::MOVZ, rd, imm, size)) + } + &Inst::MovN { rd, imm, size } => { + sink.put4(enc_move_wide(MoveWideOpcode::MOVN, rd, imm, size)) + } + &Inst::MovK { rd, imm, size } => { + sink.put4(enc_move_wide(MoveWideOpcode::MOVK, rd, imm, size)) + } + &Inst::CSel { rd, rn, rm, cond } => { + sink.put4(enc_csel(rd, rn, rm, cond)); + } + &Inst::CSet { rd, cond } => { + sink.put4(enc_cset(rd, cond)); + } + &Inst::CCmpImm { + size, + rn, + imm, + nzcv, + cond, + } => { + sink.put4(enc_ccmp_imm(size, rn, imm, nzcv, cond)); + } + &Inst::AtomicRMW { ty, op } => { + /* Emit this: + dmb ish + again: + ldxr{,b,h} x/w27, [x25] + op x28, x27, x26 // op is add,sub,and,orr,eor + stxr{,b,h} w24, x/w28, [x25] + cbnz x24, again + dmb ish + + Operand conventions: + IN: x25 (addr), x26 (2nd arg for op) + OUT: x27 (old value), x24 (trashed), x28 (trashed) + + It is unfortunate that, per the ARM documentation, x28 cannot be used for + both the store-data and success-flag operands of stxr. This causes the + instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24 + instead for the success-flag. + + In the case where the operation is 'xchg', the second insn is instead + mov x28, x26 + so that we simply write in the destination, the "2nd arg for op". 
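+
+               Note that this expansion is the generic load-exclusive /
+               store-exclusive retry loop; no LSE atomic instructions are used,
+               and the whole sequence is bracketed by `dmb ish` barriers.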
+ */ + let xzr = zero_reg(); + let x24 = xreg(24); + let x25 = xreg(25); + let x26 = xreg(26); + let x27 = xreg(27); + let x28 = xreg(28); + let x24wr = writable_xreg(24); + let x27wr = writable_xreg(27); + let x28wr = writable_xreg(28); + let again_label = sink.get_label(); + + sink.put4(enc_dmb_ish()); // dmb ish + + // again: + sink.bind_label(again_label); + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + + if op == inst_common::AtomicRmwOp::Xchg { + // mov x28, x26 + sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26)) + } else { + // add/sub/and/orr/eor x28, x27, x26 + let bits_31_21 = match op { + inst_common::AtomicRmwOp::Add => 0b100_01011_00_0, + inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0, + inst_common::AtomicRmwOp::And => 0b100_01010_00_0, + inst_common::AtomicRmwOp::Or => 0b101_01010_00_0, + inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0, + inst_common::AtomicRmwOp::Xchg => unreachable!(), + }; + sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26)); + } + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + + // cbnz w24, again + // Note, we're actually testing x24, and relying on the default zero-high-half + // rule in the assignment that `stxr` does. + let br_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(again_label), + CondBrKind::NotZero(x24), + )); + sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19); + + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::AtomicCAS { ty } => { + /* Emit this: + dmb ish + again: + ldxr{,b,h} x/w27, [x25] + and x24, x26, MASK (= 2^size_bits - 1) + cmp x27, x24 + b.ne out + stxr{,b,h} w24, x/w28, [x25] + cbnz x24, again + out: + dmb ish + + Operand conventions: + IN: x25 (addr), x26 (expected value), x28 (replacement value) + OUT: x27 (old value), x24 (trashed) + */ + let xzr = zero_reg(); + let x24 = xreg(24); + let x25 = xreg(25); + let x26 = xreg(26); + let x27 = xreg(27); + let x28 = xreg(28); + let xzrwr = writable_zero_reg(); + let x24wr = writable_xreg(24); + let x27wr = writable_xreg(27); + let again_label = sink.get_label(); + let out_label = sink.get_label(); + + sink.put4(enc_dmb_ish()); // dmb ish + + // again: + sink.bind_label(again_label); + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + + if ty == I64 { + // mov x24, x26 + sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26)) + } else { + // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF + let (mask, s) = match ty { + I8 => (0xFF, 7), + I16 => (0xFFFF, 15), + I32 => (0xFFFFFFFF, 31), + _ => unreachable!(), + }; + sink.put4(enc_arith_rr_imml( + 0b100_100100, + ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(), + x26, + x24wr, + )) + } + + // cmp x27, x24 (== subs xzr, x27, x24) + sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24)); + + // b.ne out + let br_out_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(out_label), + CondBrKind::Cond(Cond::Ne), + )); + sink.use_label_at_offset(br_out_offset, out_label, LabelUse::Branch19); + + let srcloc = state.cur_srcloc(); + if srcloc != 
SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + + // cbnz w24, again. + // Note, we're actually testing x24, and relying on the default zero-high-half + // rule in the assignment that `stxr` does. + let br_again_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(again_label), + CondBrKind::NotZero(x24), + )); + sink.use_label_at_offset(br_again_offset, again_label, LabelUse::Branch19); + + // out: + sink.bind_label(out_label); + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::AtomicLoad { ty, r_data, r_addr } => { + let op = match ty { + I8 => 0b0011100001, + I16 => 0b0111100001, + I32 => 0b1011100001, + I64 => 0b1111100001, + _ => unreachable!(), + }; + sink.put4(enc_dmb_ish()); // dmb ish + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); + sink.put4(enc_ldst_uimm12( + op, + uimm12scaled_zero, + r_addr, + r_data.to_reg(), + )); + } + &Inst::AtomicStore { ty, r_data, r_addr } => { + let op = match ty { + I8 => 0b0011100000, + I16 => 0b0111100000, + I32 => 0b1011100000, + I64 => 0b1111100000, + _ => unreachable!(), + }; + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); + sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data)); + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::Fence {} => { + sink.put4(enc_dmb_ish()); // dmb ish + } + &Inst::FpuMove64 { rd, rn } => { + sink.put4(enc_vecmov(/* 16b = */ false, rd, rn)); + } + &Inst::FpuMove128 { rd, rn } => { + sink.put4(enc_vecmov(/* 16b = */ true, rd, rn)); + } + &Inst::FpuMoveFromVec { rd, rn, idx, size } => { + let (imm5, shift, mask) = match size.lane_size() { + ScalarSize::Size32 => (0b00100, 3, 0b011), + ScalarSize::Size64 => (0b01000, 4, 0b001), + _ => unimplemented!(), + }; + debug_assert_eq!(idx & mask, idx); + let imm5 = imm5 | ((idx as u32) << shift); + sink.put4( + 0b010_11110000_00000_000001_00000_00000 + | (imm5 << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let top22 = match fpu_op { + FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000, + FPUOp1::Abs64 => 0b000_11110_01_1_000001_10000, + FPUOp1::Neg32 => 0b000_11110_00_1_000010_10000, + FPUOp1::Neg64 => 0b000_11110_01_1_000010_10000, + FPUOp1::Sqrt32 => 0b000_11110_00_1_000011_10000, + FPUOp1::Sqrt64 => 0b000_11110_01_1_000011_10000, + FPUOp1::Cvt32To64 => 0b000_11110_00_1_000101_10000, + FPUOp1::Cvt64To32 => 0b000_11110_01_1_000100_10000, + }; + sink.put4(enc_fpurr(top22, rd, rn)); + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let top22 = match fpu_op { + FPUOp2::Add32 => 0b000_11110_00_1_00000_001010, + FPUOp2::Add64 => 0b000_11110_01_1_00000_001010, + FPUOp2::Sub32 => 0b000_11110_00_1_00000_001110, + FPUOp2::Sub64 => 0b000_11110_01_1_00000_001110, + FPUOp2::Mul32 => 0b000_11110_00_1_00000_000010, + FPUOp2::Mul64 => 0b000_11110_01_1_00000_000010, + FPUOp2::Div32 => 0b000_11110_00_1_00000_000110, + FPUOp2::Div64 => 0b000_11110_01_1_00000_000110, + FPUOp2::Max32 => 0b000_11110_00_1_00000_010010, + FPUOp2::Max64 => 0b000_11110_01_1_00000_010010, + FPUOp2::Min32 => 0b000_11110_00_1_00000_010110, + FPUOp2::Min64 => 0b000_11110_01_1_00000_010110, + FPUOp2::Sqadd64 => 
0b010_11110_11_1_00000_000011, + FPUOp2::Uqadd64 => 0b011_11110_11_1_00000_000011, + FPUOp2::Sqsub64 => 0b010_11110_11_1_00000_001011, + FPUOp2::Uqsub64 => 0b011_11110_11_1_00000_001011, + }; + sink.put4(enc_fpurrr(top22, rd, rn, rm)); + } + &Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op { + FPUOpRI::UShr32(imm) => { + debug_assert_eq!(32, imm.lane_size_in_bits); + sink.put4( + 0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000 + | imm.enc() << 16 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()), + ) + } + FPUOpRI::UShr64(imm) => { + debug_assert_eq!(64, imm.lane_size_in_bits); + sink.put4( + 0b01_1_111110_0000000_00_0_0_0_1_00000_00000 + | imm.enc() << 16 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()), + ) + } + FPUOpRI::Sli64(imm) => { + debug_assert_eq!(64, imm.lane_size_in_bits); + sink.put4( + 0b01_1_111110_0000000_010101_00000_00000 + | imm.enc() << 16 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()), + ) + } + FPUOpRI::Sli32(imm) => { + debug_assert_eq!(32, imm.lane_size_in_bits); + sink.put4( + 0b0_0_1_011110_0000000_010101_00000_00000 + | imm.enc() << 16 + | machreg_to_vec(rn) << 5 + | machreg_to_vec(rd.to_reg()), + ) + } + }, + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let top17 = match fpu_op { + FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0, + FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0, + }; + sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); + } + &Inst::VecMisc { op, rd, rn, size } => { + let (q, enc_size) = size.enc_size(); + let (u, bits_12_16, size) = match op { + VecMisc2::Not => (0b1, 0b00101, 0b00), + VecMisc2::Neg => (0b1, 0b01011, enc_size), + VecMisc2::Abs => (0b0, 0b01011, enc_size), + VecMisc2::Fabs => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b01111, enc_size) + } + VecMisc2::Fneg => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b1, 0b01111, enc_size) + } + VecMisc2::Fsqrt => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b1, 0b11111, enc_size) + } + VecMisc2::Rev64 => { + debug_assert_ne!(VectorSize::Size64x2, size); + (0b0, 0b00000, enc_size) + } + VecMisc2::Shll => { + debug_assert_ne!(VectorSize::Size64x2, size); + debug_assert!(!size.is_128bits()); + (0b1, 0b10011, enc_size) + } + VecMisc2::Fcvtzs => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11011, enc_size) + } + VecMisc2::Fcvtzu => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b1, 0b11011, enc_size) + } + VecMisc2::Scvtf => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11101, enc_size & 0b1) + } + VecMisc2::Ucvtf => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b1, 0b11101, enc_size & 0b1) + } + VecMisc2::Frintn => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11000, enc_size & 0b01) + } + VecMisc2::Frintz => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11001, enc_size | 0b10) + } + VecMisc2::Frintm => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11001, enc_size & 0b01) + } + VecMisc2::Frintp => { + debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2); + (0b0, 0b11000, enc_size | 0b10) + } + }; + sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn)); + } + &Inst::VecLanes { op, rd, rn, size } => { + 
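+                // UMINV and ADDV are across-lanes reductions. Only the 128-bit
+                // arrangements are handled here (`q` is always 1); narrower vectors
+                // and the 64x2 arrangement fall into the `unreachable!()` below.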
let (q, size) = match size { + VectorSize::Size8x16 => (0b1, 0b00), + VectorSize::Size16x8 => (0b1, 0b01), + VectorSize::Size32x4 => (0b1, 0b10), + _ => unreachable!(), + }; + let (u, opcode) = match op { + VecLanesOp::Uminv => (0b1, 0b11010), + VecLanesOp::Addv => (0b0, 0b11011), + }; + sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn)); + } + &Inst::VecShiftImm { + op, + rd, + rn, + size, + imm, + } => { + let (is_shr, template) = match op { + VecShiftImmOp::Ushr => (true, 0b_011_011110_0000_000_000001_00000_00000_u32), + VecShiftImmOp::Sshr => (true, 0b_010_011110_0000_000_000001_00000_00000_u32), + VecShiftImmOp::Shl => (false, 0b_010_011110_0000_000_010101_00000_00000_u32), + }; + let imm = imm as u32; + // Deal with the somewhat strange encoding scheme for, and limits on, + // the shift amount. + let immh_immb = match (size, is_shr) { + (VectorSize::Size64x2, true) if imm >= 1 && imm <= 64 => { + 0b_1000_000_u32 | (64 - imm) + } + (VectorSize::Size32x4, true) if imm >= 1 && imm <= 32 => { + 0b_0100_000_u32 | (32 - imm) + } + (VectorSize::Size16x8, true) if imm >= 1 && imm <= 16 => { + 0b_0010_000_u32 | (16 - imm) + } + (VectorSize::Size8x16, true) if imm >= 1 && imm <= 8 => { + 0b_0001_000_u32 | (8 - imm) + } + (VectorSize::Size64x2, false) if imm <= 63 => 0b_1000_000_u32 | imm, + (VectorSize::Size32x4, false) if imm <= 31 => 0b_0100_000_u32 | imm, + (VectorSize::Size16x8, false) if imm <= 15 => 0b_0010_000_u32 | imm, + (VectorSize::Size8x16, false) if imm <= 7 => 0b_0001_000_u32 | imm, + _ => panic!( + "aarch64: Inst::VecShiftImm: emit: invalid op/size/imm {:?}, {:?}, {:?}", + op, size, imm + ), + }; + let rn_enc = machreg_to_vec(rn); + let rd_enc = machreg_to_vec(rd.to_reg()); + sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc); + } + &Inst::VecExtract { rd, rn, rm, imm4 } => { + if imm4 < 16 { + let template = 0b_01_101110_000_00000_0_0000_0_00000_00000_u32; + let rm_enc = machreg_to_vec(rm); + let rn_enc = machreg_to_vec(rn); + let rd_enc = machreg_to_vec(rd.to_reg()); + sink.put4( + template | (rm_enc << 16) | ((imm4 as u32) << 11) | (rn_enc << 5) | rd_enc, + ); + } else { + panic!( + "aarch64: Inst::VecExtract: emit: invalid extract index {}", + imm4 + ); + } + } + &Inst::VecTbl { + rd, + rn, + rm, + is_extension, + } => { + sink.put4(enc_tbl(is_extension, 0b00, rd, rn, rm)); + } + &Inst::VecTbl2 { + rd, + rn, + rn2, + rm, + is_extension, + } => { + assert_eq!(machreg_to_vec(rn2), (machreg_to_vec(rn) + 1) % 32); + sink.put4(enc_tbl(is_extension, 0b01, rd, rn, rm)); + } + &Inst::FpuCmp32 { rn, rm } => { + sink.put4(enc_fcmp(ScalarSize::Size32, rn, rm)); + } + &Inst::FpuCmp64 { rn, rm } => { + sink.put4(enc_fcmp(ScalarSize::Size64, rn, rm)); + } + &Inst::FpuToInt { op, rd, rn } => { + let top16 = match op { + // FCVTZS (32/32-bit) + FpuToIntOp::F32ToI32 => 0b000_11110_00_1_11_000, + // FCVTZU (32/32-bit) + FpuToIntOp::F32ToU32 => 0b000_11110_00_1_11_001, + // FCVTZS (32/64-bit) + FpuToIntOp::F32ToI64 => 0b100_11110_00_1_11_000, + // FCVTZU (32/64-bit) + FpuToIntOp::F32ToU64 => 0b100_11110_00_1_11_001, + // FCVTZS (64/32-bit) + FpuToIntOp::F64ToI32 => 0b000_11110_01_1_11_000, + // FCVTZU (64/32-bit) + FpuToIntOp::F64ToU32 => 0b000_11110_01_1_11_001, + // FCVTZS (64/64-bit) + FpuToIntOp::F64ToI64 => 0b100_11110_01_1_11_000, + // FCVTZU (64/64-bit) + FpuToIntOp::F64ToU64 => 0b100_11110_01_1_11_001, + }; + sink.put4(enc_fputoint(top16, rd, rn)); + } + &Inst::IntToFpu { op, rd, rn } => { + let top16 = match op { + // SCVTF (32/32-bit) + IntToFpuOp::I32ToF32 => 
0b000_11110_00_1_00_010, + // UCVTF (32/32-bit) + IntToFpuOp::U32ToF32 => 0b000_11110_00_1_00_011, + // SCVTF (64/32-bit) + IntToFpuOp::I64ToF32 => 0b100_11110_00_1_00_010, + // UCVTF (64/32-bit) + IntToFpuOp::U64ToF32 => 0b100_11110_00_1_00_011, + // SCVTF (32/64-bit) + IntToFpuOp::I32ToF64 => 0b000_11110_01_1_00_010, + // UCVTF (32/64-bit) + IntToFpuOp::U32ToF64 => 0b000_11110_01_1_00_011, + // SCVTF (64/64-bit) + IntToFpuOp::I64ToF64 => 0b100_11110_01_1_00_010, + // UCVTF (64/64-bit) + IntToFpuOp::U64ToF64 => 0b100_11110_01_1_00_011, + }; + sink.put4(enc_inttofpu(top16, rd, rn)); + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let inst = Inst::FpuLoad64 { + rd, + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }; + inst.emit(sink, emit_info, state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink, emit_info, state); + sink.put8(const_data); + } + &Inst::LoadFpuConst128 { rd, const_data } => { + let inst = Inst::FpuLoad128 { + rd, + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }; + inst.emit(sink, emit_info, state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(20), + }; + inst.emit(sink, emit_info, state); + + for i in const_data.to_le_bytes().iter() { + sink.put1(*i); + } + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size32)); + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, ScalarSize::Size64)); + } + &Inst::FpuRound { op, rd, rn } => { + let top22 = match op { + FpuRoundMode::Minus32 => 0b000_11110_00_1_001_010_10000, + FpuRoundMode::Minus64 => 0b000_11110_01_1_001_010_10000, + FpuRoundMode::Plus32 => 0b000_11110_00_1_001_001_10000, + FpuRoundMode::Plus64 => 0b000_11110_01_1_001_001_10000, + FpuRoundMode::Zero32 => 0b000_11110_00_1_001_011_10000, + FpuRoundMode::Zero64 => 0b000_11110_01_1_001_011_10000, + FpuRoundMode::Nearest32 => 0b000_11110_00_1_001_000_10000, + FpuRoundMode::Nearest64 => 0b000_11110_01_1_001_000_10000, + }; + sink.put4(enc_fround(top22, rd, rn)); + } + &Inst::MovToFpu { rd, rn, size } => { + let template = match size { + ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000, + ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000, + _ => unreachable!(), + }; + sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg())); + } + &Inst::MovToVec { rd, rn, idx, size } => { + let (imm5, shift) = match size.lane_size() { + ScalarSize::Size8 => (0b00001, 1), + ScalarSize::Size16 => (0b00010, 2), + ScalarSize::Size32 => (0b00100, 3), + ScalarSize::Size64 => (0b01000, 4), + _ => unreachable!(), + }; + debug_assert_eq!(idx & (0b11111 >> shift), idx); + let imm5 = imm5 | ((idx as u32) << shift); + sink.put4( + 0b010_01110000_00000_0_0011_1_00000_00000 + | (imm5 << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::MovFromVec { rd, rn, idx, size } => { + let (q, imm5, shift, mask) = match size { + VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111), + VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111), + VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011), + VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001), + _ => unreachable!(), + }; + debug_assert_eq!(idx & mask, idx); + let imm5 = imm5 | ((idx as u32) << shift); + sink.put4( + 0b000_01110000_00000_0_0111_1_00000_00000 + | (q << 30) + | (imm5 << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::MovFromVecSigned { + 
rd, + rn, + idx, + size, + scalar_size, + } => { + let (imm5, shift, half) = match size { + VectorSize::Size8x8 => (0b00001, 1, true), + VectorSize::Size8x16 => (0b00001, 1, false), + VectorSize::Size16x4 => (0b00010, 2, true), + VectorSize::Size16x8 => (0b00010, 2, false), + VectorSize::Size32x2 => { + debug_assert_ne!(scalar_size, OperandSize::Size32); + (0b00100, 3, true) + } + VectorSize::Size32x4 => { + debug_assert_ne!(scalar_size, OperandSize::Size32); + (0b00100, 3, false) + } + _ => panic!("Unexpected vector operand size"), + }; + debug_assert_eq!(idx & (0b11111 >> (half as u32 + shift)), idx); + let imm5 = imm5 | ((idx as u32) << shift); + sink.put4( + 0b000_01110000_00000_0_0101_1_00000_00000 + | (scalar_size.is64() as u32) << 30 + | (imm5 << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::VecDup { rd, rn, size } => { + let imm5 = match size { + VectorSize::Size8x16 => 0b00001, + VectorSize::Size16x8 => 0b00010, + VectorSize::Size32x4 => 0b00100, + VectorSize::Size64x2 => 0b01000, + _ => unimplemented!(), + }; + sink.put4( + 0b010_01110000_00000_000011_00000_00000 + | (imm5 << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::VecDupFromFpu { rd, rn, size } => { + let imm5 = match size { + VectorSize::Size32x4 => 0b00100, + VectorSize::Size64x2 => 0b01000, + _ => unimplemented!(), + }; + sink.put4( + 0b010_01110000_00000_000001_00000_00000 + | (imm5 << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::VecDupImm { + rd, + imm, + invert, + size, + } => { + let (imm, shift, shift_ones) = imm.value(); + let (op, cmode) = match size.lane_size() { + ScalarSize::Size8 => { + assert!(!invert); + assert_eq!(shift, 0); + + (0, 0b1110) + } + ScalarSize::Size16 => { + let s = shift & 8; + + assert!(!shift_ones); + assert_eq!(s, shift); + + (invert as u32, 0b1000 | (s >> 2)) + } + ScalarSize::Size32 => { + if shift_ones { + assert!(shift == 8 || shift == 16); + + (invert as u32, 0b1100 | (shift >> 4)) + } else { + let s = shift & 24; + + assert_eq!(s, shift); + + (invert as u32, 0b0000 | (s >> 2)) + } + } + ScalarSize::Size64 => { + assert!(!invert); + assert_eq!(shift, 0); + + (1, 0b1110) + } + _ => unreachable!(), + }; + let q_op = op | ((size.is_128bits() as u32) << 1); + + sink.put4(enc_asimd_mod_imm(rd, q_op, cmode, imm)); + } + &Inst::VecExtend { + t, + rd, + rn, + high_half, + } => { + let (u, immh) = match t { + VecExtendOp::Sxtl8 => (0b0, 0b001), + VecExtendOp::Sxtl16 => (0b0, 0b010), + VecExtendOp::Sxtl32 => (0b0, 0b100), + VecExtendOp::Uxtl8 => (0b1, 0b001), + VecExtendOp::Uxtl16 => (0b1, 0b010), + VecExtendOp::Uxtl32 => (0b1, 0b100), + }; + sink.put4( + 0b000_011110_0000_000_101001_00000_00000 + | ((high_half as u32) << 30) + | (u << 29) + | (immh << 19) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::VecMiscNarrow { + op, + rd, + rn, + size, + high_half, + } => { + let size = match size.lane_size() { + ScalarSize::Size8 => 0b00, + ScalarSize::Size16 => 0b01, + ScalarSize::Size32 => 0b10, + _ => panic!("Unexpected vector operand lane size!"), + }; + let (u, bits_12_16) = match op { + VecMiscNarrowOp::Xtn => (0b0, 0b10010), + VecMiscNarrowOp::Sqxtn => (0b0, 0b10100), + VecMiscNarrowOp::Sqxtun => (0b1, 0b10010), + }; + sink.put4(enc_vec_rr_misc( + ((high_half as u32) << 1) | u, + size, + bits_12_16, + rd, + rn, + )); + } + &Inst::VecMovElement { + rd, + rn, + dest_idx, + src_idx, + size, + } => { + let (imm5, shift) = match size.lane_size() { + 
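+                    // This is INS (element): `imm5` encodes the lane size together
+                    // with the destination index, and `imm4` (computed below)
+                    // encodes the source index.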
ScalarSize::Size8 => (0b00001, 1), + ScalarSize::Size16 => (0b00010, 2), + ScalarSize::Size32 => (0b00100, 3), + ScalarSize::Size64 => (0b01000, 4), + _ => unreachable!(), + }; + let mask = 0b11111 >> shift; + debug_assert_eq!(dest_idx & mask, dest_idx); + debug_assert_eq!(src_idx & mask, src_idx); + let imm4 = (src_idx as u32) << (shift - 1); + let imm5 = imm5 | ((dest_idx as u32) << shift); + sink.put4( + 0b011_01110000_00000_0_0000_1_00000_00000 + | (imm5 << 16) + | (imm4 << 11) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + size, + } => { + let (q, enc_size) = size.enc_size(); + let is_float = match alu_op { + VecALUOp::Fcmeq + | VecALUOp::Fcmgt + | VecALUOp::Fcmge + | VecALUOp::Fadd + | VecALUOp::Fsub + | VecALUOp::Fdiv + | VecALUOp::Fmax + | VecALUOp::Fmin + | VecALUOp::Fmul => true, + _ => false, + }; + let enc_float_size = match (is_float, size) { + (true, VectorSize::Size32x2) => 0b0, + (true, VectorSize::Size32x4) => 0b0, + (true, VectorSize::Size64x2) => 0b1, + (true, _) => unimplemented!(), + _ => 0, + }; + + let (top11, bit15_10) = match alu_op { + VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011), + VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011), + VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011), + VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011), + VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011), + VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111), + VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101), + VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101), + VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111), + VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001), + VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001), + VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001), + // The following logical instructions operate on bytes, so are not encoded differently + // for the different vector types. 
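+                    // (The vector-arrangement `q` bit, and the per-size bit for the
+                    // floating-point ops, are ORed into `top11` after this match.)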
+ VecALUOp::And => (0b000_01110_00_1, 0b000111), + VecALUOp::Bic => (0b000_01110_01_1, 0b000111), + VecALUOp::Orr => (0b000_01110_10_1, 0b000111), + VecALUOp::Eor => (0b001_01110_00_1, 0b000111), + VecALUOp::Bsl => (0b001_01110_01_1, 0b000111), + VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001), + VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001), + VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001), + VecALUOp::Mul => { + debug_assert_ne!(size, VectorSize::Size64x2); + (0b000_01110_00_1 | enc_size << 1, 0b100111) + } + VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001), + VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001), + VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011), + VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011), + VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001), + VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001), + VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101), + VecALUOp::Fadd => (0b000_01110_00_1, 0b110101), + VecALUOp::Fsub => (0b000_01110_10_1, 0b110101), + VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111), + VecALUOp::Fmax => (0b000_01110_00_1, 0b111101), + VecALUOp::Fmin => (0b000_01110_10_1, 0b111101), + VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), + VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), + VecALUOp::Umlal => { + debug_assert!(!size.is_128bits()); + (0b001_01110_00_1 | enc_size << 1, 0b100000) + } + VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110), + VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000), + VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000), + }; + let top11 = match alu_op { + VecALUOp::Smull | VecALUOp::Smull2 => top11, + _ if is_float => top11 | (q << 9) | enc_float_size << 1, + _ => top11 | (q << 9), + }; + sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); + } + &Inst::VecLoadReplicate { rd, rn, size } => { + let (q, size) = size.enc_size(); + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + sink.put4(enc_ldst_vec(q, size, rn, rd)); + } + &Inst::VecCSel { rd, rn, rm, cond } => { + /* Emit this: + b.cond else + mov rd, rm + b out + else: + mov rd, rn + out: + + Note, we could do better in the cases where rd == rn or rd == rm. 
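+
+               There is no conditional-select instruction that copies a full
+               128-bit vector register, hence the branch diamond around two
+               16-byte register moves.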
+ */ + let else_label = sink.get_label(); + let out_label = sink.get_label(); + + // b.cond else + let br_else_offset = sink.cur_offset(); + sink.put4(enc_conditional_br( + BranchTarget::Label(else_label), + CondBrKind::Cond(cond), + )); + sink.use_label_at_offset(br_else_offset, else_label, LabelUse::Branch19); + + // mov rd, rm + sink.put4(enc_vecmov(/* 16b = */ true, rd, rm)); + + // b out + let b_out_offset = sink.cur_offset(); + sink.use_label_at_offset(b_out_offset, out_label, LabelUse::Branch26); + sink.add_uncond_branch(b_out_offset, b_out_offset + 4, out_label); + sink.put4(enc_jump26(0b000101, 0 /* will be fixed up later */)); + + // else: + sink.bind_label(else_label); + + // mov rd, rn + sink.put4(enc_vecmov(/* 16b = */ true, rd, rn)); + + // out: + sink.bind_label(out_label); + } + &Inst::MovToNZCV { rn } => { + sink.put4(0xd51b4200 | machreg_to_gpr(rn)); + } + &Inst::MovFromNZCV { rd } => { + sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg())); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + let top22 = match (signed, from_bits, to_bits) { + (false, 8, 32) => 0b010_100110_0_000000_000111, // UXTB (32) + (false, 16, 32) => 0b010_100110_0_000000_001111, // UXTH (32) + (true, 8, 32) => 0b000_100110_0_000000_000111, // SXTB (32) + (true, 16, 32) => 0b000_100110_0_000000_001111, // SXTH (32) + // The 64-bit unsigned variants are the same as the 32-bit ones, + // because writes to Wn zero out the top 32 bits of Xn + (false, 8, 64) => 0b010_100110_0_000000_000111, // UXTB (64) + (false, 16, 64) => 0b010_100110_0_000000_001111, // UXTH (64) + (true, 8, 64) => 0b100_100110_1_000000_000111, // SXTB (64) + (true, 16, 64) => 0b100_100110_1_000000_001111, // SXTH (64) + // 32-to-64: the unsigned case is a 'mov' (special-cased below). + (false, 32, 64) => 0, // MOV + (true, 32, 64) => 0b100_100110_1_000000_011111, // SXTW (64) + _ => panic!( + "Unsupported extend combination: signed = {}, from_bits = {}, to_bits = {}", + signed, from_bits, to_bits + ), + }; + if top22 != 0 { + sink.put4(enc_extend(top22, rd, rn)); + } else { + Inst::mov32(rd, rn).emit(sink, emit_info, state); + } + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + assert!(to_bits <= 64); + // Reduce sign-extend-from-1-bit to: + // - and rd, rn, #1 + // - sub rd, zr, rd + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + let sub_inst = Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm: rd.to_reg(), + }; + sub_inst.emit(sink, emit_info, state); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && !signed => { + assert!(to_bits <= 64); + // Reduce zero-extend-from-1-bit to: + // - and rd, rn, #1 + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + } + &Inst::Extend { .. } => { + panic!("Unsupported extend variant"); + } + &Inst::Jump { ref dest } => { + let off = sink.cur_offset(); + // Indicate that the jump uses a label, if so, so that a fixup can occur later. + if let Some(l) = dest.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch26); + sink.add_uncond_branch(off, off + 4, l); + } + // Emit the jump itself. 
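+                // If the destination is a label, the 26-bit offset field is emitted
+                // as zero and later patched through the `Branch26` label use
+                // recorded above (a signed word offset, roughly +/-128 MiB of range).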
+ sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero())); + } + &Inst::Ret => { + sink.put4(0xd65f03c0); + } + &Inst::EpiloguePlaceholder => { + // Noop; this is just a placeholder for epilogues. + } + &Inst::Call { ref info } => { + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s); + } + let loc = state.cur_srcloc(); + sink.add_reloc(loc, Reloc::Arm64Call, &info.dest, 0); + sink.put4(enc_jump26(0b100101, 0)); + if info.opcode.is_call() { + sink.add_call_site(loc, info.opcode); + } + } + &Inst::CallInd { ref info } => { + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s); + } + sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(info.rn) << 5)); + let loc = state.cur_srcloc(); + if info.opcode.is_call() { + sink.add_call_site(loc, info.opcode); + } + } + &Inst::CondBr { + taken, + not_taken, + kind, + } => { + // Conditional part first. + let cond_off = sink.cur_offset(); + if let Some(l) = taken.as_label() { + sink.use_label_at_offset(cond_off, l, LabelUse::Branch19); + let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes(); + sink.add_cond_branch(cond_off, cond_off + 4, l, &inverted[..]); + } + sink.put4(enc_conditional_br(taken, kind)); + + // Unconditional part next. + let uncond_off = sink.cur_offset(); + if let Some(l) = not_taken.as_label() { + sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26); + sink.add_uncond_branch(uncond_off, uncond_off + 4, l); + } + sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero())); + } + &Inst::TrapIf { kind, trap_code } => { + // condbr KIND, LABEL + let off = sink.cur_offset(); + let label = sink.get_label(); + sink.put4(enc_conditional_br( + BranchTarget::Label(label), + kind.invert(), + )); + sink.use_label_at_offset(off, label, LabelUse::Branch19); + // udf + let trap = Inst::Udf { trap_code }; + trap.emit(sink, emit_info, state); + // LABEL: + sink.bind_label(label); + } + &Inst::IndirectBr { rn, .. } => { + sink.put4(enc_br(rn)); + } + &Inst::Nop0 => {} + &Inst::Nop4 => { + sink.put4(0xd503201f); + } + &Inst::Brk => { + sink.put4(0xd4200000); + } + &Inst::Udf { trap_code } => { + let srcloc = state.cur_srcloc(); + sink.add_trap(srcloc, trap_code); + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s); + } + sink.put4(0xd4a00000); + } + &Inst::Adr { rd, off } => { + assert!(off > -(1 << 20)); + assert!(off < (1 << 20)); + sink.put4(enc_adr(off, rd)); + } + &Inst::Word4 { data } => { + sink.put4(data); + } + &Inst::Word8 { data } => { + sink.put8(data); + } + &Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + ref info, + .. + } => { + // This sequence is *one* instruction in the vcode, and is expanded only here at + // emission time, because we cannot allow the regalloc to insert spills/reloads in + // the middle; we depend on hardcoded PC-rel addressing below. + + // Branch to default when condition code from prior comparison indicates. + let br = enc_conditional_br(info.default_target, CondBrKind::Cond(Cond::Hs)); + // No need to inform the sink's branch folding logic about this branch, because it + // will not be merged with any other branch, flipped, or elided (it is not preceded + // or succeeded by any other branch). Just emit it with the label use. 
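+                // Schematically, the emitted sequence is:
+                //
+                //   b.hs  <default>            ; flags set by the earlier bounds check
+                //   mov   rtmp2, ridx
+                //   adr   rtmp1, <jump_table>
+                //   ldrsw rtmp2, [rtmp1, rtmp2, uxtw #2]
+                //   add   rtmp1, rtmp1, rtmp2
+                //   br    rtmp1
+                // <jump_table>:
+                //   .word <target_i - jump_table>  ; one 32-bit entry per target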
+ let default_br_offset = sink.cur_offset(); + if let BranchTarget::Label(l) = info.default_target { + sink.use_label_at_offset(default_br_offset, l, LabelUse::Branch19); + } + sink.put4(br); + + // Save index in a tmp (the live range of ridx only goes to start of this + // sequence; rtmp1 or rtmp2 may overwrite it). + let inst = Inst::gen_move(rtmp2, ridx, I64); + inst.emit(sink, emit_info, state); + // Load address of jump table + let inst = Inst::Adr { rd: rtmp1, off: 16 }; + inst.emit(sink, emit_info, state); + // Load value out of jump table + let inst = Inst::SLoad32 { + rd: rtmp2, + mem: AMode::reg_plus_reg_scaled_extended( + rtmp1.to_reg(), + rtmp2.to_reg(), + I32, + ExtendOp::UXTW, + ), + flags: MemFlags::trusted(), + }; + inst.emit(sink, emit_info, state); + // Add base of jump table to jump-table-sourced block offset + let inst = Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: rtmp1, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + }; + inst.emit(sink, emit_info, state); + // Branch to computed address. (`targets` here is only used for successor queries + // and is not needed for emission.) + let inst = Inst::IndirectBr { + rn: rtmp1.to_reg(), + targets: vec![], + }; + inst.emit(sink, emit_info, state); + // Emit jump table (table of 32-bit offsets). + let jt_off = sink.cur_offset(); + for &target in info.targets.iter() { + let word_off = sink.cur_offset(); + // off_into_table is an addend here embedded in the label to be later patched + // at the end of codegen. The offset is initially relative to this jump table + // entry; with the extra addend, it'll be relative to the jump table's start, + // after patching. + let off_into_table = word_off - jt_off; + sink.use_label_at_offset( + word_off, + target.as_label().unwrap(), + LabelUse::PCRel32, + ); + sink.put4(off_into_table); + } + + // Lowering produces an EmitIsland before using a JTSequence, so we can safely + // disable the worst-case-size check in this case. 
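+                // Resetting `start_off` here means the data-dependent jump table is
+                // not counted against `worst_case_size` by the check at the end of
+                // this function.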
+ start_off = sink.cur_offset(); + } + &Inst::LoadExtName { + rd, + ref name, + offset, + } => { + let inst = Inst::ULoad64 { + rd, + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }; + inst.emit(sink, emit_info, state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink, emit_info, state); + let srcloc = state.cur_srcloc(); + sink.add_reloc(srcloc, Reloc::Abs8, name, offset); + if emit_info.flags().emit_all_ones_funcaddrs() { + sink.put8(u64::max_value()); + } else { + sink.put8(0); + } + } + &Inst::LoadAddr { rd, ref mem } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + + let (reg, index_reg, offset) = match mem { + AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0), + AMode::Unscaled(r, simm9) => (r, None, simm9.value()), + AMode::UnsignedOffset(r, uimm12scaled) => { + (r, None, uimm12scaled.value() as i32) + } + _ => panic!("Unsupported case for LoadAddr: {:?}", mem), + }; + let abs_offset = if offset < 0 { + -offset as u64 + } else { + offset as u64 + }; + let alu_op = if offset < 0 { + ALUOp::Sub64 + } else { + ALUOp::Add64 + }; + + if let Some((idx, extendop)) = index_reg { + let add = Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd, + rn: reg, + rm: idx, + extendop, + }; + + add.emit(sink, emit_info, state); + } else if offset == 0 { + if reg != rd.to_reg() { + let mov = Inst::mov(rd, reg); + + mov.emit(sink, emit_info, state); + } + } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) { + let add = Inst::AluRRImm12 { + alu_op, + rd, + rn: reg, + imm12, + }; + add.emit(sink, emit_info, state); + } else { + // Use `tmp2` here: `reg` may be `spilltmp` if the `AMode` on this instruction + // was initially an `SPOffset`. Assert that `tmp2` is truly free to use. Note + // that no other instructions will be inserted here (we're emitting directly), + // and a live range of `tmp2` should not span this instruction, so this use + // should otherwise be correct. 
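+                // The offset did not fit in an `Imm12`, so its absolute value is
+                // materialized into `tmp2` with `load_constant` and then added to or
+                // subtracted from the base, according to the `alu_op` chosen above.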
+ debug_assert!(rd.to_reg() != tmp2_reg()); + debug_assert!(reg != tmp2_reg()); + let tmp = writable_tmp2_reg(); + for insn in Inst::load_constant(tmp, abs_offset).into_iter() { + insn.emit(sink, emit_info, state); + } + let add = Inst::AluRRR { + alu_op, + rd, + rn: reg, + rm: tmp.to_reg(), + }; + add.emit(sink, emit_info, state); + } + } + &Inst::VirtualSPOffsetAdj { offset } => { + debug!( + "virtual sp offset adjusted by {} -> {}", + offset, + state.virtual_sp_offset + offset, + ); + state.virtual_sp_offset += offset; + } + &Inst::EmitIsland { needed_space } => { + if sink.island_needed(needed_space + 4) { + let jump_around_label = sink.get_label(); + let jmp = Inst::Jump { + dest: BranchTarget::Label(jump_around_label), + }; + jmp.emit(sink, emit_info, state); + sink.emit_island(); + sink.bind_label(jump_around_label); + } + } + } + + let end_off = sink.cur_offset(); + debug_assert!((end_off - start_off) <= Inst::worst_case_size()); + + state.clear_post_insn(); + } + + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String { + self.print_with_state(mb_rru, state) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs new file mode 100644 index 0000000000..eb31963b5d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/emit_tests.rs @@ -0,0 +1,5143 @@ +use crate::ir::types::*; +use crate::isa::aarch64::inst::*; +use crate::isa::test_utils; +use crate::isa::CallConv; +use crate::settings; + +use alloc::boxed::Box; +use alloc::vec::Vec; + +#[test] +fn test_aarch64_binemit() { + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // N.B.: the architecture is little-endian, so when transcribing the 32-bit + // hex instructions from e.g. objdump disassembly, one must swap the bytes + // seen below. (E.g., a `ret` is normally written as the u32 `D65F03C0`, + // but we write it here as C0035FD6.) 
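+    //
+    // For example, `add w1, w2, w3` is the u32 0x0B030041, which appears in the
+    // expected output below as the little-endian byte string "4100030B".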
+ + // Useful helper script to produce the encodings from the text: + // + // #!/bin/sh + // tmp=`mktemp /tmp/XXXXXXXX.o` + // aarch64-linux-gnu-as /dev/stdin -o $tmp + // aarch64-linux-gnu-objdump -d $tmp + // rm -f $tmp + // + // Then: + // + // $ echo "mov x1, x2" | aarch64inst.sh + insns.push((Inst::Ret, "C0035FD6", "ret")); + insns.push((Inst::Nop0, "", "nop-zero-len")); + insns.push((Inst::Nop4, "1F2003D5", "nop")); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030B", + "add w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068B", + "add x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100034B", + "sub w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CB", + "sub x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032A", + "orr w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AA", + "orr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030A", + "and w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068A", + "and x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: xreg(2), + rm: xreg(3), + }, + "5F00036B", + // TODO: Display as cmp + "subs wzr, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100036B", + "subs w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006EB", + "subs x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032B", + "adds w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AB", + "adds x4, x5, x6", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::AddS64, + rd: writable_zero_reg(), + rn: xreg(5), + imm12: Imm12::maybe_from_u64(1).unwrap(), + }, + "BF0400B1", + // TODO: Display as cmn. 
+ "adds xzr, x5, #1", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40CC69A", + "sdiv x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::UDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A408C69A", + "udiv x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400064A", + "eor w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CA", + "eor x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400260A", + "bic w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400268A", + "bic x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400262A", + "orn w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026AA", + "orn x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400264A", + "eon w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026CA", + "eon x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC61A", + "ror w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC69A", + "ror x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C61A", + "lsr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C69A", + "lsr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C61A", + "asr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C69A", + "asr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C61A", + "lsl w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C69A", + "lsl x4, x5, x6", + )); + + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0411", + "add w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: true, + }, + }, + "078D4411", + "add w7, w8, #1191936", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0491", + "add x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: writable_xreg(7), + rn: xreg(8), + 
imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0451", + "sub w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04D1", + "sub x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0471", + "subs w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04F1", + "subs x7, x8, #291", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + rm: xreg(9), + extendop: ExtendOp::SXTB, + }, + "0781290B", + "add w7, w8, w9, SXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: writable_xreg(15), + rn: xreg(16), + rm: xreg(17), + extendop: ExtendOp::UXTB, + }, + "0F02318B", + "add x15, x16, x17, UXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + extendop: ExtendOp::SXTH, + }, + "41A0234B", + "sub w1, w2, w3, SXTH", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_xreg(20), + rn: xreg(21), + rm: xreg(22), + extendop: ExtendOp::UXTW, + }, + "B44236CB", + "sub x20, x21, x22, UXTW", + )); + + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(20).unwrap(), + ), + }, + "6A510C0B", + "add w10, w11, w12, LSL 20", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::ASR, + ShiftOpShiftImm::maybe_from_shift(42).unwrap(), + ), + }, + "6AA98C8B", + "add x10, x11, x12, ASR 42", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4B", + "sub w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCB", + "sub x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2A", + "orr w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAA", + "orr x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C0A", + "and w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And64, + rd: writable_xreg(10), + rn: xreg(11), + rm: 
xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C8A", + "and x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4A", + "eor w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCA", + "eor x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C2A", + "orn w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CAA", + "orn x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C0A", + "bic w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C8A", + "bic x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C4A", + "eon w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CCA", + "eon x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2B", + "adds w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAB", + "adds x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C6B", + "subs w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CEB", + "subs x10, x11, x12, LSL 23", + )); + + insns.push(( + Inst::AluRRRExtend { + 
alu_op: ALUOp::SubS64, + rd: writable_zero_reg(), + rn: stack_reg(), + rm: xreg(12), + extendop: ExtendOp::UXTX, + }, + "FF632CEB", + "subs xzr, sp, x12, UXTX", + )); + + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp3::MAdd32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110031B", + "madd w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110039B", + "madd x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp3::MSub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190031B", + "msub w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp3::MSub64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190039B", + "msub x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "417C439B", + "smulh x1, x2, x3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::UMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "417CC39B", + "umulh x1, x2, x3", + )); + + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::RotR32, + rd: writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(19).unwrap(), + }, + "B44E9513", + "ror w20, w21, #19", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::RotR64, + rd: writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(42).unwrap(), + }, + "B4AAD593", + "ror x20, x21, #42", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(13).unwrap(), + }, + "6A7D0D53", + "lsr w10, w11, #13", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(57).unwrap(), + }, + "6AFD79D3", + "lsr x10, x11, #57", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(7).unwrap(), + }, + "A47C0713", + "asr w4, w5, #7", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(35).unwrap(), + }, + "A4FC6393", + "asr x4, x5, #35", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(24).unwrap(), + }, + "281D0853", + "lsl w8, w9, #24", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(63).unwrap(), + }, + "280141D3", + "lsl x8, x9, #63", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(0).unwrap(), + }, + "6A7D0053", + "lsl w10, w11, #0", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(0).unwrap(), + }, + "6AFD40D3", + "lsl x10, x11, #0", + )); + + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And32, + rd: writable_xreg(21), + rn: xreg(27), + imml: ImmLogic::maybe_from_u64(0x80003fff, I32).unwrap(), + }, + "753B0112", + "and w21, w27, #2147500031", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: writable_xreg(7), + rn: xreg(6), + imml: 
ImmLogic::maybe_from_u64(0x3fff80003fff800, I64).unwrap(), + }, + "C7381592", + "and x7, x6, #288221580125796352", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x100000, I32).unwrap(), + }, + "A1000C32", + "orr w1, w5, #1048576", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "A4C401B2", + "orr x4, x5, #9331882296111890817", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x00007fff, I32).unwrap(), + }, + "A1380052", + "eor w1, w5, #32767", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(8), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "0AC501D2", + "eor x10, x8, #9331882296111890817", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit32, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C05A", + "rbit w1, w10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit64, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C0DA", + "rbit x1, x10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz32, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C05A", + "clz w15, w3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz64, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C0DA", + "clz x15, x3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls32, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C05A", + "cls w21, w16", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls64, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C0DA", + "cls x21, x16", + )); + + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "41004038", + "ldurb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::zero(I8)), + flags: MemFlags::trusted(), + }, + "41004039", + "ldrb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: AMode::RegReg(xreg(2), xreg(5)), + flags: MemFlags::trusted(), + }, + "41686538", + "ldrb w1, [x2, x5]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "41008038", + "ldursb x1, [x2]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(63, I8).unwrap()), + flags: MemFlags::trusted(), + }, + "41FC8039", + "ldrsb x1, [x2, #63]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: AMode::RegReg(xreg(2), xreg(5)), + flags: MemFlags::trusted(), + }, + "4168A538", + "ldrsb x1, [x2, x5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(5).unwrap()), + flags: MemFlags::trusted(), + }, + "41504078", + "ldurh w1, [x2, #5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8, I16).unwrap()), + flags: MemFlags::trusted(), + }, + "41104079", + "ldrh w1, [x2, #8]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: AMode::RegScaled(xreg(2), xreg(3), I16), + flags: MemFlags::trusted(), + }, + "41786378", + "ldrh w1, [x2, x3, 
LSL #1]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "41008078", + "ldursh x1, [x2]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: AMode::UnsignedOffset(xreg(20), UImm12Scaled::maybe_from_i64(24, I16).unwrap()), + flags: MemFlags::trusted(), + }, + "9C328079", + "ldrsh x28, [x20, #24]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: AMode::RegScaled(xreg(20), xreg(20), I16), + flags: MemFlags::trusted(), + }, + "9C7AB478", + "ldrsh x28, [x20, x20, LSL #1]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "410040B8", + "ldur w1, [x2]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(12), + mem: AMode::UnsignedOffset(xreg(0), UImm12Scaled::maybe_from_i64(204, I32).unwrap()), + flags: MemFlags::trusted(), + }, + "0CCC40B9", + "ldr w12, [x0, #204]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: AMode::RegScaled(xreg(2), xreg(12), I32), + flags: MemFlags::trusted(), + }, + "41786CB8", + "ldr w1, [x2, x12, LSL #2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "410080B8", + "ldursw x1, [x2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(12), + mem: AMode::UnsignedOffset(xreg(1), UImm12Scaled::maybe_from_i64(16380, I32).unwrap()), + flags: MemFlags::trusted(), + }, + "2CFCBFB9", + "ldrsw x12, [x1, #16380]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: AMode::RegScaled(xreg(5), xreg(1), I32), + flags: MemFlags::trusted(), + }, + "A178A1B8", + "ldrsw x1, [x5, x1, LSL #2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "410040F8", + "ldur x1, [x2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(-256).unwrap()), + flags: MemFlags::trusted(), + }, + "410050F8", + "ldur x1, [x2, #-256]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::maybe_from_i64(255).unwrap()), + flags: MemFlags::trusted(), + }, + "41F04FF8", + "ldur x1, [x2, #255]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(32760, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "41FC7FF9", + "ldr x1, [x2, #32760]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegReg(xreg(2), xreg(3)), + flags: MemFlags::trusted(), + }, + "416863F8", + "ldr x1, [x2, x3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegScaled(xreg(2), xreg(3), I64), + flags: MemFlags::trusted(), + }, + "417863F8", + "ldr x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::SXTW), + flags: MemFlags::trusted(), + }, + "41D863F8", + "ldr x1, [x2, w3, SXTW #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegExtended(xreg(2), xreg(3), ExtendOp::SXTW), + flags: MemFlags::trusted(), + }, + "41C863F8", + "ldr x1, [x2, w3, SXTW]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::Label(MemLabel::PCRel(64)), + flags: 
MemFlags::trusted(), + }, + "01020058", + "ldr x1, pc+64", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + flags: MemFlags::trusted(), + }, + "410C41F8", + "ldr x1, [x2, #16]!", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + flags: MemFlags::trusted(), + }, + "410441F8", + "ldr x1, [x2], #16", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::FPOffset(32768, I8), + flags: MemFlags::trusted(), + }, + "100090D2B063308B010240F9", + "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::FPOffset(-32768, I8), + flags: MemFlags::trusted(), + }, + "F0FF8F92B063308B010240F9", + "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::FPOffset(1048576, I8), // 2^20 + flags: MemFlags::trusted(), + }, + "1002A0D2B063308B010240F9", + "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::FPOffset(1048576 + 1, I8), // 2^20 + 1 + flags: MemFlags::trusted(), + }, + "300080521002A072B063308B010240F9", + "movz w16, #1 ; movk w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]", + )); + + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegOffset(xreg(7), 8, I64), + flags: MemFlags::trusted(), + }, + "E18040F8", + "ldur x1, [x7, #8]", + )); + + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegOffset(xreg(7), 1024, I64), + flags: MemFlags::trusted(), + }, + "E10042F9", + "ldr x1, [x7, #1024]", + )); + + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: AMode::RegOffset(xreg(7), 1048576, I64), + flags: MemFlags::trusted(), + }, + "1002A0D2F060308B010240F9", + "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]", + )); + + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "41000038", + "sturb w1, [x2]", + )); + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(4095, I8).unwrap()), + flags: MemFlags::trusted(), + }, + "41FC3F39", + "strb w1, [x2, #4095]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "41000078", + "sturh w1, [x2]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8190, I16).unwrap()), + flags: MemFlags::trusted(), + }, + "41FC3F79", + "strh w1, [x2, #8190]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "410000B8", + "stur w1, [x2]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(16380, I32).unwrap()), + flags: MemFlags::trusted(), + }, + "41FC3FB9", + "str w1, [x2, #16380]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::Unscaled(xreg(2), SImm9::zero()), + flags: MemFlags::trusted(), + }, + "410000F8", + "stur x1, [x2]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(32760, I64).unwrap()), + 
flags: MemFlags::trusted(), + }, + "41FC3FF9", + "str x1, [x2, #32760]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::RegReg(xreg(2), xreg(3)), + flags: MemFlags::trusted(), + }, + "416823F8", + "str x1, [x2, x3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::RegScaled(xreg(2), xreg(3), I64), + flags: MemFlags::trusted(), + }, + "417823F8", + "str x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::UXTW), + flags: MemFlags::trusted(), + }, + "415823F8", + "str x1, [x2, w3, UXTW #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::RegExtended(xreg(2), xreg(3), ExtendOp::UXTW), + flags: MemFlags::trusted(), + }, + "414823F8", + "str x1, [x2, w3, UXTW]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + flags: MemFlags::trusted(), + }, + "410C01F8", + "str x1, [x2, #16]!", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: AMode::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + flags: MemFlags::trusted(), + }, + "410401F8", + "str x1, [x2], #16", + )); + + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + flags: MemFlags::trusted(), + }, + "482500A9", + "stp x8, x9, [x10]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(504, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "48A51FA9", + "stp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(-64, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "48253CA9", + "stp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(21), + rt2: xreg(28), + mem: PairAMode::SignedOffset(xreg(1), SImm7Scaled::maybe_from_i64(-512, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "357020A9", + "stp x21, x28, [x1, #-512]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairAMode::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "4825BCA9", + "stp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(15), + rt2: xreg(16), + mem: PairAMode::PostIndexed( + writable_xreg(20), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "8FC29FA8", + "stp x15, x16, [x20], #504", + )); + + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + flags: MemFlags::trusted(), + }, + "482540A9", + "ldp x8, x9, [x10]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(504, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "48A55FA9", + "ldp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairAMode::SignedOffset(xreg(10), SImm7Scaled::maybe_from_i64(-64, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "48257CA9", + "ldp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairAMode::SignedOffset(xreg(10), 
SImm7Scaled::maybe_from_i64(-512, I64).unwrap()), + flags: MemFlags::trusted(), + }, + "482560A9", + "ldp x8, x9, [x10, #-512]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairAMode::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "4825FCA9", + "ldp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(25), + mem: PairAMode::PostIndexed( + writable_xreg(12), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + flags: MemFlags::trusted(), + }, + "88E5DFA8", + "ldp x8, x25, [x12], #504", + )); + + insns.push(( + Inst::Mov64 { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E80309AA", + "mov x8, x9", + )); + insns.push(( + Inst::Mov32 { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E803092A", + "mov w8, w9", + )); + + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + size: OperandSize::Size64, + }, + "E8FF9FD2", + "movz x8, #65535", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFBFD2", + "movz x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFDFD2", + "movz x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFFFD2", + "movz x8, #65535, LSL #48", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + size: OperandSize::Size32, + }, + "E8FFBF52", + "movz w8, #65535, LSL #16", + )); + + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + size: OperandSize::Size64, + }, + "E8FF9F92", + "movn x8, #65535", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFBF92", + "movn x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFDF92", + "movn x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFFF92", + "movn x8, #65535, LSL #48", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + size: OperandSize::Size32, + }, + "E8FF9F12", + "movn w8, #65535", + )); + + insns.push(( + Inst::MovK { + rd: writable_xreg(12), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "0C0080F2", + "movk x12, #0", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(19), + imm: MoveWideConst::maybe_with_shift(0x0000, 16).unwrap(), + size: OperandSize::Size64, + }, + "1300A0F2", + "movk x19, #0, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(3), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + size: OperandSize::Size64, + }, + "E3FF9FF2", + "movk x3, #65535", + 
)); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFBFF2", + "movk x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFDFF2", + "movk x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + size: OperandSize::Size64, + }, + "E8FFFFF2", + "movk x8, #65535, LSL #48", + )); + + insns.push(( + Inst::CSel { + rd: writable_xreg(10), + rn: xreg(12), + rm: xreg(14), + cond: Cond::Hs, + }, + "8A218E9A", + "csel x10, x12, x14, hs", + )); + insns.push(( + Inst::CSet { + rd: writable_xreg(15), + cond: Cond::Ge, + }, + "EFB79F9A", + "cset x15, ge", + )); + insns.push(( + Inst::CCmpImm { + size: OperandSize::Size64, + rn: xreg(22), + imm: UImm5::maybe_from_u8(5).unwrap(), + nzcv: NZCV::new(false, false, true, true), + cond: Cond::Eq, + }, + "C30A45FA", + "ccmp x22, #5, #nzCV, eq", + )); + insns.push(( + Inst::CCmpImm { + size: OperandSize::Size32, + rn: xreg(3), + imm: UImm5::maybe_from_u8(30).unwrap(), + nzcv: NZCV::new(true, true, true, true), + cond: Cond::Gt, + }, + "6FC85E7A", + "ccmp w3, #30, #NZCV, gt", + )); + insns.push(( + Inst::MovToFpu { + rd: writable_vreg(31), + rn: xreg(0), + size: ScalarSize::Size64, + }, + "1F00679E", + "fmov d31, x0", + )); + insns.push(( + Inst::MovToFpu { + rd: writable_vreg(1), + rn: xreg(28), + size: ScalarSize::Size32, + }, + "8103271E", + "fmov s1, w28", + )); + insns.push(( + Inst::MovToVec { + rd: writable_vreg(0), + rn: xreg(0), + idx: 7, + size: VectorSize::Size8x8, + }, + "001C0F4E", + "mov v0.b[7], w0", + )); + insns.push(( + Inst::MovToVec { + rd: writable_vreg(20), + rn: xreg(21), + idx: 0, + size: VectorSize::Size64x2, + }, + "B41E084E", + "mov v20.d[0], x21", + )); + insns.push(( + Inst::MovFromVec { + rd: writable_xreg(3), + rn: vreg(27), + idx: 14, + size: VectorSize::Size8x16, + }, + "633F1D0E", + "umov w3, v27.b[14]", + )); + insns.push(( + Inst::MovFromVec { + rd: writable_xreg(24), + rn: vreg(5), + idx: 3, + size: VectorSize::Size16x8, + }, + "B83C0E0E", + "umov w24, v5.h[3]", + )); + insns.push(( + Inst::MovFromVec { + rd: writable_xreg(12), + rn: vreg(17), + idx: 1, + size: VectorSize::Size32x4, + }, + "2C3E0C0E", + "mov w12, v17.s[1]", + )); + insns.push(( + Inst::MovFromVec { + rd: writable_xreg(21), + rn: vreg(20), + idx: 0, + size: VectorSize::Size64x2, + }, + "953E084E", + "mov x21, v20.d[0]", + )); + insns.push(( + Inst::MovFromVecSigned { + rd: writable_xreg(0), + rn: vreg(0), + idx: 15, + size: VectorSize::Size8x16, + scalar_size: OperandSize::Size32, + }, + "002C1F0E", + "smov w0, v0.b[15]", + )); + insns.push(( + Inst::MovFromVecSigned { + rd: writable_xreg(12), + rn: vreg(13), + idx: 7, + size: VectorSize::Size8x8, + scalar_size: OperandSize::Size64, + }, + "AC2D0F4E", + "smov x12, v13.b[7]", + )); + insns.push(( + Inst::MovFromVecSigned { + rd: writable_xreg(23), + rn: vreg(31), + idx: 7, + size: VectorSize::Size16x8, + scalar_size: OperandSize::Size32, + }, + "F72F1E0E", + "smov w23, v31.h[7]", + )); + insns.push(( + Inst::MovFromVecSigned { + rd: writable_xreg(24), + rn: vreg(5), + idx: 1, + size: VectorSize::Size32x2, + scalar_size: OperandSize::Size64, + }, + "B82C0C4E", + "smov x24, v5.s[1]", + )); + insns.push(( + Inst::MovToNZCV { rn: xreg(13) }, + "0D421BD5", + "msr nzcv, 
x13", + )); + insns.push(( + Inst::MovFromNZCV { + rd: writable_xreg(27), + }, + "1B423BD5", + "mrs x27, nzcv", + )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(25), + rn: xreg(7), + size: VectorSize::Size8x16, + }, + "F90C014E", + "dup v25.16b, w7", + )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(2), + rn: xreg(23), + size: VectorSize::Size16x8, + }, + "E20E024E", + "dup v2.8h, w23", + )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(0), + rn: xreg(28), + size: VectorSize::Size32x4, + }, + "800F044E", + "dup v0.4s, w28", + )); + insns.push(( + Inst::VecDup { + rd: writable_vreg(31), + rn: xreg(5), + size: VectorSize::Size64x2, + }, + "BF0C084E", + "dup v31.2d, x5", + )); + insns.push(( + Inst::VecDupFromFpu { + rd: writable_vreg(14), + rn: vreg(19), + size: VectorSize::Size32x4, + }, + "6E06044E", + "dup v14.4s, v19.s[0]", + )); + insns.push(( + Inst::VecDupFromFpu { + rd: writable_vreg(18), + rn: vreg(10), + size: VectorSize::Size64x2, + }, + "5205084E", + "dup v18.2d, v10.d[0]", + )); + insns.push(( + Inst::VecDupImm { + rd: writable_vreg(31), + imm: ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap(), + invert: false, + size: VectorSize::Size8x16, + }, + "FFE7074F", + "movi v31.16b, #255", + )); + insns.push(( + Inst::VecDupImm { + rd: writable_vreg(0), + imm: ASIMDMovModImm::zero(), + invert: true, + size: VectorSize::Size16x4, + }, + "0084002F", + "mvni v0.4h, #0", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl8, + rd: writable_vreg(4), + rn: vreg(27), + high_half: false, + }, + "64A7080F", + "sxtl v4.8h, v27.8b", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl16, + rd: writable_vreg(17), + rn: vreg(19), + high_half: true, + }, + "71A6104F", + "sxtl2 v17.4s, v19.8h", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Sxtl32, + rd: writable_vreg(30), + rn: vreg(6), + high_half: false, + }, + "DEA4200F", + "sxtl v30.2d, v6.2s", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl8, + rd: writable_vreg(3), + rn: vreg(29), + high_half: true, + }, + "A3A7086F", + "uxtl2 v3.8h, v29.16b", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl16, + rd: writable_vreg(15), + rn: vreg(12), + high_half: false, + }, + "8FA5102F", + "uxtl v15.4s, v12.4h", + )); + insns.push(( + Inst::VecExtend { + t: VecExtendOp::Uxtl32, + rd: writable_vreg(28), + rn: vreg(2), + high_half: true, + }, + "5CA4206F", + "uxtl2 v28.2d, v2.4s", + )); + + insns.push(( + Inst::VecMovElement { + rd: writable_vreg(0), + rn: vreg(31), + dest_idx: 7, + src_idx: 7, + size: VectorSize::Size16x8, + }, + "E0771E6E", + "mov v0.h[7], v31.h[7]", + )); + + insns.push(( + Inst::VecMovElement { + rd: writable_vreg(31), + rn: vreg(16), + dest_idx: 1, + src_idx: 0, + size: VectorSize::Size32x2, + }, + "1F060C6E", + "mov v31.s[1], v16.s[0]", + )); + + insns.push(( + Inst::VecMiscNarrow { + op: VecMiscNarrowOp::Xtn, + rd: writable_vreg(22), + rn: vreg(8), + size: VectorSize::Size32x2, + high_half: false, + }, + "1629A10E", + "xtn v22.2s, v8.2d", + )); + + insns.push(( + Inst::VecMiscNarrow { + op: VecMiscNarrowOp::Sqxtn, + rd: writable_vreg(31), + rn: vreg(0), + size: VectorSize::Size16x8, + high_half: true, + }, + "1F48614E", + "sqxtn2 v31.8h, v0.4s", + )); + + insns.push(( + Inst::VecMiscNarrow { + op: VecMiscNarrowOp::Sqxtun, + rd: writable_vreg(16), + rn: vreg(23), + size: VectorSize::Size8x16, + high_half: false, + }, + "F02A212E", + "sqxtun v16.8b, v23.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: 
writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "410C284E", + "sqadd v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + size: VectorSize::Size16x8, + }, + "810D7C4E", + "sqadd v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "4C0CA64E", + "sqadd v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + size: VectorSize::Size64x2, + }, + "F40CED4E", + "sqadd v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "412C284E", + "sqsub v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + size: VectorSize::Size16x8, + }, + "812D7C4E", + "sqsub v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "4C2CA64E", + "sqsub v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + size: VectorSize::Size64x2, + }, + "F42CED4E", + "sqsub v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "410C286E", + "uqadd v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + size: VectorSize::Size16x8, + }, + "810D7C6E", + "uqadd v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "4C0CA66E", + "uqadd v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + size: VectorSize::Size64x2, + }, + "F40CED6E", + "uqadd v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "412C286E", + "uqsub v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + size: VectorSize::Size16x8, + }, + "812D7C6E", + "uqsub v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "4C2CA66E", + "uqsub v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + size: VectorSize::Size64x2, + }, + "F42CED6E", + "uqsub v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmeq, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size8x16, + }, + "E38E386E", + "cmeq v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmgt, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size8x16, + 
}, + "E336384E", + "cmgt v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmge, + rd: writable_vreg(23), + rn: vreg(9), + rm: vreg(12), + size: VectorSize::Size8x16, + }, + "373D2C4E", + "cmge v23.16b, v9.16b, v12.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhi, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "2534216E", + "cmhi v5.16b, v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhs, + rd: writable_vreg(8), + rn: vreg(2), + rm: vreg(15), + size: VectorSize::Size8x16, + }, + "483C2F6E", + "cmhs v8.16b, v2.16b, v15.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmeq, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size16x8, + }, + "E38E786E", + "cmeq v3.8h, v23.8h, v24.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmgt, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size16x8, + }, + "E336784E", + "cmgt v3.8h, v23.8h, v24.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmge, + rd: writable_vreg(23), + rn: vreg(9), + rm: vreg(12), + size: VectorSize::Size16x8, + }, + "373D6C4E", + "cmge v23.8h, v9.8h, v12.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhi, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + size: VectorSize::Size16x8, + }, + "2534616E", + "cmhi v5.8h, v1.8h, v1.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhs, + rd: writable_vreg(8), + rn: vreg(2), + rm: vreg(15), + size: VectorSize::Size16x8, + }, + "483C6F6E", + "cmhs v8.8h, v2.8h, v15.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmeq, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size32x4, + }, + "E38EB86E", + "cmeq v3.4s, v23.4s, v24.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmgt, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + size: VectorSize::Size32x4, + }, + "E336B84E", + "cmgt v3.4s, v23.4s, v24.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmge, + rd: writable_vreg(23), + rn: vreg(9), + rm: vreg(12), + size: VectorSize::Size32x4, + }, + "373DAC4E", + "cmge v23.4s, v9.4s, v12.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhi, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + size: VectorSize::Size32x4, + }, + "2534A16E", + "cmhi v5.4s, v1.4s, v1.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhs, + rd: writable_vreg(8), + rn: vreg(2), + rm: vreg(15), + size: VectorSize::Size32x4, + }, + "483CAF6E", + "cmhs v8.4s, v2.4s, v15.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fcmeq, + rd: writable_vreg(28), + rn: vreg(12), + rm: vreg(4), + size: VectorSize::Size32x2, + }, + "9CE5240E", + "fcmeq v28.2s, v12.2s, v4.2s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fcmgt, + rd: writable_vreg(3), + rn: vreg(16), + rm: vreg(31), + size: VectorSize::Size64x2, + }, + "03E6FF6E", + "fcmgt v3.2d, v16.2d, v31.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fcmge, + rd: writable_vreg(18), + rn: vreg(23), + rm: vreg(0), + size: VectorSize::Size64x2, + }, + "F2E6606E", + "fcmge v18.2d, v23.2d, v0.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::And, + rd: writable_vreg(20), + rn: vreg(19), + rm: vreg(18), + size: VectorSize::Size32x4, + }, + "741E324E", + "and v20.16b, v19.16b, v18.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: 
VecALUOp::Bic, + rd: writable_vreg(8), + rn: vreg(11), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "681D614E", + "bic v8.16b, v11.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Orr, + rd: writable_vreg(15), + rn: vreg(2), + rm: vreg(12), + size: VectorSize::Size16x8, + }, + "4F1CAC4E", + "orr v15.16b, v2.16b, v12.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Eor, + rd: writable_vreg(18), + rn: vreg(3), + rm: vreg(22), + size: VectorSize::Size8x16, + }, + "721C366E", + "eor v18.16b, v3.16b, v22.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Bsl, + rd: writable_vreg(8), + rn: vreg(9), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "281D616E", + "bsl v8.16b, v9.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umaxp, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "88A5216E", + "umaxp v8.16b, v12.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umaxp, + rd: writable_vreg(1), + rn: vreg(6), + rm: vreg(1), + size: VectorSize::Size16x8, + }, + "C1A4616E", + "umaxp v1.8h, v6.8h, v1.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umaxp, + rd: writable_vreg(1), + rn: vreg(20), + rm: vreg(16), + size: VectorSize::Size32x4, + }, + "81A6B06E", + "umaxp v1.4s, v20.4s, v16.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Add, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "2584214E", + "add v5.16b, v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Add, + rd: writable_vreg(7), + rn: vreg(13), + rm: vreg(2), + size: VectorSize::Size16x8, + }, + "A785624E", + "add v7.8h, v13.8h, v2.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Add, + rd: writable_vreg(18), + rn: vreg(9), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "3285A64E", + "add v18.4s, v9.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Add, + rd: writable_vreg(1), + rn: vreg(3), + rm: vreg(2), + size: VectorSize::Size64x2, + }, + "6184E24E", + "add v1.2d, v3.2d, v2.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sub, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "2584216E", + "sub v5.16b, v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sub, + rd: writable_vreg(7), + rn: vreg(13), + rm: vreg(2), + size: VectorSize::Size16x8, + }, + "A785626E", + "sub v7.8h, v13.8h, v2.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sub, + rd: writable_vreg(18), + rn: vreg(9), + rm: vreg(6), + size: VectorSize::Size32x4, + }, + "3285A66E", + "sub v18.4s, v9.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sub, + rd: writable_vreg(18), + rn: vreg(0), + rm: vreg(8), + size: VectorSize::Size64x2, + }, + "1284E86E", + "sub v18.2d, v0.2d, v8.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd: writable_vreg(25), + rn: vreg(9), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "399D284E", + "mul v25.16b, v9.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd: writable_vreg(30), + rn: vreg(30), + rm: vreg(12), + size: VectorSize::Size16x8, + }, + "DE9F6C4E", + "mul v30.8h, v30.8h, v12.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd: writable_vreg(18), + rn: vreg(18), + rm: vreg(18), + size: VectorSize::Size32x4, + }, + "529EB24E", + "mul v18.4s, 
v18.4s, v18.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Ushl, + rd: writable_vreg(18), + rn: vreg(18), + rm: vreg(18), + size: VectorSize::Size8x16, + }, + "5246326E", + "ushl v18.16b, v18.16b, v18.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Ushl, + rd: writable_vreg(18), + rn: vreg(18), + rm: vreg(18), + size: VectorSize::Size16x8, + }, + "5246726E", + "ushl v18.8h, v18.8h, v18.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Ushl, + rd: writable_vreg(18), + rn: vreg(1), + rm: vreg(21), + size: VectorSize::Size32x4, + }, + "3244B56E", + "ushl v18.4s, v1.4s, v21.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Ushl, + rd: writable_vreg(5), + rn: vreg(7), + rm: vreg(19), + size: VectorSize::Size64x2, + }, + "E544F36E", + "ushl v5.2d, v7.2d, v19.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sshl, + rd: writable_vreg(18), + rn: vreg(18), + rm: vreg(18), + size: VectorSize::Size8x16, + }, + "5246324E", + "sshl v18.16b, v18.16b, v18.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sshl, + rd: writable_vreg(30), + rn: vreg(1), + rm: vreg(29), + size: VectorSize::Size16x8, + }, + "3E447D4E", + "sshl v30.8h, v1.8h, v29.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sshl, + rd: writable_vreg(8), + rn: vreg(22), + rm: vreg(21), + size: VectorSize::Size32x4, + }, + "C846B54E", + "sshl v8.4s, v22.4s, v21.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sshl, + rd: writable_vreg(8), + rn: vreg(22), + rm: vreg(2), + size: VectorSize::Size64x2, + }, + "C846E24E", + "sshl v8.2d, v22.2d, v2.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umin, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(3), + size: VectorSize::Size8x16, + }, + "816D236E", + "umin v1.16b, v12.16b, v3.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umin, + rd: writable_vreg(30), + rn: vreg(20), + rm: vreg(10), + size: VectorSize::Size16x8, + }, + "9E6E6A6E", + "umin v30.8h, v20.8h, v10.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umin, + rd: writable_vreg(8), + rn: vreg(22), + rm: vreg(21), + size: VectorSize::Size32x4, + }, + "C86EB56E", + "umin v8.4s, v22.4s, v21.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smin, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(3), + size: VectorSize::Size8x16, + }, + "816D234E", + "smin v1.16b, v12.16b, v3.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smin, + rd: writable_vreg(30), + rn: vreg(20), + rm: vreg(10), + size: VectorSize::Size16x8, + }, + "9E6E6A4E", + "smin v30.8h, v20.8h, v10.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smin, + rd: writable_vreg(8), + rn: vreg(22), + rm: vreg(21), + size: VectorSize::Size32x4, + }, + "C86EB54E", + "smin v8.4s, v22.4s, v21.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umax, + rd: writable_vreg(6), + rn: vreg(9), + rm: vreg(8), + size: VectorSize::Size8x8, + }, + "2665282E", + "umax v6.8b, v9.8b, v8.8b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umax, + rd: writable_vreg(11), + rn: vreg(13), + rm: vreg(2), + size: VectorSize::Size16x8, + }, + "AB65626E", + "umax v11.8h, v13.8h, v2.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umax, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "8865AE6E", + "umax v8.4s, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smax, + rd: 
writable_vreg(6), + rn: vreg(9), + rm: vreg(8), + size: VectorSize::Size8x16, + }, + "2665284E", + "smax v6.16b, v9.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smax, + rd: writable_vreg(11), + rn: vreg(13), + rm: vreg(2), + size: VectorSize::Size16x8, + }, + "AB65624E", + "smax v11.8h, v13.8h, v2.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smax, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "8865AE4E", + "smax v8.4s, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(8), + rn: vreg(1), + rm: vreg(3), + size: VectorSize::Size8x16, + }, + "2814236E", + "urhadd v8.16b, v1.16b, v3.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(2), + rn: vreg(13), + rm: vreg(6), + size: VectorSize::Size16x8, + }, + "A215666E", + "urhadd v2.8h, v13.8h, v6.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "8815AE6E", + "urhadd v8.4s, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fadd, + rd: writable_vreg(31), + rn: vreg(0), + rm: vreg(16), + size: VectorSize::Size32x4, + }, + "1FD4304E", + "fadd v31.4s, v0.4s, v16.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fsub, + rd: writable_vreg(8), + rn: vreg(7), + rm: vreg(15), + size: VectorSize::Size64x2, + }, + "E8D4EF4E", + "fsub v8.2d, v7.2d, v15.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fdiv, + rd: writable_vreg(1), + rn: vreg(3), + rm: vreg(4), + size: VectorSize::Size32x4, + }, + "61FC246E", + "fdiv v1.4s, v3.4s, v4.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmax, + rd: writable_vreg(31), + rn: vreg(16), + rm: vreg(0), + size: VectorSize::Size64x2, + }, + "1FF6604E", + "fmax v31.2d, v16.2d, v0.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmin, + rd: writable_vreg(5), + rn: vreg(19), + rm: vreg(26), + size: VectorSize::Size32x4, + }, + "65F6BA4E", + "fmin v5.4s, v19.4s, v26.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Fmul, + rd: writable_vreg(2), + rn: vreg(0), + rm: vreg(5), + size: VectorSize::Size64x2, + }, + "02DC656E", + "fmul v2.2d, v0.2d, v5.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: writable_vreg(16), + rn: vreg(12), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "90BD214E", + "addp v16.16b, v12.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "88BDAE4E", + "addp v8.4s, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Umlal, + rd: writable_vreg(9), + rn: vreg(20), + rm: vreg(17), + size: VectorSize::Size32x2, + }, + "8982B12E", + "umlal v9.2d, v20.2s, v17.2s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Zip1, + rd: writable_vreg(16), + rn: vreg(12), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "9039014E", + "zip1 v16.16b, v12.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Zip1, + rd: writable_vreg(2), + rn: vreg(13), + rm: vreg(6), + size: VectorSize::Size16x8, + }, + "A239464E", + "zip1 v2.8h, v13.8h, v6.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Zip1, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "88398E4E", 
+ "zip1 v8.4s, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Zip1, + rd: writable_vreg(9), + rn: vreg(20), + rm: vreg(17), + size: VectorSize::Size64x2, + }, + "893AD14E", + "zip1 v9.2d, v20.2d, v17.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull, + rd: writable_vreg(16), + rn: vreg(12), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "90C1210E", + "smull v16.8h, v12.8b, v1.8b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull, + rd: writable_vreg(2), + rn: vreg(13), + rm: vreg(6), + size: VectorSize::Size16x8, + }, + "A2C1660E", + "smull v2.4s, v13.4h, v6.4h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "88C1AE0E", + "smull v8.2d, v12.2s, v14.2s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull2, + rd: writable_vreg(16), + rn: vreg(12), + rm: vreg(1), + size: VectorSize::Size8x16, + }, + "90C1214E", + "smull2 v16.8h, v12.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull2, + rd: writable_vreg(2), + rn: vreg(13), + rm: vreg(6), + size: VectorSize::Size16x8, + }, + "A2C1664E", + "smull2 v2.4s, v13.8h, v6.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Smull2, + rd: writable_vreg(8), + rn: vreg(12), + rm: vreg(14), + size: VectorSize::Size32x4, + }, + "88C1AE4E", + "smull2 v8.2d, v12.4s, v14.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Not, + rd: writable_vreg(20), + rn: vreg(17), + size: VectorSize::Size8x8, + }, + "345A202E", + "mvn v20.8b, v17.8b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Not, + rd: writable_vreg(2), + rn: vreg(1), + size: VectorSize::Size32x4, + }, + "2258206E", + "mvn v2.16b, v1.16b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Neg, + rd: writable_vreg(3), + rn: vreg(7), + size: VectorSize::Size8x8, + }, + "E3B8202E", + "neg v3.8b, v7.8b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Neg, + rd: writable_vreg(8), + rn: vreg(12), + size: VectorSize::Size8x16, + }, + "88B9206E", + "neg v8.16b, v12.16b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Neg, + rd: writable_vreg(0), + rn: vreg(31), + size: VectorSize::Size16x8, + }, + "E0BB606E", + "neg v0.8h, v31.8h", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Neg, + rd: writable_vreg(2), + rn: vreg(3), + size: VectorSize::Size32x4, + }, + "62B8A06E", + "neg v2.4s, v3.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Neg, + rd: writable_vreg(10), + rn: vreg(8), + size: VectorSize::Size64x2, + }, + "0AB9E06E", + "neg v10.2d, v8.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Abs, + rd: writable_vreg(3), + rn: vreg(1), + size: VectorSize::Size8x8, + }, + "23B8200E", + "abs v3.8b, v1.8b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Abs, + rd: writable_vreg(1), + rn: vreg(1), + size: VectorSize::Size8x16, + }, + "21B8204E", + "abs v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Abs, + rd: writable_vreg(29), + rn: vreg(28), + size: VectorSize::Size16x8, + }, + "9DBB604E", + "abs v29.8h, v28.8h", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Abs, + rd: writable_vreg(7), + rn: vreg(8), + size: VectorSize::Size32x4, + }, + "07B9A04E", + "abs v7.4s, v8.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Abs, + rd: writable_vreg(1), + rn: vreg(10), + size: VectorSize::Size64x2, + }, + "41B9E04E", + "abs v1.2d, v10.2d", + 
)); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Fabs, + rd: writable_vreg(15), + rn: vreg(16), + size: VectorSize::Size32x4, + }, + "0FFAA04E", + "fabs v15.4s, v16.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Fneg, + rd: writable_vreg(31), + rn: vreg(0), + size: VectorSize::Size32x4, + }, + "1FF8A06E", + "fneg v31.4s, v0.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Fsqrt, + rd: writable_vreg(7), + rn: vreg(18), + size: VectorSize::Size64x2, + }, + "47FAE16E", + "fsqrt v7.2d, v18.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Rev64, + rd: writable_vreg(1), + rn: vreg(10), + size: VectorSize::Size32x4, + }, + "4109A04E", + "rev64 v1.4s, v10.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Shll, + rd: writable_vreg(12), + rn: vreg(5), + size: VectorSize::Size8x8, + }, + "AC38212E", + "shll v12.8h, v5.8b, #8", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Shll, + rd: writable_vreg(9), + rn: vreg(1), + size: VectorSize::Size16x4, + }, + "2938612E", + "shll v9.4s, v1.4h, #16", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Shll, + rd: writable_vreg(1), + rn: vreg(10), + size: VectorSize::Size32x2, + }, + "4139A12E", + "shll v1.2d, v10.2s, #32", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Fcvtzs, + rd: writable_vreg(4), + rn: vreg(22), + size: VectorSize::Size32x4, + }, + "C4BAA14E", + "fcvtzs v4.4s, v22.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Fcvtzu, + rd: writable_vreg(29), + rn: vreg(15), + size: VectorSize::Size64x2, + }, + "FDB9E16E", + "fcvtzu v29.2d, v15.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Scvtf, + rd: writable_vreg(20), + rn: vreg(8), + size: VectorSize::Size32x4, + }, + "14D9214E", + "scvtf v20.4s, v8.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Ucvtf, + rd: writable_vreg(10), + rn: vreg(19), + size: VectorSize::Size64x2, + }, + "6ADA616E", + "ucvtf v10.2d, v19.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintn, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B8A214E", + "frintn v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintn, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C8A614E", + "frintn v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintz, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B9AA14E", + "frintz v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintz, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C9AE14E", + "frintz v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintm, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B9A214E", + "frintm v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintm, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C9A614E", + "frintm v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintp, + rd: writable_vreg(11), + rn: vreg(18), + size: VectorSize::Size32x4, + }, + "4B8AA14E", + "frintp v11.4s, v18.4s", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Frintp, + rd: writable_vreg(12), + rn: vreg(17), + size: VectorSize::Size64x2, + }, + "2C8AE14E", + "frintp v12.2d, v17.2d", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Uminv, + rd: writable_vreg(2), + rn: vreg(1), + size: 
VectorSize::Size8x16, + }, + "22A8316E", + "uminv b2, v1.16b", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Uminv, + rd: writable_vreg(3), + rn: vreg(11), + size: VectorSize::Size16x8, + }, + "63A9716E", + "uminv h3, v11.8h", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Uminv, + rd: writable_vreg(18), + rn: vreg(4), + size: VectorSize::Size32x4, + }, + "92A8B16E", + "uminv s18, v4.4s", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Addv, + rd: writable_vreg(2), + rn: vreg(29), + size: VectorSize::Size8x16, + }, + "A2BB314E", + "addv b2, v29.16b", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Addv, + rd: writable_vreg(3), + rn: vreg(21), + size: VectorSize::Size16x8, + }, + "A3BA714E", + "addv h3, v21.8h", + )); + + insns.push(( + Inst::VecLanes { + op: VecLanesOp::Addv, + rd: writable_vreg(18), + rn: vreg(5), + size: VectorSize::Size32x4, + }, + "B2B8B14E", + "addv s18, v5.4s", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Shl, + rd: writable_vreg(27), + rn: vreg(5), + imm: 7, + size: VectorSize::Size8x16, + }, + "BB540F4F", + "shl v27.16b, v5.16b, #7", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Shl, + rd: writable_vreg(1), + rn: vreg(30), + imm: 0, + size: VectorSize::Size8x16, + }, + "C157084F", + "shl v1.16b, v30.16b, #0", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Sshr, + rd: writable_vreg(26), + rn: vreg(6), + imm: 16, + size: VectorSize::Size16x8, + }, + "DA04104F", + "sshr v26.8h, v6.8h, #16", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Sshr, + rd: writable_vreg(3), + rn: vreg(19), + imm: 1, + size: VectorSize::Size16x8, + }, + "63061F4F", + "sshr v3.8h, v19.8h, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(25), + rn: vreg(6), + imm: 32, + size: VectorSize::Size32x4, + }, + "D904206F", + "ushr v25.4s, v6.4s, #32", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Ushr, + rd: writable_vreg(5), + rn: vreg(21), + imm: 1, + size: VectorSize::Size32x4, + }, + "A5063F6F", + "ushr v5.4s, v21.4s, #1", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Shl, + rd: writable_vreg(22), + rn: vreg(13), + imm: 63, + size: VectorSize::Size64x2, + }, + "B6557F4F", + "shl v22.2d, v13.2d, #63", + )); + + insns.push(( + Inst::VecShiftImm { + op: VecShiftImmOp::Shl, + rd: writable_vreg(23), + rn: vreg(9), + imm: 0, + size: VectorSize::Size64x2, + }, + "3755404F", + "shl v23.2d, v9.2d, #0", + )); + + insns.push(( + Inst::VecExtract { + rd: writable_vreg(1), + rn: vreg(30), + rm: vreg(17), + imm4: 0, + }, + "C103116E", + "ext v1.16b, v30.16b, v17.16b, #0", + )); + + insns.push(( + Inst::VecExtract { + rd: writable_vreg(1), + rn: vreg(30), + rm: vreg(17), + imm4: 8, + }, + "C143116E", + "ext v1.16b, v30.16b, v17.16b, #8", + )); + + insns.push(( + Inst::VecExtract { + rd: writable_vreg(1), + rn: vreg(30), + rm: vreg(17), + imm4: 15, + }, + "C17B116E", + "ext v1.16b, v30.16b, v17.16b, #15", + )); + + insns.push(( + Inst::VecTbl { + rd: writable_vreg(0), + rn: vreg(31), + rm: vreg(16), + is_extension: false, + }, + "E003104E", + "tbl v0.16b, { v31.16b }, v16.16b", + )); + + insns.push(( + Inst::VecTbl { + rd: writable_vreg(4), + rn: vreg(12), + rm: vreg(23), + is_extension: true, + }, + "8411174E", + "tbx v4.16b, { v12.16b }, v23.16b", + )); + + insns.push(( + Inst::VecTbl2 { + rd: writable_vreg(16), + rn: vreg(31), + rn2: vreg(0), + rm: vreg(26), + is_extension: false, + }, + 
"F0231A4E", + "tbl v16.16b, { v31.16b, v0.16b }, v26.16b", + )); + + insns.push(( + Inst::VecTbl2 { + rd: writable_vreg(3), + rn: vreg(11), + rn2: vreg(12), + rm: vreg(19), + is_extension: true, + }, + "6331134E", + "tbx v3.16b, { v11.16b, v12.16b }, v19.16b", + )); + + insns.push(( + Inst::VecLoadReplicate { + rd: writable_vreg(31), + rn: xreg(0), + + size: VectorSize::Size64x2, + }, + "1FCC404D", + "ld1r { v31.2d }, [x0]", + )); + + insns.push(( + Inst::VecLoadReplicate { + rd: writable_vreg(0), + rn: xreg(25), + + size: VectorSize::Size8x8, + }, + "20C3400D", + "ld1r { v0.8b }, [x25]", + )); + + insns.push(( + Inst::VecCSel { + rd: writable_vreg(5), + rn: vreg(10), + rm: vreg(19), + cond: Cond::Gt, + }, + "6C000054651EB34E02000014451DAA4E", + "vcsel v5.16b, v10.16b, v19.16b, gt (if-then-else diamond)", + )); + + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 32, + }, + "411C0053", + "uxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 32, + }, + "411C0013", + "sxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 32, + }, + "413C0053", + "uxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 32, + }, + "413C0013", + "sxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 64, + }, + "411C0053", + "uxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 64, + }, + "411C4093", + "sxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 64, + }, + "413C0053", + "uxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 64, + }, + "413C4093", + "sxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 32, + to_bits: 64, + }, + "E103022A", + "mov w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 32, + to_bits: 64, + }, + "417C4093", + "sxtw x1, w2", + )); + + insns.push(( + Inst::Jump { + dest: BranchTarget::ResolvedOffset(64), + }, + "10000014", + "b 64", + )); + + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::NotZero(xreg(8)), + }, + "480000B40000A0D4", + "cbz x8, 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Zero(xreg(8)), + }, + "480000B50000A0D4", + "cbnz x8, 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Ne), + }, + "400000540000A0D4", + "b.eq 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Eq), + }, + "410000540000A0D4", + "b.ne 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Lo), + }, + "420000540000A0D4", + "b.hs 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Hs), + }, + "430000540000A0D4", + "b.lo 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: 
CondBrKind::Cond(Cond::Pl), + }, + "440000540000A0D4", + "b.mi 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Mi), + }, + "450000540000A0D4", + "b.pl 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Vc), + }, + "460000540000A0D4", + "b.vs 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Vs), + }, + "470000540000A0D4", + "b.vc 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Ls), + }, + "480000540000A0D4", + "b.hi 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Hi), + }, + "490000540000A0D4", + "b.ls 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Lt), + }, + "4A0000540000A0D4", + "b.ge 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Ge), + }, + "4B0000540000A0D4", + "b.lt 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Le), + }, + "4C0000540000A0D4", + "b.gt 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Gt), + }, + "4D0000540000A0D4", + "b.le 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Nv), + }, + "4E0000540000A0D4", + "b.al 8 ; udf", + )); + insns.push(( + Inst::TrapIf { + trap_code: TrapCode::Interrupt, + kind: CondBrKind::Cond(Cond::Al), + }, + "4F0000540000A0D4", + "b.nv 8 ; udf", + )); + + insns.push(( + Inst::CondBr { + taken: BranchTarget::ResolvedOffset(64), + not_taken: BranchTarget::ResolvedOffset(128), + kind: CondBrKind::Cond(Cond::Le), + }, + "0D02005420000014", + "b.le 64 ; b 128", + )); + + insns.push(( + Inst::Call { + info: Box::new(CallInfo { + dest: ExternalName::testcase("test0"), + uses: Vec::new(), + defs: Vec::new(), + opcode: Opcode::Call, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, + }), + }, + "00000094", + "bl 0", + )); + + insns.push(( + Inst::CallInd { + info: Box::new(CallIndInfo { + rn: xreg(10), + uses: Vec::new(), + defs: Vec::new(), + opcode: Opcode::CallIndirect, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, + }), + }, + "40013FD6", + "blr x10", + )); + + insns.push(( + Inst::IndirectBr { + rn: xreg(3), + targets: vec![], + }, + "60001FD6", + "br x3", + )); + + insns.push((Inst::Brk, "000020D4", "brk #0")); + + insns.push(( + Inst::Adr { + rd: writable_xreg(15), + off: (1 << 20) - 4, + }, + "EFFF7F10", + "adr x15, pc+1048572", + )); + + insns.push(( + Inst::FpuMove64 { + rd: writable_vreg(8), + rn: vreg(4), + }, + "881CA40E", + "mov v8.8b, v4.8b", + )); + + insns.push(( + Inst::FpuMove128 { + rd: writable_vreg(17), + rn: vreg(26), + }, + "511FBA4E", + "mov v17.16b, v26.16b", + )); + + insns.push(( + Inst::FpuMoveFromVec { + rd: writable_vreg(1), + rn: vreg(30), + idx: 2, + size: VectorSize::Size32x4, + }, + "C107145E", + "mov s1, v30.s[2]", + )); + + insns.push(( + Inst::FpuMoveFromVec { + rd: writable_vreg(23), + rn: vreg(11), + idx: 0, + size: VectorSize::Size64x2, + }, + "7705085E", + "mov d23, v11.d[0]", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3201E", + "fabs s15, s30", + 
)); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3601E", + "fabs d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43211E", + "fneg s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43611E", + "fneg d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3211E", + "fsqrt s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3611E", + "fsqrt d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt32To64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3221E", + "fcvt d15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt64To32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43621E", + "fcvt s15, d30", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B3F1E", + "fadd s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B7F1E", + "fadd d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B3F1E", + "fsub s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B7F1E", + "fsub d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B3F1E", + "fmul s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B7F1E", + "fmul d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B3F1E", + "fdiv s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B7F1E", + "fdiv d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B3F1E", + "fmax s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B7F1E", + "fmax d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B3F1E", + "fmin s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B7F1E", + "fmin d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Uqadd64, + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + }, + "D50EF77E", + "uqadd d21, d22, d23", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sqadd64, + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + }, + "D50EF75E", + "sqadd d21, d22, d23", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Uqsub64, + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + }, + "D52EF77E", + "uqsub d21, d22, d23", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: 
FPUOp2::Sqsub64, + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + }, + "D52EF75E", + "sqsub d21, d22, d23", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF071F1F", + "fmadd s15, s30, s31, s1", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF075F1F", + "fmadd d15, d30, d31, d1", + )); + + insns.push(( + Inst::FpuRRI { + fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()), + rd: writable_vreg(2), + rn: vreg(5), + }, + "A204202F", + "ushr v2.2s, v5.2s, #32", + )); + + insns.push(( + Inst::FpuRRI { + fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()), + rd: writable_vreg(2), + rn: vreg(5), + }, + "A204417F", + "ushr d2, d5, #63", + )); + + insns.push(( + Inst::FpuRRI { + fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()), + rd: writable_vreg(4), + rn: vreg(10), + }, + "44553F2F", + "sli v4.2s, v10.2s, #31", + )); + + insns.push(( + Inst::FpuRRI { + fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()), + rd: writable_vreg(4), + rn: vreg(10), + }, + "44557F7F", + "sli d4, d10, #63", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100391E", + "fcvtzu w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100399E", + "fcvtzu x1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToI32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100381E", + "fcvtzs w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100389E", + "fcvtzs x1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100791E", + "fcvtzu w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100799E", + "fcvtzu x1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100781E", + "fcvtzs w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100789E", + "fcvtzs x1, d4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100231E", + "ucvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100221E", + "scvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100631E", + "ucvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100621E", + "scvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100239E", + "ucvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100229E", + "scvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100639E", + "ucvtf d1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: 
IntToFpuOp::I64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100629E", + "scvtf d1, x4", + )); + + insns.push(( + Inst::FpuCmp32 { + rn: vreg(23), + rm: vreg(24), + }, + "E022381E", + "fcmp s23, s24", + )); + + insns.push(( + Inst::FpuCmp64 { + rn: vreg(23), + rm: vreg(24), + }, + "E022781E", + "fcmp d23, d24", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), F32), + flags: MemFlags::trusted(), + }, + "107969BC", + "ldr s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), F64), + flags: MemFlags::trusted(), + }, + "107969FC", + "ldr d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), I128), + flags: MemFlags::trusted(), + }, + "1079E93C", + "ldr q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }, + "5000001C", + "ldr s16, pc+8", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }, + "5000005C", + "ldr d16, pc+8", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: AMode::Label(MemLabel::PCRel(8)), + flags: MemFlags::trusted(), + }, + "5000009C", + "ldr q16, pc+8", + )); + + insns.push(( + Inst::FpuStore32 { + rd: vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), F32), + flags: MemFlags::trusted(), + }, + "107929BC", + "str s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuStore64 { + rd: vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), F64), + flags: MemFlags::trusted(), + }, + "107929FC", + "str d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuStore128 { + rd: vreg(16), + mem: AMode::RegScaled(xreg(8), xreg(9), I128), + flags: MemFlags::trusted(), + }, + "1079A93C", + "str q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::LoadFpuConst64 { + rd: writable_vreg(16), + const_data: 1.0_f64.to_bits(), + }, + "5000005C03000014000000000000F03F", + "ldr d16, pc+8 ; b 12 ; data.f64 1", + )); + + insns.push(( + Inst::LoadFpuConst128 { + rd: writable_vreg(5), + const_data: 0x0f0e0d0c0b0a09080706050403020100, + }, + "4500009C05000014000102030405060708090A0B0C0D0E0F", + "ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100", + )); + + insns.push(( + Inst::FpuCSel32 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Hi, + }, + "418C231E", + "fcsel s1, s2, s3, hi", + )); + + insns.push(( + Inst::FpuCSel64 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Eq, + }, + "410C631E", + "fcsel d1, d2, d3, eq", + )); + + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus32, + }, + "1743251E", + "frintm s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus64, + }, + "1743651E", + "frintm d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus32, + }, + "17C3241E", + "frintp s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus64, + }, + "17C3641E", + "frintp d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero32, + }, + "17C3251E", + "frintz s23, s24", + )); + insns.push(( + 
Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero64, + }, + "17C3651E", + "frintz d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest32, + }, + "1743241E", + "frintn s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest64, + }, + "1743641E", + "frintn d23, d24", + )); + + insns.push(( + Inst::AtomicRMW { + ty: I16, + op: inst_common::AtomicRmwOp::Xor, + }, + "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", + "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", + )); + + insns.push(( + Inst::AtomicRMW { + ty: I32, + op: inst_common::AtomicRmwOp::Xchg, + }, + "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", + "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", + )); + + insns.push(( + Inst::AtomicCAS { + ty: I8, + }, + "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", + "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicCAS { + ty: I64, + }, + "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", + "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicLoad { + ty: I8, + r_data: writable_xreg(7), + r_addr: xreg(28), + }, + "BF3B03D587034039", + "atomically { x7 = zero_extend_8_bits_at[x28] }", + )); + + insns.push(( + Inst::AtomicLoad { + ty: I64, + r_data: writable_xreg(28), + r_addr: xreg(7), + }, + "BF3B03D5FC0040F9", + "atomically { x28 = zero_extend_64_bits_at[x7] }", + )); + + insns.push(( + Inst::AtomicStore { + ty: I16, + r_data: xreg(17), + r_addr: xreg(8), + }, + "11010079BF3B03D5", + "atomically { 16_bits_at[x8] = x17 }", + )); + + insns.push(( + Inst::AtomicStore { + ty: I32, + r_data: xreg(18), + r_addr: xreg(7), + }, + "F20000B9BF3B03D5", + "atomically { 32_bits_at[x7] = x18 }", + )); + + insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish")); + + let flags = settings::Flags::new(settings::builder()); + let rru = create_reg_universe(&flags); + let emit_info = EmitInfo::new(flags); + for (insn, expected_encoding, expected_printing) in insns { + println!( + "AArch64: {:?}, {}, {}", + insn, expected_encoding, expected_printing + ); + + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + + let mut sink = test_utils::TestCodeSink::new(); + let mut buffer = MachBuffer::new(); + insn.emit(&mut buffer, &emit_info, &mut Default::default()); + let buffer = buffer.finish(); + buffer.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding); + } +} + +#[test] +fn test_cond_invert() { + for cond in vec![ + Cond::Eq, + Cond::Ne, + Cond::Hs, + Cond::Lo, + Cond::Mi, + Cond::Pl, + Cond::Vs, + Cond::Vc, + Cond::Hi, + Cond::Ls, + Cond::Ge, + Cond::Lt, + Cond::Gt, + Cond::Le, + Cond::Al, + Cond::Nv, + ] + .into_iter() + { + assert_eq!(cond.invert().invert(), cond); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs new file mode 100644 index 0000000000..b6da0402bc --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/imms.rs @@ -0,0 +1,1025 @@ +//! AArch64 ISA definitions: immediate constants. 
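+//!
+//! Informal summary of the definitions that follow: these types wrap the
+//! various AArch64 immediate forms (logical immediates in `ImmLogic`, shifted
+//! 12-bit arithmetic immediates in `Imm12`, scaled load/store offsets in
+//! `SImm7Scaled` and `UImm12Scaled`, move-wide constants in `MoveWideConst`,
+//! and so on), and most of them expose a `maybe_from_*` constructor that
+//! returns `None` when a value cannot be encoded in that form.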
+ +// Some variants are never constructed, but we still want them as options in the future. +#[allow(dead_code)] +use crate::ir::types::*; +use crate::ir::Type; +use crate::isa::aarch64::inst::{OperandSize, ScalarSize}; + +use regalloc::{PrettyPrint, RealRegUniverse}; + +use core::convert::TryFrom; +use std::string::String; + +/// An immediate that represents the NZCV flags. +#[derive(Clone, Copy, Debug)] +pub struct NZCV { + /// The negative condition flag. + n: bool, + /// The zero condition flag. + z: bool, + /// The carry condition flag. + c: bool, + /// The overflow condition flag. + v: bool, +} + +impl NZCV { + pub fn new(n: bool, z: bool, c: bool, v: bool) -> NZCV { + NZCV { n, z, c, v } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + (u32::from(self.n) << 3) + | (u32::from(self.z) << 2) + | (u32::from(self.c) << 1) + | u32::from(self.v) + } +} + +/// An unsigned 5-bit immediate. +#[derive(Clone, Copy, Debug)] +pub struct UImm5 { + /// The value. + value: u8, +} + +impl UImm5 { + pub fn maybe_from_u8(value: u8) -> Option<UImm5> { + if value < 32 { + Some(UImm5 { value }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + u32::from(self.value) + } +} + +/// A signed, scaled 7-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm7Scaled { + /// The value. + pub value: i16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl SImm7Scaled { + /// Create a SImm7Scaled from a raw offset and the known scale type, if + /// possible. + pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<SImm7Scaled> { + assert!(scale_ty == I64 || scale_ty == I32); + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = i64::from(scale); + let upper_limit = 63 * scale; + let lower_limit = -(64 * scale); + if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 { + Some(SImm7Scaled { + value: i16::try_from(value).unwrap(), + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> SImm7Scaled { + SImm7Scaled { value: 0, scale_ty } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + let ty_bytes: i16 = self.scale_ty.bytes() as i16; + let scaled: i16 = self.value / ty_bytes; + assert!(scaled <= 63 && scaled >= -64); + let scaled: i8 = scaled as i8; + let encoded: u32 = scaled as u32; + encoded & 0x7f + } +} + +#[derive(Clone, Copy, Debug)] +pub struct FPULeftShiftImm { + pub amount: u8, + pub lane_size_in_bits: u8, +} + +impl FPULeftShiftImm { + pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> { + debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64); + if amount < lane_size_in_bits { + Some(Self { + amount, + lane_size_in_bits, + }) + } else { + None + } + } + + pub fn enc(&self) -> u32 { + debug_assert!(self.lane_size_in_bits.is_power_of_two()); + debug_assert!(self.lane_size_in_bits > self.amount); + // The encoding of the immediate follows the table below, + // where xs encode the shift amount. + // + // | lane_size_in_bits | encoding | + // +------------------------------+ + // | 8 | 0001xxx | + // | 16 | 001xxxx | + // | 32 | 01xxxxx | + // | 64 | 1xxxxxx | + // + // The highest one bit is represented by `lane_size_in_bits`. Since + // `lane_size_in_bits` is a power of 2 and `amount` is less + // than `lane_size_in_bits`, they can be ORed + // together to produced the encoded value. 
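+        // Worked example (informal): for a 32-bit lane and a shift amount of
+        // 5, `lane_size_in_bits` is 0b0100000 and `amount` is 0b0000101, so
+        // the OR below yields 0b0100101, matching the `01xxxxx` row above.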
+ u32::from(self.lane_size_in_bits | self.amount) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct FPURightShiftImm { + pub amount: u8, + pub lane_size_in_bits: u8, +} + +impl FPURightShiftImm { + pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> { + debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64); + if amount > 0 && amount <= lane_size_in_bits { + Some(Self { + amount, + lane_size_in_bits, + }) + } else { + None + } + } + + pub fn enc(&self) -> u32 { + debug_assert_ne!(0, self.amount); + // The encoding of the immediate follows the table below, + // where xs encodes the negated shift amount. + // + // | lane_size_in_bits | encoding | + // +------------------------------+ + // | 8 | 0001xxx | + // | 16 | 001xxxx | + // | 32 | 01xxxxx | + // | 64 | 1xxxxxx | + // + // The shift amount is negated such that a shift ammount + // of 1 (in 64-bit) is encoded as 0b111111 and a shift + // amount of 64 is encoded as 0b000000, + // in the bottom 6 bits. + u32::from((self.lane_size_in_bits * 2) - self.amount) + } +} + +/// a 9-bit signed offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm9 { + /// The value. + pub value: i16, +} + +impl SImm9 { + /// Create a signed 9-bit offset from a full-range value, if possible. + pub fn maybe_from_i64(value: i64) -> Option<SImm9> { + if value >= -256 && value <= 255 { + Some(SImm9 { + value: value as i16, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero() -> SImm9 { + SImm9 { value: 0 } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + (self.value as u32) & 0x1ff + } + + /// Signed value of immediate. + pub fn value(&self) -> i32 { + self.value as i32 + } +} + +/// An unsigned, scaled 12-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct UImm12Scaled { + /// The value. + pub value: u16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl UImm12Scaled { + /// Create a UImm12Scaled from a raw offset and the known scale type, if + /// possible. + pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<UImm12Scaled> { + // Ensure the type is at least one byte. + let scale_ty = if scale_ty == B1 { B8 } else { scale_ty }; + + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = scale as i64; + let limit = 4095 * scale; + if value >= 0 && value <= limit && (value & (scale - 1)) == 0 { + Some(UImm12Scaled { + value: value as u16, + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> UImm12Scaled { + UImm12Scaled { value: 0, scale_ty } + } + + /// Encoded bits. + pub fn bits(&self) -> u32 { + (self.value as u32 / self.scale_ty.bytes()) & 0xfff + } + + /// Value after scaling. + pub fn value(&self) -> u32 { + self.value as u32 + } + + /// The value type which is the scaling base. + pub fn scale_ty(&self) -> Type { + self.scale_ty + } +} + +/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted +/// left by 0 or 12 places. +#[derive(Clone, Debug)] +pub struct Imm12 { + /// The immediate bits. + pub bits: u16, + /// Whether the immediate bits are shifted left by 12 or not. + pub shift12: bool, +} + +impl Imm12 { + /// Compute a Imm12 from raw bits, if possible. 
+ pub fn maybe_from_u64(val: u64) -> Option<Imm12> { + if val == 0 { + Some(Imm12 { + bits: 0, + shift12: false, + }) + } else if val < 0xfff { + Some(Imm12 { + bits: val as u16, + shift12: false, + }) + } else if val < 0xfff_000 && (val & 0xfff == 0) { + Some(Imm12 { + bits: (val >> 12) as u16, + shift12: true, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero() -> Self { + Imm12 { + bits: 0, + shift12: false, + } + } + + /// Bits for 2-bit "shift" field in e.g. AddI. + pub fn shift_bits(&self) -> u32 { + if self.shift12 { + 0b01 + } else { + 0b00 + } + } + + /// Bits for 12-bit "imm" field in e.g. AddI. + pub fn imm_bits(&self) -> u32 { + self.bits as u32 + } +} + +/// An immediate for logical instructions. +#[derive(Clone, Debug, PartialEq)] +pub struct ImmLogic { + /// The actual value. + value: u64, + /// `N` flag. + pub n: bool, + /// `S` field: element size and element bits. + pub r: u8, + /// `R` field: rotate amount. + pub s: u8, + /// Was this constructed for a 32-bit or 64-bit instruction? + pub size: OperandSize, +} + +impl ImmLogic { + /// Compute an ImmLogic from raw bits, if possible. + pub fn maybe_from_u64(value: u64, ty: Type) -> Option<ImmLogic> { + // Note: This function is a port of VIXL's Assembler::IsImmLogical. + + if ty != I64 && ty != I32 { + return None; + } + let operand_size = OperandSize::from_ty(ty); + + let original_value = value; + + let value = if ty == I32 { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // Avoid making the assumption that the most-significant 32 bits are zero by + // shifting the value left and duplicating it. + let value = value << 32; + value | value >> 32 + } else { + value + }; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) 
+ let (value, inverted) = if value & 1 == 1 { + (!value, true) + } else { + (value, false) + }; + + if value == 0 { + return None; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + fn lowest_set_bit(value: u64) -> u64 { + let bit = value.trailing_zeros(); + 1u64.checked_shl(bit).unwrap_or(0) + } + let a = lowest_set_bit(value); + assert_ne!(0, a); + let value_plus_a = value.wrapping_add(a); + let b = lowest_set_bit(value_plus_a); + let value_plus_a_minus_b = value_plus_a - b; + let c = lowest_set_bit(value_plus_a_minus_b); + + let (d, clz_a, out_n, mask) = if c != 0 { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + let clz_a = a.leading_zeros(); + let clz_c = c.leading_zeros(); + let d = clz_a - clz_c; + let mask = (1 << d) - 1; + (d, clz_a, 0, mask) + } else { + (64, a.leading_zeros(), 1, u64::max_value()) + }; + + // If the repeat period d is not a power of two, it can't be encoded. + if !d.is_power_of_two() { + return None; + } + + if ((b.wrapping_sub(a)) & !mask) != 0 { + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + return None; + } + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + const MULTIPLIERS: [u64; 6] = [ + 0x0000000000000001, + 0x0000000100000001, + 0x0001000100010001, + 0x0101010101010101, + 0x1111111111111111, + 0x5555555555555555, + ]; + let multiplier = MULTIPLIERS[(u64::from(d).leading_zeros() - 57) as usize]; + let candidate = b.wrapping_sub(a) * multiplier; + + if value != candidate { + // The candidate pattern doesn't match our input value, so fail. + return None; + } + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). 
+ let clz_b = if b == 0 { + u32::max_value() // -1 + } else { + b.leading_zeros() + }; + let s = clz_a.wrapping_sub(clz_b); + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + let (s, r) = if inverted { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + // Need wrapping for when clz_b is max_value() (for when b == 0). + (d - s, clz_b.wrapping_add(1) & (d - 1)) + } else { + (s, (clz_a + 1) & (d - 1)) + }; + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (2 * -d) with our computed s to form imms. + let s = ((d * 2).wrapping_neg() | (s - 1)) & 0x3f; + debug_assert!(u8::try_from(r).is_ok()); + debug_assert!(u8::try_from(s).is_ok()); + Some(ImmLogic { + value: original_value, + n: out_n != 0, + r: r as u8, + s: s as u8, + size: operand_size, + }) + } + + /// Returns bits ready for encoding: (N:1, R:6, S:6) + pub fn enc_bits(&self) -> u32 { + ((self.n as u32) << 12) | ((self.r as u32) << 6) | (self.s as u32) + } + + /// Returns the value that this immediate represents. + pub fn value(&self) -> u64 { + self.value + } + + /// Return an immediate for the bitwise-inverted value. + pub fn invert(&self) -> ImmLogic { + // For every ImmLogical immediate, the inverse can also be encoded. + Self::maybe_from_u64(!self.value, self.size.to_ty()).unwrap() + } + + /// This provides a safe(ish) way to avoid the costs of `maybe_from_u64` when we want to + /// encode a constant that we know at compiler-build time. It constructs an `ImmLogic` from + /// the fields `n`, `r`, `s` and `size`, but in a debug build, checks that `value_to_check` + /// corresponds to those four fields. The intention is that, in a non-debug build, this + /// reduces to something small enough that it will be a candidate for inlining. + pub fn from_n_r_s(value_to_check: u64, n: bool, r: u8, s: u8, size: OperandSize) -> Self { + // Construct it from the components we got given. + let imml = Self { + value: value_to_check, + n, + r, + s, + size, + }; + + // In debug mode, check that `n`/`r`/`s` are correct, given `value` and `size`. + debug_assert!(match ImmLogic::maybe_from_u64( + value_to_check, + if size == OperandSize::Size64 { + I64 + } else { + I32 + } + ) { + None => false, // fail: `value` is unrepresentable + Some(imml_check) => imml_check == imml, + }); + + imml + } +} + +/// An immediate for shift instructions. +#[derive(Clone, Debug)] +pub struct ImmShift { + /// 6-bit shift amount. + pub imm: u8, +} + +impl ImmShift { + /// Create an ImmShift from raw bits, if possible. + pub fn maybe_from_u64(val: u64) -> Option<ImmShift> { + if val < 64 { + Some(ImmShift { imm: val as u8 }) + } else { + None + } + } + + /// Get the immediate value. + pub fn value(&self) -> u8 { + self.imm + } +} + +/// A 16-bit immediate for a MOVZ instruction, with a {0,16,32,48}-bit shift. +#[derive(Clone, Copy, Debug)] +pub struct MoveWideConst { + /// The value. 
+ pub bits: u16, + /// Result is `bits` shifted 16*shift bits to the left. + pub shift: u8, +} + +impl MoveWideConst { + /// Construct a MoveWideConst from an arbitrary 64-bit constant if possible. + pub fn maybe_from_u64(value: u64) -> Option<MoveWideConst> { + let mask0 = 0x0000_0000_0000_ffffu64; + let mask1 = 0x0000_0000_ffff_0000u64; + let mask2 = 0x0000_ffff_0000_0000u64; + let mask3 = 0xffff_0000_0000_0000u64; + + if value == (value & mask0) { + return Some(MoveWideConst { + bits: (value & mask0) as u16, + shift: 0, + }); + } + if value == (value & mask1) { + return Some(MoveWideConst { + bits: ((value >> 16) & mask0) as u16, + shift: 1, + }); + } + if value == (value & mask2) { + return Some(MoveWideConst { + bits: ((value >> 32) & mask0) as u16, + shift: 2, + }); + } + if value == (value & mask3) { + return Some(MoveWideConst { + bits: ((value >> 48) & mask0) as u16, + shift: 3, + }); + } + None + } + + pub fn maybe_with_shift(imm: u16, shift: u8) -> Option<MoveWideConst> { + let shift_enc = shift / 16; + if shift_enc > 3 { + None + } else { + Some(MoveWideConst { + bits: imm, + shift: shift_enc, + }) + } + } + + /// Returns the value that this constant represents. + pub fn value(&self) -> u64 { + (self.bits as u64) << (16 * self.shift) + } +} + +/// Advanced SIMD modified immediate as used by MOVI/MVNI. +#[derive(Clone, Copy, Debug)] +pub struct ASIMDMovModImm { + imm: u8, + shift: u8, + shift_ones: bool, +} + +impl ASIMDMovModImm { + pub fn maybe_from_u64(value: u64, size: ScalarSize) -> Option<ASIMDMovModImm> { + match size { + ScalarSize::Size8 => Some(ASIMDMovModImm { + imm: value as u8, + shift: 0, + shift_ones: false, + }), + _ => None, + } + } + + /// Create a zero immediate of this format. + pub fn zero() -> Self { + ASIMDMovModImm { + imm: 0, + shift: 0, + shift_ones: false, + } + } + + pub fn value(&self) -> (u8, u32, bool) { + (self.imm, self.shift as u32, self.shift_ones) + } +} + +impl PrettyPrint for NZCV { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let fmt = |c: char, v| if v { c.to_ascii_uppercase() } else { c }; + format!( + "#{}{}{}{}", + fmt('n', self.n), + fmt('z', self.z), + fmt('c', self.c), + fmt('v', self.v) + ) + } +} + +impl PrettyPrint for UImm5 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for Imm12 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let shift = if self.shift12 { 12 } else { 0 }; + let value = u32::from(self.bits) << shift; + format!("#{}", value) + } +} + +impl PrettyPrint for SImm7Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for FPULeftShiftImm { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.amount) + } +} + +impl PrettyPrint for FPURightShiftImm { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.amount) + } +} + +impl PrettyPrint for SImm9 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for UImm12Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for ImmLogic { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value()) + } +} + +impl PrettyPrint for ImmShift { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + 
format!("#{}", self.imm) + } +} + +impl PrettyPrint for MoveWideConst { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + if self.shift == 0 { + format!("#{}", self.bits) + } else { + format!("#{}, LSL #{}", self.bits, self.shift * 16) + } + } +} + +impl PrettyPrint for ASIMDMovModImm { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + if self.shift == 0 { + format!("#{}", self.imm) + } else { + let shift_type = if self.shift_ones { "MSL" } else { "LSL" }; + format!("#{}, {} #{}", self.imm, shift_type, self.shift) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn imm_logical_test() { + assert_eq!(None, ImmLogic::maybe_from_u64(0, I64)); + assert_eq!(None, ImmLogic::maybe_from_u64(u64::max_value(), I64)); + + assert_eq!( + Some(ImmLogic { + value: 1, + n: true, + r: 0, + s: 0, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 2, + n: true, + r: 63, + s: 0, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(2, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(5, I64)); + + assert_eq!(None, ImmLogic::maybe_from_u64(11, I64)); + + assert_eq!( + Some(ImmLogic { + value: 248, + n: true, + r: 61, + s: 4, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(248, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(249, I64)); + + assert_eq!( + Some(ImmLogic { + value: 1920, + n: true, + r: 57, + s: 3, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(1920, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x7ffe, + n: true, + r: 63, + s: 13, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x7ffe, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x30000, + n: true, + r: 48, + s: 1, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x30000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000, + n: true, + r: 44, + s: 0, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x100000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: u64::max_value() - 1, + n: true, + r: 63, + s: 62, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(u64::max_value() - 1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xaaaaaaaaaaaaaaaa, + n: false, + r: 1, + s: 60, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x8181818181818181, + n: false, + r: 1, + s: 49, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x8181818181818181, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xffc3ffc3ffc3ffc3, + n: false, + r: 10, + s: 43, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000001, + n: false, + r: 0, + s: 0, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x100000001, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x1111111111111111, + n: false, + r: 0, + s: 56, + size: OperandSize::Size64, + }), + ImmLogic::maybe_from_u64(0x1111111111111111, I64) + ); + + for n in 0..2 { + let types = if n == 0 { vec![I64, I32] } else { vec![I64] }; + for s in 0..64 { + for r in 0..64 { + let imm = get_logical_imm(n, s, r); + for &ty in &types { + match ImmLogic::maybe_from_u64(imm, ty) { + Some(ImmLogic { value, .. 
}) => { + assert_eq!(imm, value); + ImmLogic::maybe_from_u64(!value, ty).unwrap(); + } + None => assert_eq!(0, imm), + }; + } + } + } + } + } + + // Repeat a value that has `width` bits, across a 64-bit value. + fn repeat(value: u64, width: u64) -> u64 { + let mut result = value & ((1 << width) - 1); + let mut i = width; + while i < 64 { + result |= result << i; + i *= 2; + } + result + } + + // Get the logical immediate, from the encoding N/R/S bits. + fn get_logical_imm(n: u32, s: u32, r: u32) -> u64 { + // An integer is constructed from the n, imm_s and imm_r bits according to + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 + // bits are set. The pattern is rotated right by R, and repeated across a + // 64-bit value. + + if n == 1 { + if s == 0x3f { + return 0; + } + let bits = (1u64 << (s + 1)) - 1; + bits.rotate_right(r) + } else { + if (s >> 1) == 0x1f { + return 0; + } + let mut width = 0x20; + while width >= 0x2 { + if (s & width) == 0 { + let mask = width - 1; + if (s & mask) == mask { + return 0; + } + let bits = (1u64 << ((s & mask) + 1)) - 1; + return repeat(bits.rotate_right(r & mask), width.into()); + } + width >>= 1; + } + unreachable!(); + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs new file mode 100644 index 0000000000..278302018e --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/mod.rs @@ -0,0 +1,4057 @@ +//! This module defines aarch64-specific machine instruction types. + +// Some variants are not constructed, but we still want them as options in the future. +#![allow(dead_code)] + +use crate::binemit::CodeOffset; +use crate::ir::types::{ + B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8, + I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64, +}; +use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, TrapCode, Type}; +use crate::isa::CallConv; +use crate::machinst::*; +use crate::{settings, CodegenError, CodegenResult}; + +use regalloc::{PrettyPrint, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; +use regalloc::{RegUsageCollector, RegUsageMapper}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::convert::TryFrom; +use smallvec::{smallvec, SmallVec}; +use std::string::{String, ToString}; + +pub mod regs; +pub use self::regs::*; +pub mod imms; +pub use self::imms::*; +pub mod args; +pub use self::args::*; +pub mod emit; +pub use self::emit::*; +pub mod unwind; + +#[cfg(test)] +mod emit_tests; + +//============================================================================= +// Instructions (top level): definition + +/// An ALU operation. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. 
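+///
+/// For example (informally), `ALUOp::Add64` covers both the register form
+/// `add x0, x1, x2` and the immediate form `add x0, x1, #imm`; which encoding
+/// is produced depends on the `Inst` format it is paired with.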
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp { + Add32, + Add64, + Sub32, + Sub64, + Orr32, + Orr64, + OrrNot32, + OrrNot64, + And32, + And64, + AndNot32, + AndNot64, + /// XOR (AArch64 calls this "EOR") + Eor32, + /// XOR (AArch64 calls this "EOR") + Eor64, + /// XNOR (AArch64 calls this "EOR-NOT") + EorNot32, + /// XNOR (AArch64 calls this "EOR-NOT") + EorNot64, + /// Add, setting flags + AddS32, + /// Add, setting flags + AddS64, + /// Sub, setting flags + SubS32, + /// Sub, setting flags + SubS64, + /// Signed multiply, high-word result + SMulH, + /// Unsigned multiply, high-word result + UMulH, + SDiv64, + UDiv64, + RotR32, + RotR64, + Lsr32, + Lsr64, + Asr32, + Asr64, + Lsl32, + Lsl64, +} + +/// An ALU operation with three arguments. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp3 { + /// Multiply-add + MAdd32, + /// Multiply-add + MAdd64, + /// Multiply-sub + MSub32, + /// Multiply-sub + MSub64, +} + +/// A floating-point unit (FPU) operation with one arg. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp1 { + Abs32, + Abs64, + Neg32, + Neg64, + Sqrt32, + Sqrt64, + Cvt32To64, + Cvt64To32, +} + +/// A floating-point unit (FPU) operation with two args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp2 { + Add32, + Add64, + Sub32, + Sub64, + Mul32, + Mul64, + Div32, + Div64, + Max32, + Max64, + Min32, + Min64, + /// Signed saturating add + Sqadd64, + /// Unsigned saturating add + Uqadd64, + /// Signed saturating subtract + Sqsub64, + /// Unsigned saturating subtract + Uqsub64, +} + +/// A floating-point unit (FPU) operation with two args, a register and an immediate. +#[derive(Copy, Clone, Debug)] +pub enum FPUOpRI { + /// Unsigned right shift. Rd = Rn << #imm + UShr32(FPURightShiftImm), + /// Unsigned right shift. Rd = Rn << #imm + UShr64(FPURightShiftImm), + /// Shift left and insert. Rd |= Rn << #imm + Sli32(FPULeftShiftImm), + /// Shift left and insert. Rd |= Rn << #imm + Sli64(FPULeftShiftImm), +} + +/// A floating-point unit (FPU) operation with three args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp3 { + MAdd32, + MAdd64, +} + +/// A conversion from an FP to an integer value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuToIntOp { + F32ToU32, + F32ToI32, + F32ToU64, + F32ToI64, + F64ToU32, + F64ToI32, + F64ToU64, + F64ToI64, +} + +/// A conversion from an integer to an FP value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum IntToFpuOp { + U32ToF32, + I32ToF32, + U32ToF64, + I32ToF64, + U64ToF32, + I64ToF32, + U64ToF64, + I64ToF64, +} + +/// Modes for FP rounding ops: round down (floor) or up (ceil), or toward zero (trunc), or to +/// nearest, and for 32- or 64-bit FP values. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuRoundMode { + Minus32, + Minus64, + Plus32, + Plus64, + Zero32, + Zero64, + Nearest32, + Nearest64, +} + +/// Type of vector element extensions. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecExtendOp { + /// Signed extension of 8-bit elements + Sxtl8, + /// Signed extension of 16-bit elements + Sxtl16, + /// Signed extension of 32-bit elements + Sxtl32, + /// Unsigned extension of 8-bit elements + Uxtl8, + /// Unsigned extension of 16-bit elements + Uxtl16, + /// Unsigned extension of 32-bit elements + Uxtl32, +} + +/// A vector ALU operation. 
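///
/// These pair with the `VecRRR` format below together with a `VectorSize` giving the lane
/// arrangement. A sketch, assuming the `vreg`/`writable_vreg` helpers from `regs`:
///
///     // Lane-wise integer add over four 32-bit lanes: "add v0.4s, v1.4s, v2.4s".
///     let add_4s = Inst::VecRRR {
///         alu_op: VecALUOp::Add,
///         rd: writable_vreg(0),
///         rn: vreg(1),
///         rm: vreg(2),
///         size: VectorSize::Size32x4,
///     };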
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecALUOp { + /// Signed saturating add + Sqadd, + /// Unsigned saturating add + Uqadd, + /// Signed saturating subtract + Sqsub, + /// Unsigned saturating subtract + Uqsub, + /// Compare bitwise equal + Cmeq, + /// Compare signed greater than or equal + Cmge, + /// Compare signed greater than + Cmgt, + /// Compare unsigned higher + Cmhs, + /// Compare unsigned higher or same + Cmhi, + /// Floating-point compare equal + Fcmeq, + /// Floating-point compare greater than + Fcmgt, + /// Floating-point compare greater than or equal + Fcmge, + /// Bitwise and + And, + /// Bitwise bit clear + Bic, + /// Bitwise inclusive or + Orr, + /// Bitwise exclusive or + Eor, + /// Bitwise select + Bsl, + /// Unsigned maximum pairwise + Umaxp, + /// Add + Add, + /// Subtract + Sub, + /// Multiply + Mul, + /// Signed shift left + Sshl, + /// Unsigned shift left + Ushl, + /// Unsigned minimum + Umin, + /// Signed minimum + Smin, + /// Unsigned maximum + Umax, + /// Signed maximum + Smax, + /// Unsigned rounding halving add + Urhadd, + /// Floating-point add + Fadd, + /// Floating-point subtract + Fsub, + /// Floating-point divide + Fdiv, + /// Floating-point maximum + Fmax, + /// Floating-point minimum + Fmin, + /// Floating-point multiply + Fmul, + /// Add pairwise + Addp, + /// Unsigned multiply add long + Umlal, + /// Zip vectors (primary) [meaning, high halves] + Zip1, + /// Signed multiply long (low halves) + Smull, + /// Signed multiply long (high halves) + Smull2, +} + +/// A Vector miscellaneous operation with two registers. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecMisc2 { + /// Bitwise NOT + Not, + /// Negate + Neg, + /// Absolute value + Abs, + /// Floating-point absolute value + Fabs, + /// Floating-point negate + Fneg, + /// Floating-point square root + Fsqrt, + /// Reverse elements in 64-bit doublewords + Rev64, + /// Shift left long (by element size) + Shll, + /// Floating-point convert to signed integer, rounding toward zero + Fcvtzs, + /// Floating-point convert to unsigned integer, rounding toward zero + Fcvtzu, + /// Signed integer convert to floating-point + Scvtf, + /// Unsigned integer convert to floating-point + Ucvtf, + /// Floating point round to integral, rounding towards nearest + Frintn, + /// Floating point round to integral, rounding towards zero + Frintz, + /// Floating point round to integral, rounding towards minus infinity + Frintm, + /// Floating point round to integral, rounding towards plus infinity + Frintp, +} + +/// A Vector narrowing operation with two registers. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecMiscNarrowOp { + /// Extract Narrow + Xtn, + /// Signed saturating extract narrow + Sqxtn, + /// Signed saturating extract unsigned narrow + Sqxtun, +} + +/// An operation across the lanes of vectors. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecLanesOp { + /// Integer addition across a vector + Addv, + /// Unsigned minimum across a vector + Uminv, +} + +/// A shift-by-immediate operation on each lane of a vector. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecShiftImmOp { + // Unsigned shift left + Shl, + // Unsigned shift right + Ushr, + // Signed shift right + Sshr, +} + +/// An operation on the bits of a register. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. 
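///
/// These are picked directly from the corresponding IR opcodes via the `From` impl below; a
/// sketch of that mapping combined with the `BitRR` format:
///
///     // Lowering an IR `clz` on an `i64` value: "clz x0, x1".
///     let op = BitOp::from((Opcode::Clz, I64)); // BitOp::Clz64
///     let inst = Inst::BitRR {
///         op,
///         rd: writable_xreg(0),
///         rn: xreg(1),
///     };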
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BitOp { + /// Bit reverse + RBit32, + /// Bit reverse + RBit64, + Clz32, + Clz64, + Cls32, + Cls64, +} + +impl BitOp { + /// What is the opcode's native width? + pub fn operand_size(&self) -> OperandSize { + match self { + BitOp::RBit32 | BitOp::Clz32 | BitOp::Cls32 => OperandSize::Size32, + _ => OperandSize::Size64, + } + } + + /// Get the assembly mnemonic for this opcode. + pub fn op_str(&self) -> &'static str { + match self { + BitOp::RBit32 | BitOp::RBit64 => "rbit", + BitOp::Clz32 | BitOp::Clz64 => "clz", + BitOp::Cls32 | BitOp::Cls64 => "cls", + } + } +} + +impl From<(Opcode, Type)> for BitOp { + /// Get the BitOp from the IR opcode. + fn from(op_ty: (Opcode, Type)) -> BitOp { + match op_ty { + (Opcode::Bitrev, I32) => BitOp::RBit32, + (Opcode::Bitrev, I64) => BitOp::RBit64, + (Opcode::Clz, I32) => BitOp::Clz32, + (Opcode::Clz, I64) => BitOp::Clz64, + (Opcode::Cls, I32) => BitOp::Cls32, + (Opcode::Cls, I64) => BitOp::Cls64, + _ => unreachable!("Called with non-bit op!: {:?}", op_ty), + } + } +} + +/// Additional information for (direct) Call instructions, left out of line to lower the size of +/// the Inst enum. +#[derive(Clone, Debug)] +pub struct CallInfo { + pub dest: ExternalName, + pub uses: Vec<Reg>, + pub defs: Vec<Writable<Reg>>, + pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, +} + +/// Additional information for CallInd instructions, left out of line to lower the size of the Inst +/// enum. +#[derive(Clone, Debug)] +pub struct CallIndInfo { + pub rn: Reg, + pub uses: Vec<Reg>, + pub defs: Vec<Writable<Reg>>, + pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, +} + +/// Additional information for JTSequence instructions, left out of line to lower the size of the Inst +/// enum. +#[derive(Clone, Debug)] +pub struct JTSequenceInfo { + pub targets: Vec<BranchTarget>, + pub default_target: BranchTarget, + pub targets_for_term: Vec<MachLabel>, // needed for MachTerminator. +} + +/// Instruction formats. +#[derive(Clone, Debug)] +pub enum Inst { + /// A no-op of zero size. + Nop0, + + /// A no-op that is one instruction large. + Nop4, + + /// An ALU operation with two register sources and a register destination. + AluRRR { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + }, + /// An ALU operation with three register sources and a register destination. + AluRRRR { + alu_op: ALUOp3, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + ra: Reg, + }, + /// An ALU operation with a register source and an immediate-12 source, and a register + /// destination. + AluRRImm12 { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + imm12: Imm12, + }, + /// An ALU operation with a register source and an immediate-logic source, and a register destination. + AluRRImmLogic { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + imml: ImmLogic, + }, + /// An ALU operation with a register source and an immediate-shiftamt source, and a register destination. + AluRRImmShift { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + immshift: ImmShift, + }, + /// An ALU operation with two register sources, one of which can be shifted, and a register + /// destination. + AluRRRShift { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + shiftop: ShiftOpAndAmt, + }, + /// An ALU operation with two register sources, one of which can be {zero,sign}-extended and + /// shifted, and a register destination. 
+ AluRRRExtend { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + extendop: ExtendOp, + }, + + /// A bit op instruction with a single register source. + BitRR { + op: BitOp, + rd: Writable<Reg>, + rn: Reg, + }, + + /// An unsigned (zero-extending) 8-bit load. + ULoad8 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// A signed (sign-extending) 8-bit load. + SLoad8 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// An unsigned (zero-extending) 16-bit load. + ULoad16 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// A signed (sign-extending) 16-bit load. + SLoad16 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// An unsigned (zero-extending) 32-bit load. + ULoad32 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// A signed (sign-extending) 32-bit load. + SLoad32 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// A 64-bit load. + ULoad64 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + + /// An 8-bit store. + Store8 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + /// A 16-bit store. + Store16 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + /// A 32-bit store. + Store32 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + /// A 64-bit store. + Store64 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + + /// A store of a pair of registers. + StoreP64 { + rt: Reg, + rt2: Reg, + mem: PairAMode, + flags: MemFlags, + }, + /// A load of a pair of registers. + LoadP64 { + rt: Writable<Reg>, + rt2: Writable<Reg>, + mem: PairAMode, + flags: MemFlags, + }, + + /// A MOV instruction. These are encoded as ORR's (AluRRR form) but we + /// keep them separate at the `Inst` level for better pretty-printing + /// and faster `is_move()` logic. + Mov64 { + rd: Writable<Reg>, + rm: Reg, + }, + + /// A 32-bit MOV. Zeroes the top 32 bits of the destination. This is + /// effectively an alias for an unsigned 32-to-64-bit extension. + Mov32 { + rd: Writable<Reg>, + rm: Reg, + }, + + /// A MOVZ with a 16-bit immediate. + MovZ { + rd: Writable<Reg>, + imm: MoveWideConst, + size: OperandSize, + }, + + /// A MOVN with a 16-bit immediate. + MovN { + rd: Writable<Reg>, + imm: MoveWideConst, + size: OperandSize, + }, + + /// A MOVK with a 16-bit immediate. + MovK { + rd: Writable<Reg>, + imm: MoveWideConst, + size: OperandSize, + }, + + /// A sign- or zero-extend operation. + Extend { + rd: Writable<Reg>, + rn: Reg, + signed: bool, + from_bits: u8, + to_bits: u8, + }, + + /// A conditional-select operation. + CSel { + rd: Writable<Reg>, + cond: Cond, + rn: Reg, + rm: Reg, + }, + + /// A conditional-set operation. + CSet { + rd: Writable<Reg>, + cond: Cond, + }, + + /// A conditional comparison with an immediate. + CCmpImm { + size: OperandSize, + rn: Reg, + imm: UImm5, + nzcv: NZCV, + cond: Cond, + }, + + /// A synthetic insn, which is a load-linked store-conditional loop, that has the overall + /// effect of atomically modifying a memory location in a particular way. Because we have + /// no way to explain to the regalloc about earlyclobber registers, this instruction has + /// completely fixed operand registers, and we rely on the RA's coalescing to remove copies + /// in the surrounding code to the extent it can. The sequence is both preceded and + /// followed by a fence which is at least as comprehensive as that of the `Fence` + /// instruction below. This instruction is sequentially consistent. 
The operand + /// conventions are: + /// + /// x25 (rd) address + /// x26 (rd) second operand for `op` + /// x27 (wr) old value + /// x24 (wr) scratch reg; value afterwards has no meaning + /// x28 (wr) scratch reg; value afterwards has no meaning + AtomicRMW { + ty: Type, // I8, I16, I32 or I64 + op: inst_common::AtomicRmwOp, + }, + + /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked + /// store-conditional loop. (Although we could possibly implement it more directly using + /// CAS insns that are available in some revisions of AArch64 above 8.0). The sequence is + /// both preceded and followed by a fence which is at least as comprehensive as that of the + /// `Fence` instruction below. This instruction is sequentially consistent. Note that the + /// operand conventions, although very similar to AtomicRMW, are different: + /// + /// x25 (rd) address + /// x26 (rd) expected value + /// x28 (rd) replacement value + /// x27 (wr) old value + /// x24 (wr) scratch reg; value afterwards has no meaning + AtomicCAS { + ty: Type, // I8, I16, I32 or I64 + }, + + /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it + /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as + /// that of the `Fence` instruction below. This instruction is sequentially consistent. + AtomicLoad { + ty: Type, // I8, I16, I32 or I64 + r_data: Writable<Reg>, + r_addr: Reg, + }, + + /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence + /// instruction following the store. The fence is at least as comprehensive as that of the + /// `Fence` instruction below. This instruction is sequentially consistent. + AtomicStore { + ty: Type, // I8, I16, I32 or I64 + r_data: Reg, + r_addr: Reg, + }, + + /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads + /// nor stores may move forwards or backwards across the fence. Currently emitted as "dmb + /// ish". This instruction is sequentially consistent. + Fence, + + /// FPU move. Note that this is distinct from a vector-register + /// move; moving just 64 bits seems to be significantly faster. + FpuMove64 { + rd: Writable<Reg>, + rn: Reg, + }, + + /// Vector register move. + FpuMove128 { + rd: Writable<Reg>, + rn: Reg, + }, + + /// Move to scalar from a vector element. + FpuMoveFromVec { + rd: Writable<Reg>, + rn: Reg, + idx: u8, + size: VectorSize, + }, + + /// 1-op FPU instruction. + FpuRR { + fpu_op: FPUOp1, + rd: Writable<Reg>, + rn: Reg, + }, + + /// 2-op FPU instruction. + FpuRRR { + fpu_op: FPUOp2, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + }, + + FpuRRI { + fpu_op: FPUOpRI, + rd: Writable<Reg>, + rn: Reg, + }, + + /// 3-op FPU instruction. + FpuRRRR { + fpu_op: FPUOp3, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + ra: Reg, + }, + + /// FPU comparison, single-precision (32 bit). + FpuCmp32 { + rn: Reg, + rm: Reg, + }, + + /// FPU comparison, double-precision (64 bit). + FpuCmp64 { + rn: Reg, + rm: Reg, + }, + + /// Floating-point load, single-precision (32 bit). + FpuLoad32 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// Floating-point store, single-precision (32 bit). + FpuStore32 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + /// Floating-point load, double-precision (64 bit). + FpuLoad64 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// Floating-point store, double-precision (64 bit). 
+ FpuStore64 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + /// Floating-point/vector load, 128 bit. + FpuLoad128 { + rd: Writable<Reg>, + mem: AMode, + flags: MemFlags, + }, + /// Floating-point/vector store, 128 bit. + FpuStore128 { + rd: Reg, + mem: AMode, + flags: MemFlags, + }, + + LoadFpuConst64 { + rd: Writable<Reg>, + const_data: u64, + }, + + LoadFpuConst128 { + rd: Writable<Reg>, + const_data: u128, + }, + + /// Conversion: FP -> integer. + FpuToInt { + op: FpuToIntOp, + rd: Writable<Reg>, + rn: Reg, + }, + + /// Conversion: integer -> FP. + IntToFpu { + op: IntToFpuOp, + rd: Writable<Reg>, + rn: Reg, + }, + + /// FP conditional select, 32 bit. + FpuCSel32 { + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + cond: Cond, + }, + /// FP conditional select, 64 bit. + FpuCSel64 { + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + cond: Cond, + }, + + /// Round to integer. + FpuRound { + op: FpuRoundMode, + rd: Writable<Reg>, + rn: Reg, + }, + + /// Move from a GPR to a vector register. The scalar value is parked in the lowest lane + /// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit + /// transactions are supported. + MovToFpu { + rd: Writable<Reg>, + rn: Reg, + size: ScalarSize, + }, + + /// Move to a vector element from a GPR. + MovToVec { + rd: Writable<Reg>, + rn: Reg, + idx: u8, + size: VectorSize, + }, + + /// Unsigned move from a vector element to a GPR. + MovFromVec { + rd: Writable<Reg>, + rn: Reg, + idx: u8, + size: VectorSize, + }, + + /// Signed move from a vector element to a GPR. + MovFromVecSigned { + rd: Writable<Reg>, + rn: Reg, + idx: u8, + size: VectorSize, + scalar_size: OperandSize, + }, + + /// Duplicate general-purpose register to vector. + VecDup { + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + }, + + /// Duplicate scalar to vector. + VecDupFromFpu { + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + }, + + /// Duplicate immediate to vector. + VecDupImm { + rd: Writable<Reg>, + imm: ASIMDMovModImm, + invert: bool, + size: VectorSize, + }, + + /// Vector extend. + VecExtend { + t: VecExtendOp, + rd: Writable<Reg>, + rn: Reg, + high_half: bool, + }, + + /// Move vector element to another vector element. + VecMovElement { + rd: Writable<Reg>, + rn: Reg, + dest_idx: u8, + src_idx: u8, + size: VectorSize, + }, + + /// Vector narrowing operation. + VecMiscNarrow { + op: VecMiscNarrowOp, + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + high_half: bool, + }, + + /// A vector ALU op. + VecRRR { + alu_op: VecALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + size: VectorSize, + }, + + /// Vector two register miscellaneous instruction. + VecMisc { + op: VecMisc2, + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + }, + + /// Vector instruction across lanes. + VecLanes { + op: VecLanesOp, + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + }, + + /// Vector shift by immediate: Shift Left (immediate), Unsigned Shift Right (immediate), + /// Signed Shift Right (immediate). These are somewhat unusual in that, for right shifts, + /// the allowed range of `imm` values is 1 to lane-size-in-bits, inclusive. A zero + /// right-shift cannot be encoded. Left shifts are "normal", though, having valid `imm` + /// values from 0 to lane-size-in-bits - 1 inclusive. + VecShiftImm { + op: VecShiftImmOp, + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + imm: u8, + }, + + /// Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes + /// of `rm` followed by the uppermost `16 - imm4` bytes of `rn`. 
+ VecExtract { + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + imm4: u8, + }, + + /// Table vector lookup - single register table. The table consists of 8-bit elements and is + /// stored in `rn`, while `rm` contains 8-bit element indices. `is_extension` specifies whether + /// to emit a TBX or a TBL instruction, i.e. whether to leave the elements in the destination + /// vector that correspond to out-of-range indices (greater than 15) unmodified or to set them + /// to 0. + VecTbl { + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + is_extension: bool, + }, + + /// Table vector lookup - two register table. The table consists of 8-bit elements and is + /// stored in `rn` and `rn2`, while `rm` contains 8-bit element indices. `is_extension` + /// specifies whether to emit a TBX or a TBL instruction, i.e. whether to leave the elements in + /// the destination vector that correspond to out-of-range indices (greater than 31) unmodified + /// or to set them to 0. The table registers `rn` and `rn2` must have consecutive numbers + /// modulo 32, that is v31 and v0 (in that order) are consecutive registers. + VecTbl2 { + rd: Writable<Reg>, + rn: Reg, + rn2: Reg, + rm: Reg, + is_extension: bool, + }, + + /// Load an element and replicate to all lanes of a vector. + VecLoadReplicate { + rd: Writable<Reg>, + rn: Reg, + size: VectorSize, + }, + + /// Vector conditional select, 128 bit. A synthetic instruction, which generates a 4-insn + /// control-flow diamond. + VecCSel { + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + cond: Cond, + }, + + /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). + MovToNZCV { + rn: Reg, + }, + + /// Move from the NZCV flags (actually a `MRS Xn, NZCV` insn). + MovFromNZCV { + rd: Writable<Reg>, + }, + + /// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation + /// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the + /// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit + /// target. + Call { + info: Box<CallInfo>, + }, + /// A machine indirect-call instruction. + CallInd { + info: Box<CallIndInfo>, + }, + + // ---- branches (exactly one must appear at end of BB) ---- + /// A machine return instruction. + Ret, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder, + + /// An unconditional branch. + Jump { + dest: BranchTarget, + }, + + /// A conditional branch. Contains two targets; at emission time, both are emitted, but + /// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the + /// choice of taken/not_taken (inverting the branch polarity as needed) based on the + /// fallthrough at the time of lowering. + CondBr { + taken: BranchTarget, + not_taken: BranchTarget, + kind: CondBrKind, + }, + + /// A conditional trap: execute a `udf` if the condition is true. This is + /// one VCode instruction because it uses embedded control flow; it is + /// logically a single-in, single-out region, but needs to appear as one + /// unit to the register allocator. + /// + /// The `CondBrKind` gives the conditional-branch condition that will + /// *execute* the embedded `Inst`. (In the emitted code, we use the inverse + /// of this condition in a branch that skips the trap instruction.) + TrapIf { + kind: CondBrKind, + trap_code: TrapCode, + }, + + /// An indirect branch through a register, augmented with set of all + /// possible successors. 
+ IndirectBr { + rn: Reg, + targets: Vec<MachLabel>, + }, + + /// A "break" instruction, used for e.g. traps and debug breakpoints. + Brk, + + /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at + /// runtime. + Udf { + trap_code: TrapCode, + }, + + /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR` + /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is + /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may + /// need full `MemLabel` support. + Adr { + rd: Writable<Reg>, + /// Offset in range -2^20 .. 2^20. + off: i32, + }, + + /// Raw 32-bit word, used for inline constants and jump-table entries. + Word4 { + data: u32, + }, + + /// Raw 64-bit word, used for inline constants. + Word8 { + data: u64, + }, + + /// Jump-table sequence, as one compound instruction (see note in lower_inst.rs for rationale). + JTSequence { + info: Box<JTSequenceInfo>, + ridx: Reg, + rtmp1: Writable<Reg>, + rtmp2: Writable<Reg>, + }, + + /// Load an inline symbol reference. + LoadExtName { + rd: Writable<Reg>, + name: Box<ExternalName>, + offset: i64, + }, + + /// Load address referenced by `mem` into `rd`. + LoadAddr { + rd: Writable<Reg>, + mem: AMode, + }, + + /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This + /// controls how AMode::NominalSPOffset args are lowered. + VirtualSPOffsetAdj { + offset: i64, + }, + + /// Meta-insn, no-op in generated code: emit constant/branch veneer island + /// at this point (with a guard jump around it) if less than the needed + /// space is available before the next branch deadline. See the `MachBuffer` + /// implementation in `machinst/buffer.rs` for the overall algorithm. In + /// brief, we retain a set of "pending/unresolved label references" from + /// branches as we scan forward through instructions to emit machine code; + /// if we notice we're about to go out of range on an unresolved reference, + /// we stop, emit a bunch of "veneers" (branches in a form that has a longer + /// range, e.g. a 26-bit-offset unconditional jump), and point the original + /// label references to those. This is an "island" because it comes in the + /// middle of the code. + /// + /// This meta-instruction is a necessary part of the logic that determines + /// where to place islands. Ordinarily, we want to place them between basic + /// blocks, so we compute the worst-case size of each block, and emit the + /// island before starting a block if we would exceed a deadline before the + /// end of the block. However, some sequences (such as an inline jumptable) + /// are variable-length and not accounted for by this logic; so these + /// lowered sequences include an `EmitIsland` to trigger island generation + /// where necessary. + EmitIsland { + /// The needed space before the next deadline. + needed_space: CodeOffset, + }, +} + +fn count_zero_half_words(mut value: u64, num_half_words: u8) -> usize { + let mut count = 0; + for _ in 0..num_half_words { + if value & 0xffff == 0 { + count += 1; + } + value >>= 16; + } + + count +} + +#[test] +fn inst_size_test() { + // This test will help with unintentionally growing the size + // of the Inst enum. + assert_eq!(32, std::mem::size_of::<Inst>()); +} + +impl Inst { + /// Create a move instruction. 
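///
/// The register class of the operands selects the concrete form: integer registers become
/// `Mov64` (encoded as an ORR), vector registers become `FpuMove128`. A sketch, assuming the
/// `vreg`/`writable_vreg` helpers from `regs`:
///
///     let gpr_move = Inst::mov(writable_xreg(0), xreg(1)); // Inst::Mov64 { .. }
///     let vec_move = Inst::mov(writable_vreg(0), vreg(1)); // Inst::FpuMove128 { .. }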
+ pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst { + assert!(to_reg.to_reg().get_class() == from_reg.get_class()); + if from_reg.get_class() == RegClass::I64 { + Inst::Mov64 { + rd: to_reg, + rm: from_reg, + } + } else if from_reg.get_class() == RegClass::V128 { + Inst::FpuMove128 { + rd: to_reg, + rn: from_reg, + } + } else { + Inst::FpuMove64 { + rd: to_reg, + rn: from_reg, + } + } + } + + /// Create a 32-bit move instruction. + pub fn mov32(to_reg: Writable<Reg>, from_reg: Reg) -> Inst { + Inst::Mov32 { + rd: to_reg, + rm: from_reg, + } + } + + /// Create an instruction that loads a constant, using one of serveral options (MOVZ, MOVN, + /// logical immediate, or constant pool). + pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> { + if let Some(imm) = MoveWideConst::maybe_from_u64(value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVZ + smallvec![Inst::MovZ { + rd, + imm, + size: OperandSize::Size64 + }] + } else if let Some(imm) = MoveWideConst::maybe_from_u64(!value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVN + smallvec![Inst::MovN { + rd, + imm, + size: OperandSize::Size64 + }] + } else if let Some(imml) = ImmLogic::maybe_from_u64(value, I64) { + // Weird logical-instruction immediate in ORI using zero register + smallvec![Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd, + rn: zero_reg(), + imml, + }] + } else { + let mut insts = smallvec![]; + + // If the top 32 bits are zero, use 32-bit `mov` operations. + let (num_half_words, size, negated) = if value >> 32 == 0 { + (2, OperandSize::Size32, (!value << 32) >> 32) + } else { + (4, OperandSize::Size64, !value) + }; + // If the number of 0xffff half words is greater than the number of 0x0000 half words + // it is more efficient to use `movn` for the first instruction. + let first_is_inverted = count_zero_half_words(negated, num_half_words) + > count_zero_half_words(value, num_half_words); + // Either 0xffff or 0x0000 half words can be skipped, depending on the first + // instruction used. + let ignored_halfword = if first_is_inverted { 0xffff } else { 0 }; + let mut first_mov_emitted = false; + + for i in 0..num_half_words { + let imm16 = (value >> (16 * i)) & 0xffff; + if imm16 != ignored_halfword { + if !first_mov_emitted { + first_mov_emitted = true; + if first_is_inverted { + let imm = + MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16) + .unwrap(); + insts.push(Inst::MovN { rd, imm, size }); + } else { + let imm = + MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovZ { rd, imm, size }); + } + } else { + let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovK { rd, imm, size }); + } + } + } + + assert!(first_mov_emitted); + + insts + } + } + + /// Create instructions that load a 32-bit floating-point constant. + pub fn load_fp_constant32<F: FnMut(RegClass, Type) -> Writable<Reg>>( + rd: Writable<Reg>, + value: u32, + mut alloc_tmp: F, + ) -> SmallVec<[Inst; 4]> { + if value == 0 { + smallvec![Inst::VecDupImm { + rd, + imm: ASIMDMovModImm::zero(), + invert: false, + size: VectorSize::Size8x8 + }] + } else { + // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent + // bits. 
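            // (The FMOV-immediate form can only encode values of roughly the shape
            // +/-(1 + m/16) * 2^e with m in 0..=15 and e in -3..=4, e.g. 0.5, 1.0, 2.5,
            // so an arbitrary 32-bit pattern still takes the GPR round-trip below.)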
+ let tmp = alloc_tmp(RegClass::I64, I32); + let mut insts = Inst::load_constant(tmp, value as u64); + + insts.push(Inst::MovToFpu { + rd, + rn: tmp.to_reg(), + size: ScalarSize::Size64, + }); + + insts + } + } + + /// Create instructions that load a 64-bit floating-point constant. + pub fn load_fp_constant64<F: FnMut(RegClass, Type) -> Writable<Reg>>( + rd: Writable<Reg>, + const_data: u64, + mut alloc_tmp: F, + ) -> SmallVec<[Inst; 4]> { + if let Ok(const_data) = u32::try_from(const_data) { + Inst::load_fp_constant32(rd, const_data, alloc_tmp) + // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent + // bits. Also, treat it as half of a 128-bit vector and consider replicated + // patterns. Scalar MOVI might also be an option. + } else if const_data & (u32::MAX as u64) == 0 { + let tmp = alloc_tmp(RegClass::I64, I64); + let mut insts = Inst::load_constant(tmp, const_data); + + insts.push(Inst::MovToFpu { + rd, + rn: tmp.to_reg(), + size: ScalarSize::Size64, + }); + + insts + } else { + smallvec![Inst::LoadFpuConst64 { rd, const_data }] + } + } + + /// Create instructions that load a 128-bit vector constant. + pub fn load_fp_constant128<F: FnMut(RegClass, Type) -> Writable<Reg>>( + rd: Writable<Reg>, + const_data: u128, + alloc_tmp: F, + ) -> SmallVec<[Inst; 5]> { + if let Ok(const_data) = u64::try_from(const_data) { + SmallVec::from(&Inst::load_fp_constant64(rd, const_data, alloc_tmp)[..]) + } else if let Some((pattern, size)) = + Inst::get_replicated_vector_pattern(const_data, ScalarSize::Size64) + { + Inst::load_replicated_vector_pattern( + rd, + pattern, + VectorSize::from_lane_size(size, true), + alloc_tmp, + ) + } else { + smallvec![Inst::LoadFpuConst128 { rd, const_data }] + } + } + + /// Determine whether a 128-bit constant represents a vector consisting of elements with + /// the same value. + pub fn get_replicated_vector_pattern( + value: u128, + size: ScalarSize, + ) -> Option<(u64, ScalarSize)> { + let (mask, shift, next_size) = match size { + ScalarSize::Size8 => (u8::MAX as u128, 8, ScalarSize::Size128), + ScalarSize::Size16 => (u16::MAX as u128, 16, ScalarSize::Size8), + ScalarSize::Size32 => (u32::MAX as u128, 32, ScalarSize::Size16), + ScalarSize::Size64 => (u64::MAX as u128, 64, ScalarSize::Size32), + _ => return None, + }; + let mut r = None; + let v = value & mask; + + if (value >> shift) & mask == v { + r = Inst::get_replicated_vector_pattern(v, next_size); + + if r.is_none() { + r = Some((v as u64, size)); + } + } + + r + } + + /// Create instructions that load a 128-bit vector constant consisting of elements with + /// the same value. 
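///
/// The replicated element is typically the one reported by `get_replicated_vector_pattern`
/// above (with its `ScalarSize` widened to a `VectorSize` by the caller), for instance:
///
///     let pat = Inst::get_replicated_vector_pattern(
///         0x4242_4242_4242_4242_4242_4242_4242_4242,
///         ScalarSize::Size64,
///     );
///     // pat == Some((0x42, ScalarSize::Size8)): every byte is 0x42, so a single
///     // replicated 8-bit element describes the whole 128-bit constant.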
+ pub fn load_replicated_vector_pattern<F: FnMut(RegClass, Type) -> Writable<Reg>>( + rd: Writable<Reg>, + pattern: u64, + size: VectorSize, + mut alloc_tmp: F, + ) -> SmallVec<[Inst; 5]> { + let lane_size = size.lane_size(); + + if let Some(imm) = ASIMDMovModImm::maybe_from_u64(pattern, lane_size) { + smallvec![Inst::VecDupImm { + rd, + imm, + invert: false, + size + }] + } else if let Some(imm) = ASIMDMovModImm::maybe_from_u64(!pattern, lane_size) { + debug_assert_ne!(lane_size, ScalarSize::Size8); + debug_assert_ne!(lane_size, ScalarSize::Size64); + + smallvec![Inst::VecDupImm { + rd, + imm, + invert: true, + size + }] + } else { + let tmp = alloc_tmp(RegClass::I64, I64); + let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]); + + insts.push(Inst::VecDup { + rd, + rn: tmp.to_reg(), + size, + }); + + insts + } + } + + /// Generic constructor for a load (zero-extending where appropriate). + pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst { + match ty { + B1 | B8 | I8 => Inst::ULoad8 { + rd: into_reg, + mem, + flags, + }, + B16 | I16 => Inst::ULoad16 { + rd: into_reg, + mem, + flags, + }, + B32 | I32 | R32 => Inst::ULoad32 { + rd: into_reg, + mem, + flags, + }, + B64 | I64 | R64 => Inst::ULoad64 { + rd: into_reg, + mem, + flags, + }, + F32 => Inst::FpuLoad32 { + rd: into_reg, + mem, + flags, + }, + F64 => Inst::FpuLoad64 { + rd: into_reg, + mem, + flags, + }, + _ => { + if ty.is_vector() { + let bits = ty_bits(ty); + let rd = into_reg; + + if bits == 128 { + Inst::FpuLoad128 { rd, mem, flags } + } else { + assert_eq!(bits, 64); + Inst::FpuLoad64 { rd, mem, flags } + } + } else { + unimplemented!("gen_load({})", ty); + } + } + } + } + + /// Generic constructor for a store. + pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst { + match ty { + B1 | B8 | I8 => Inst::Store8 { + rd: from_reg, + mem, + flags, + }, + B16 | I16 => Inst::Store16 { + rd: from_reg, + mem, + flags, + }, + B32 | I32 | R32 => Inst::Store32 { + rd: from_reg, + mem, + flags, + }, + B64 | I64 | R64 => Inst::Store64 { + rd: from_reg, + mem, + flags, + }, + F32 => Inst::FpuStore32 { + rd: from_reg, + mem, + flags, + }, + F64 => Inst::FpuStore64 { + rd: from_reg, + mem, + flags, + }, + _ => { + if ty.is_vector() { + let bits = ty_bits(ty); + let rd = from_reg; + + if bits == 128 { + Inst::FpuStore128 { rd, mem, flags } + } else { + assert_eq!(bits, 64); + Inst::FpuStore64 { rd, mem, flags } + } + } else { + unimplemented!("gen_store({})", ty); + } + } + } + } +} + +//============================================================================= +// Instructions: get_regs + +fn memarg_regs(memarg: &AMode, collector: &mut RegUsageCollector) { + match memarg { + &AMode::Unscaled(reg, ..) | &AMode::UnsignedOffset(reg, ..) => { + collector.add_use(reg); + } + &AMode::RegReg(r1, r2, ..) + | &AMode::RegScaled(r1, r2, ..) + | &AMode::RegScaledExtended(r1, r2, ..) + | &AMode::RegExtended(r1, r2, ..) => { + collector.add_use(r1); + collector.add_use(r2); + } + &AMode::Label(..) => {} + &AMode::PreIndexed(reg, ..) | &AMode::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + &AMode::FPOffset(..) => { + collector.add_use(fp_reg()); + } + &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => { + collector.add_use(stack_reg()); + } + &AMode::RegOffset(r, ..) => { + collector.add_use(r); + } + } +} + +fn pairmemarg_regs(pairmemarg: &PairAMode, collector: &mut RegUsageCollector) { + match pairmemarg { + &PairAMode::SignedOffset(reg, ..) 
=> { + collector.add_use(reg); + } + &PairAMode::PreIndexed(reg, ..) | &PairAMode::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + } +} + +fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + match inst { + &Inst::AluRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::AluRRImm12 { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmLogic { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmShift { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRRShift { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRExtend { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::BitRR { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::ULoad8 { rd, ref mem, .. } + | &Inst::SLoad8 { rd, ref mem, .. } + | &Inst::ULoad16 { rd, ref mem, .. } + | &Inst::SLoad16 { rd, ref mem, .. } + | &Inst::ULoad32 { rd, ref mem, .. } + | &Inst::SLoad32 { rd, ref mem, .. } + | &Inst::ULoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::Store8 { rd, ref mem, .. } + | &Inst::Store16 { rd, ref mem, .. } + | &Inst::Store32 { rd, ref mem, .. } + | &Inst::Store64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::StoreP64 { + rt, rt2, ref mem, .. + } => { + collector.add_use(rt); + collector.add_use(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::LoadP64 { + rt, rt2, ref mem, .. + } => { + collector.add_def(rt); + collector.add_def(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::Mov64 { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::Mov32 { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::MovZ { rd, .. } | &Inst::MovN { rd, .. } => { + collector.add_def(rd); + } + &Inst::MovK { rd, .. } => { + collector.add_mod(rd); + } + &Inst::CSel { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::CSet { rd, .. } => { + collector.add_def(rd); + } + &Inst::CCmpImm { rn, .. } => { + collector.add_use(rn); + } + &Inst::AtomicRMW { .. } => { + collector.add_use(xreg(25)); + collector.add_use(xreg(26)); + collector.add_def(writable_xreg(24)); + collector.add_def(writable_xreg(27)); + collector.add_def(writable_xreg(28)); + } + &Inst::AtomicCAS { .. } => { + collector.add_use(xreg(25)); + collector.add_use(xreg(26)); + collector.add_use(xreg(28)); + collector.add_def(writable_xreg(24)); + collector.add_def(writable_xreg(27)); + } + &Inst::AtomicLoad { r_data, r_addr, .. } => { + collector.add_use(r_addr); + collector.add_def(r_data); + } + &Inst::AtomicStore { r_data, r_addr, .. } => { + collector.add_use(r_addr); + collector.add_use(r_data); + } + &Inst::Fence {} => {} + &Inst::FpuMove64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuMove128 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuMoveFromVec { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRR { rd, rn, .. 
} => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRRI { fpu_op, rd, rn, .. } => { + match fpu_op { + FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd), + FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd), + } + collector.add_use(rn); + } + &Inst::FpuRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::VecMisc { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + + &Inst::VecLanes { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecShiftImm { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecExtract { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::VecTbl { + rd, + rn, + rm, + is_extension, + } => { + collector.add_use(rn); + collector.add_use(rm); + + if is_extension { + collector.add_mod(rd); + } else { + collector.add_def(rd); + } + } + &Inst::VecTbl2 { + rd, + rn, + rn2, + rm, + is_extension, + } => { + collector.add_use(rn); + collector.add_use(rn2); + collector.add_use(rm); + + if is_extension { + collector.add_mod(rd); + } else { + collector.add_def(rd); + } + } + &Inst::VecLoadReplicate { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecCSel { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad128 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore128 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => { + collector.add_def(rd); + } + &Inst::FpuToInt { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::IntToFpu { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuCSel32 { rd, rn, rm, .. } | &Inst::FpuCSel64 { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRound { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovToFpu { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovToVec { rd, rn, .. } => { + collector.add_mod(rd); + collector.add_use(rn); + } + &Inst::MovFromVec { rd, rn, .. } | &Inst::MovFromVecSigned { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecDup { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecDupFromFpu { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecDupImm { rd, .. } => { + collector.add_def(rd); + } + &Inst::VecExtend { rd, rn, .. 
} => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecMovElement { rd, rn, .. } => { + collector.add_mod(rd); + collector.add_use(rn); + } + &Inst::VecMiscNarrow { + rd, rn, high_half, .. + } => { + collector.add_use(rn); + + if high_half { + collector.add_mod(rd); + } else { + collector.add_def(rd); + } + } + &Inst::VecRRR { + alu_op, rd, rn, rm, .. + } => { + if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + collector.add_mod(rd); + } else { + collector.add_def(rd); + } + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::MovToNZCV { rn } => { + collector.add_use(rn); + } + &Inst::MovFromNZCV { rd } => { + collector.add_def(rd); + } + &Inst::Extend { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::Jump { .. } | &Inst::Ret | &Inst::EpiloguePlaceholder => {} + &Inst::Call { ref info, .. } => { + collector.add_uses(&*info.uses); + collector.add_defs(&*info.defs); + } + &Inst::CallInd { ref info, .. } => { + collector.add_uses(&*info.uses); + collector.add_defs(&*info.defs); + collector.add_use(info.rn); + } + &Inst::CondBr { ref kind, .. } => match kind { + CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { + collector.add_use(*rt); + } + CondBrKind::Cond(_) => {} + }, + &Inst::IndirectBr { rn, .. } => { + collector.add_use(rn); + } + &Inst::Nop0 | Inst::Nop4 => {} + &Inst::Brk => {} + &Inst::Udf { .. } => {} + &Inst::TrapIf { ref kind, .. } => match kind { + CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { + collector.add_use(*rt); + } + CondBrKind::Cond(_) => {} + }, + &Inst::Adr { rd, .. } => { + collector.add_def(rd); + } + &Inst::Word4 { .. } | &Inst::Word8 { .. } => {} + &Inst::JTSequence { + ridx, rtmp1, rtmp2, .. + } => { + collector.add_use(ridx); + collector.add_def(rtmp1); + collector.add_def(rtmp2); + } + &Inst::LoadExtName { rd, .. } => { + collector.add_def(rd); + } + &Inst::LoadAddr { rd, ref mem } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::VirtualSPOffsetAdj { .. } => {} + &Inst::EmitIsland { .. } => {} + } +} + +//============================================================================= +// Instructions: map_regs + +fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) { + fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) { + if r.is_virtual() { + let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg(); + *r = new; + } + } + + fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if r.to_reg().is_virtual() { + let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg(); + *r = Writable::from_reg(new); + } + } + + fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if r.to_reg().is_virtual() { + let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg(); + *r = Writable::from_reg(new); + } + } + + fn map_mem<RUM: RegUsageMapper>(m: &RUM, mem: &mut AMode) { + // N.B.: we take only the pre-map here, but this is OK because the + // only addressing modes that update registers (pre/post-increment on + // AArch64) both read and write registers, so they are "mods" rather + // than "defs", so must be the same in both the pre- and post-map. + match mem { + &mut AMode::Unscaled(ref mut reg, ..) => map_use(m, reg), + &mut AMode::UnsignedOffset(ref mut reg, ..) => map_use(m, reg), + &mut AMode::RegReg(ref mut r1, ref mut r2) + | &mut AMode::RegScaled(ref mut r1, ref mut r2, ..) + | &mut AMode::RegScaledExtended(ref mut r1, ref mut r2, ..) + | &mut AMode::RegExtended(ref mut r1, ref mut r2, ..) 
=> { + map_use(m, r1); + map_use(m, r2); + } + &mut AMode::Label(..) => {} + &mut AMode::PreIndexed(ref mut r, ..) => map_mod(m, r), + &mut AMode::PostIndexed(ref mut r, ..) => map_mod(m, r), + &mut AMode::FPOffset(..) + | &mut AMode::SPOffset(..) + | &mut AMode::NominalSPOffset(..) => {} + &mut AMode::RegOffset(ref mut r, ..) => map_use(m, r), + }; + } + + fn map_pairmem<RUM: RegUsageMapper>(m: &RUM, mem: &mut PairAMode) { + match mem { + &mut PairAMode::SignedOffset(ref mut reg, ..) => map_use(m, reg), + &mut PairAMode::PreIndexed(ref mut reg, ..) => map_def(m, reg), + &mut PairAMode::PostIndexed(ref mut reg, ..) => map_def(m, reg), + } + } + + fn map_br<RUM: RegUsageMapper>(m: &RUM, br: &mut CondBrKind) { + match br { + &mut CondBrKind::Zero(ref mut reg) => map_use(m, reg), + &mut CondBrKind::NotZero(ref mut reg) => map_use(m, reg), + &mut CondBrKind::Cond(..) => {} + }; + } + + match inst { + &mut Inst::AluRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::AluRRRR { + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + map_use(mapper, ra); + } + &mut Inst::AluRRImm12 { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::AluRRImmLogic { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::AluRRImmShift { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::AluRRRShift { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::AluRRRExtend { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::BitRR { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::ULoad8 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::SLoad8 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::ULoad16 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::SLoad16 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::ULoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::SLoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + + &mut Inst::ULoad64 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::Store8 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::Store16 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::Store32 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::Store64 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + + &mut Inst::StoreP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. + } => { + map_use(mapper, rt); + map_use(mapper, rt2); + map_pairmem(mapper, mem); + } + &mut Inst::LoadP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + .. 
+ } => { + map_def(mapper, rt); + map_def(mapper, rt2); + map_pairmem(mapper, mem); + } + &mut Inst::Mov64 { + ref mut rd, + ref mut rm, + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::Mov32 { + ref mut rd, + ref mut rm, + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::MovZ { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::MovN { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::MovK { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::CSel { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::CSet { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::CCmpImm { ref mut rn, .. } => { + map_use(mapper, rn); + } + &mut Inst::AtomicRMW { .. } => { + // There are no vregs to map in this insn. + } + &mut Inst::AtomicCAS { .. } => { + // There are no vregs to map in this insn. + } + &mut Inst::AtomicLoad { + ref mut r_data, + ref mut r_addr, + .. + } => { + map_def(mapper, r_data); + map_use(mapper, r_addr); + } + &mut Inst::AtomicStore { + ref mut r_data, + ref mut r_addr, + .. + } => { + map_use(mapper, r_data); + map_use(mapper, r_addr); + } + &mut Inst::Fence {} => {} + &mut Inst::FpuMove64 { + ref mut rd, + ref mut rn, + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::FpuMove128 { + ref mut rd, + ref mut rn, + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::FpuMoveFromVec { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::FpuRR { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::FpuRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuRRI { + fpu_op, + ref mut rd, + ref mut rn, + .. + } => { + match fpu_op { + FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => map_def(mapper, rd), + FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => map_mod(mapper, rd), + } + map_use(mapper, rn); + } + &mut Inst::FpuRRRR { + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + map_use(mapper, ra); + } + &mut Inst::VecMisc { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecLanes { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecShiftImm { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecExtract { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::VecTbl { + ref mut rd, + ref mut rn, + ref mut rm, + is_extension, + } => { + map_use(mapper, rn); + map_use(mapper, rm); + + if is_extension { + map_mod(mapper, rd); + } else { + map_def(mapper, rd); + } + } + &mut Inst::VecTbl2 { + ref mut rd, + ref mut rn, + ref mut rn2, + ref mut rm, + is_extension, + } => { + map_use(mapper, rn); + map_use(mapper, rn2); + map_use(mapper, rm); + + if is_extension { + map_mod(mapper, rd); + } else { + map_def(mapper, rd); + } + } + &mut Inst::VecLoadReplicate { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecCSel { + ref mut rd, + ref mut rn, + ref mut rm, + .. 
+ } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuCmp32 { + ref mut rn, + ref mut rm, + } => { + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuCmp64 { + ref mut rn, + ref mut rm, + } => { + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuLoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::FpuLoad64 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::FpuLoad128 { + ref mut rd, + ref mut mem, + .. + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::FpuStore32 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::FpuStore64 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::FpuStore128 { + ref mut rd, + ref mut mem, + .. + } => { + map_use(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::LoadFpuConst64 { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::LoadFpuConst128 { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::FpuToInt { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::IntToFpu { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::FpuCSel32 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuCSel64 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::FpuRound { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::MovToFpu { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::MovToVec { + ref mut rd, + ref mut rn, + .. + } => { + map_mod(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::MovFromVec { + ref mut rd, + ref mut rn, + .. + } + | &mut Inst::MovFromVecSigned { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecDup { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecDupFromFpu { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecDupImm { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::VecExtend { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecMovElement { + ref mut rd, + ref mut rn, + .. + } => { + map_mod(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::VecMiscNarrow { + ref mut rd, + ref mut rn, + high_half, + .. + } => { + map_use(mapper, rn); + + if high_half { + map_mod(mapper, rd); + } else { + map_def(mapper, rd); + } + } + &mut Inst::VecRRR { + alu_op, + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal { + map_mod(mapper, rd); + } else { + map_def(mapper, rd); + } + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::MovToNZCV { ref mut rn } => { + map_use(mapper, rn); + } + &mut Inst::MovFromNZCV { ref mut rd } => { + map_def(mapper, rd); + } + &mut Inst::Extend { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::Jump { .. 
} => {} + &mut Inst::Call { ref mut info } => { + for r in info.uses.iter_mut() { + map_use(mapper, r); + } + for r in info.defs.iter_mut() { + map_def(mapper, r); + } + } + &mut Inst::Ret | &mut Inst::EpiloguePlaceholder => {} + &mut Inst::CallInd { ref mut info, .. } => { + for r in info.uses.iter_mut() { + map_use(mapper, r); + } + for r in info.defs.iter_mut() { + map_def(mapper, r); + } + map_use(mapper, &mut info.rn); + } + &mut Inst::CondBr { ref mut kind, .. } => { + map_br(mapper, kind); + } + &mut Inst::IndirectBr { ref mut rn, .. } => { + map_use(mapper, rn); + } + &mut Inst::Nop0 | &mut Inst::Nop4 | &mut Inst::Brk | &mut Inst::Udf { .. } => {} + &mut Inst::TrapIf { ref mut kind, .. } => { + map_br(mapper, kind); + } + &mut Inst::Adr { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::Word4 { .. } | &mut Inst::Word8 { .. } => {} + &mut Inst::JTSequence { + ref mut ridx, + ref mut rtmp1, + ref mut rtmp2, + .. + } => { + map_use(mapper, ridx); + map_def(mapper, rtmp1); + map_def(mapper, rtmp2); + } + &mut Inst::LoadExtName { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::LoadAddr { + ref mut rd, + ref mut mem, + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::VirtualSPOffsetAdj { .. } => {} + &mut Inst::EmitIsland { .. } => {} + } +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + type LabelUse = LabelUse; + + fn get_regs(&self, collector: &mut RegUsageCollector) { + aarch64_get_regs(self, collector) + } + + fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + aarch64_map_regs(self, mapper); + } + + fn is_move(&self) -> Option<(Writable<Reg>, Reg)> { + match self { + &Inst::Mov64 { rd, rm } => Some((rd, rm)), + &Inst::FpuMove64 { rd, rn } => Some((rd, rn)), + &Inst::FpuMove128 { rd, rn } => Some((rd, rn)), + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Inst::EpiloguePlaceholder = self { + true + } else { + false + } + } + + fn is_included_in_clobbers(&self) -> bool { + // We exclude call instructions from the clobber-set when they are calls + // from caller to callee with the same ABI. Such calls cannot possibly + // force any new registers to be saved in the prologue, because anything + // that the callee clobbers, the caller is also allowed to clobber. This + // both saves work and enables us to more precisely follow the + // half-caller-save, half-callee-save SysV ABI for some vector + // registers. + // + // See the note in [crate::isa::aarch64::abi::is_caller_save_reg] for + // more information on this ABI-implementation hack. + match self { + &Inst::Call { ref info } => info.caller_callconv != info.callee_callconv, + &Inst::CallInd { ref info } => info.caller_callconv != info.callee_callconv, + _ => true, + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()), + &Inst::CondBr { + taken, not_taken, .. + } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()), + &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]), + &Inst::JTSequence { ref info, .. 
} => { + MachTerminator::Indirect(&info.targets_for_term[..]) + } + _ => MachTerminator::None, + } + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst { + assert!(ty.bits() <= 128); + Inst::mov(to_reg, from_reg) + } + + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + alloc_tmp: F, + ) -> SmallVec<[Inst; 4]> { + if ty == F64 { + Inst::load_fp_constant64(to_reg, value, alloc_tmp) + } else if ty == F32 { + Inst::load_fp_constant32(to_reg, value as u32, alloc_tmp) + } else { + // Must be an integer type. + debug_assert!( + ty == B1 + || ty == I8 + || ty == B8 + || ty == I16 + || ty == B16 + || ty == I32 + || ty == B32 + || ty == I64 + || ty == B64 + || ty == R32 + || ty == R64 + ); + Inst::load_constant(to_reg, value) + } + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop0 + } + + fn gen_nop(preferred_size: usize) -> Inst { + // We can't give a NOP (or any insn) < 4 bytes. + assert!(preferred_size >= 4); + Inst::Nop4 + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> { + None + } + + fn rc_for_type(ty: Type) -> CodegenResult<RegClass> { + match ty { + I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 | R32 | R64 => Ok(RegClass::I64), + F32 | F64 => Ok(RegClass::V128), + IFLAGS | FFLAGS => Ok(RegClass::I64), + B8X16 | I8X16 | B16X8 | I16X8 | B32X4 | I32X4 | B64X2 | I64X2 | F32X4 | F64X2 => { + Ok(RegClass::V128) + } + _ => Err(CodegenError::Unsupported(format!( + "Unexpected SSA-value type: {}", + ty + ))), + } + } + + fn gen_jump(target: MachLabel) -> Inst { + Inst::Jump { + dest: BranchTarget::Label(target), + } + } + + fn reg_universe(flags: &settings::Flags) -> RealRegUniverse { + create_reg_universe(flags) + } + + fn worst_case_size() -> CodeOffset { + // The maximum size, in bytes, of any `Inst`'s emitted code. We have at least one case of + // an 8-instruction sequence (saturating int-to-float conversions) with three embedded + // 64-bit f64 constants. + // + // Note that inline jump-tables handle island/pool insertion separately, so we do not need + // to account for them here (otherwise the worst case would be 2^31 * 4, clearly not + // feasible for other reasons). + 44 + } + + fn ref_type_regclass(_: &settings::Flags) -> RegClass { + RegClass::I64 + } +} + +//============================================================================= +// Pretty-printing of instructions. 
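+//
+// Illustrative sketch (not from the original sources; assumes a
+// `flags: settings::Flags` value is in scope): with the real-register
+// universe from `create_reg_universe`, `show_rru` renders an `Inst` as
+// assembly-like text. For a simple 64-bit ALU instruction:
+//
+//     let inst = Inst::AluRRR {
+//         alu_op: ALUOp::Add64,
+//         rd: writable_xreg(0),
+//         rn: xreg(1),
+//         rm: xreg(2),
+//     };
+//     let rru = create_reg_universe(&flags);
+//     // Prints the operands at their 64-bit (X-register) names.
+//     assert_eq!("add x0, x1, x2", inst.show_rru(Some(&rru)));
+//
+// Passing `None` instead of `Some(&rru)` would show virtual-register names
+// rather than architectural ones, which is what the pretty-printer below
+// does for not-yet-allocated code.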
+ +fn mem_finalize_for_show( + mem: &AMode, + mb_rru: Option<&RealRegUniverse>, + state: &EmitState, +) -> (String, AMode) { + let (mem_insts, mem) = mem_finalize(0, mem, state); + let mut mem_str = mem_insts + .into_iter() + .map(|inst| inst.show_rru(mb_rru)) + .collect::<Vec<_>>() + .join(" ; "); + if !mem_str.is_empty() { + mem_str += " ; "; + } + + (mem_str, mem) +} + +impl PrettyPrint for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.pretty_print(mb_rru, &mut EmitState::default()) + } +} + +impl Inst { + fn print_with_state(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String { + fn op_name_size(alu_op: ALUOp) -> (&'static str, OperandSize) { + match alu_op { + ALUOp::Add32 => ("add", OperandSize::Size32), + ALUOp::Add64 => ("add", OperandSize::Size64), + ALUOp::Sub32 => ("sub", OperandSize::Size32), + ALUOp::Sub64 => ("sub", OperandSize::Size64), + ALUOp::Orr32 => ("orr", OperandSize::Size32), + ALUOp::Orr64 => ("orr", OperandSize::Size64), + ALUOp::And32 => ("and", OperandSize::Size32), + ALUOp::And64 => ("and", OperandSize::Size64), + ALUOp::Eor32 => ("eor", OperandSize::Size32), + ALUOp::Eor64 => ("eor", OperandSize::Size64), + ALUOp::AddS32 => ("adds", OperandSize::Size32), + ALUOp::AddS64 => ("adds", OperandSize::Size64), + ALUOp::SubS32 => ("subs", OperandSize::Size32), + ALUOp::SubS64 => ("subs", OperandSize::Size64), + ALUOp::SMulH => ("smulh", OperandSize::Size64), + ALUOp::UMulH => ("umulh", OperandSize::Size64), + ALUOp::SDiv64 => ("sdiv", OperandSize::Size64), + ALUOp::UDiv64 => ("udiv", OperandSize::Size64), + ALUOp::AndNot32 => ("bic", OperandSize::Size32), + ALUOp::AndNot64 => ("bic", OperandSize::Size64), + ALUOp::OrrNot32 => ("orn", OperandSize::Size32), + ALUOp::OrrNot64 => ("orn", OperandSize::Size64), + ALUOp::EorNot32 => ("eon", OperandSize::Size32), + ALUOp::EorNot64 => ("eon", OperandSize::Size64), + ALUOp::RotR32 => ("ror", OperandSize::Size32), + ALUOp::RotR64 => ("ror", OperandSize::Size64), + ALUOp::Lsr32 => ("lsr", OperandSize::Size32), + ALUOp::Lsr64 => ("lsr", OperandSize::Size64), + ALUOp::Asr32 => ("asr", OperandSize::Size32), + ALUOp::Asr64 => ("asr", OperandSize::Size64), + ALUOp::Lsl32 => ("lsl", OperandSize::Size32), + ALUOp::Lsl64 => ("lsl", OperandSize::Size64), + } + } + + match self { + &Inst::Nop0 => "nop-zero-len".to_string(), + &Inst::Nop4 => "nop".to_string(), + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + } => { + let (op, size) = match alu_op { + ALUOp3::MAdd32 => ("madd", OperandSize::Size32), + ALUOp3::MAdd64 => ("madd", OperandSize::Size64), + ALUOp3::MSub32 => ("msub", OperandSize::Size32), + ALUOp3::MSub64 => ("msub", OperandSize::Size64), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let ra = show_ireg_sized(ra, mb_rru, size); + + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + + if imm12.bits == 0 && alu_op == ALUOp::Add64 { + // special-case MOV (used for moving into SP). 
+ format!("mov {}, {}", rd, rn) + } else { + let imm12 = imm12.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm12) + } + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let imml = imml.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imml) + } + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let immshift = immshift.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, immshift) + } + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let shiftop = shiftop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, shiftop) + } + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + ref extendop, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let extendop = extendop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, extendop) + } + &Inst::BitRR { op, rd, rn } => { + let size = op.operand_size(); + let op = op.op_str(); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + format!("{} {}, {}", op, rd, rn) + } + &Inst::ULoad8 { + rd, + ref mem, + .. + } + | &Inst::SLoad8 { + rd, + ref mem, + .. + } + | &Inst::ULoad16 { + rd, + ref mem, + .. + } + | &Inst::SLoad16 { + rd, + ref mem, + .. + } + | &Inst::ULoad32 { + rd, + ref mem, + .. + } + | &Inst::SLoad32 { + rd, + ref mem, + .. + } + | &Inst::ULoad64 { + rd, + ref mem, + .. + } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + + let is_unscaled = match &mem { + &AMode::Unscaled(..) => true, + _ => false, + }; + let (op, size) = match (self, is_unscaled) { + (&Inst::ULoad8 { .. }, false) => ("ldrb", OperandSize::Size32), + (&Inst::ULoad8 { .. }, true) => ("ldurb", OperandSize::Size32), + (&Inst::SLoad8 { .. }, false) => ("ldrsb", OperandSize::Size64), + (&Inst::SLoad8 { .. }, true) => ("ldursb", OperandSize::Size64), + (&Inst::ULoad16 { .. }, false) => ("ldrh", OperandSize::Size32), + (&Inst::ULoad16 { .. }, true) => ("ldurh", OperandSize::Size32), + (&Inst::SLoad16 { .. }, false) => ("ldrsh", OperandSize::Size64), + (&Inst::SLoad16 { .. }, true) => ("ldursh", OperandSize::Size64), + (&Inst::ULoad32 { .. }, false) => ("ldr", OperandSize::Size32), + (&Inst::ULoad32 { .. }, true) => ("ldur", OperandSize::Size32), + (&Inst::SLoad32 { .. }, false) => ("ldrsw", OperandSize::Size64), + (&Inst::SLoad32 { .. }, true) => ("ldursw", OperandSize::Size64), + (&Inst::ULoad64 { .. }, false) => ("ldr", OperandSize::Size64), + (&Inst::ULoad64 { .. }, true) => ("ldur", OperandSize::Size64), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::Store8 { + rd, + ref mem, + .. + } + | &Inst::Store16 { + rd, + ref mem, + .. + } + | &Inst::Store32 { + rd, + ref mem, + .. + } + | &Inst::Store64 { + rd, + ref mem, + .. 
+ } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + + let is_unscaled = match &mem { + &AMode::Unscaled(..) => true, + _ => false, + }; + let (op, size) = match (self, is_unscaled) { + (&Inst::Store8 { .. }, false) => ("strb", OperandSize::Size32), + (&Inst::Store8 { .. }, true) => ("sturb", OperandSize::Size32), + (&Inst::Store16 { .. }, false) => ("strh", OperandSize::Size32), + (&Inst::Store16 { .. }, true) => ("sturh", OperandSize::Size32), + (&Inst::Store32 { .. }, false) => ("str", OperandSize::Size32), + (&Inst::Store32 { .. }, true) => ("stur", OperandSize::Size32), + (&Inst::Store64 { .. }, false) => ("str", OperandSize::Size64), + (&Inst::Store64 { .. }, true) => ("stur", OperandSize::Size64), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd, mb_rru, size); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::StoreP64 { rt, rt2, ref mem, .. } => { + let rt = rt.show_rru(mb_rru); + let rt2 = rt2.show_rru(mb_rru); + let mem = mem.show_rru(mb_rru); + format!("stp {}, {}, {}", rt, rt2, mem) + } + &Inst::LoadP64 { rt, rt2, ref mem, .. } => { + let rt = rt.to_reg().show_rru(mb_rru); + let rt2 = rt2.to_reg().show_rru(mb_rru); + let mem = mem.show_rru(mb_rru); + format!("ldp {}, {}, {}", rt, rt2, mem) + } + &Inst::Mov64 { rd, rm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("mov {}, {}", rd, rm) + } + &Inst::Mov32 { rd, rm } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32); + let rm = show_ireg_sized(rm, mb_rru, OperandSize::Size32); + format!("mov {}, {}", rd, rm) + } + &Inst::MovZ { rd, ref imm, size } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let imm = imm.show_rru(mb_rru); + format!("movz {}, {}", rd, imm) + } + &Inst::MovN { rd, ref imm, size } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let imm = imm.show_rru(mb_rru); + format!("movn {}, {}", rd, imm) + } + &Inst::MovK { rd, ref imm, size } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let imm = imm.show_rru(mb_rru); + format!("movk {}, {}", rd, imm) + } + &Inst::CSel { rd, rn, rm, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("csel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::CSet { rd, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("cset {}, {}", rd, cond) + } + &Inst::CCmpImm { + size, + rn, + imm, + nzcv, + cond, + } => { + let rn = show_ireg_sized(rn, mb_rru, size); + let imm = imm.show_rru(mb_rru); + let nzcv = nzcv.show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("ccmp {}, {}, {}, {}", rn, imm, nzcv, cond) + } + &Inst::AtomicRMW { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}", + ty.bits(), op) + } + &Inst::AtomicCAS { ty, .. } => { + format!( + "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", + ty.bits()) + } + &Inst::AtomicLoad { ty, r_data, r_addr, .. } => { + format!( + "atomically {{ {} = zero_extend_{}_bits_at[{}] }}", + r_data.show_rru(mb_rru), ty.bits(), r_addr.show_rru(mb_rru)) + } + &Inst::AtomicStore { ty, r_data, r_addr, .. 
} => { + format!( + "atomically {{ {}_bits_at[{}] = {} }}", ty.bits(), r_addr.show_rru(mb_rru), r_data.show_rru(mb_rru)) + } + &Inst::Fence {} => { + format!("dmb ish") + } + &Inst::FpuMove64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.8b, {}.8b", rd, rn) + } + &Inst::FpuMove128 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.16b, {}.16b", rd, rn) + } + &Inst::FpuMoveFromVec { rd, rn, idx, size } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size()); + let rn = show_vreg_element(rn, mb_rru, idx, size); + format!("mov {}, {}", rd, rn) + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let (op, sizesrc, sizedest) = match fpu_op { + FPUOp1::Abs32 => ("fabs", ScalarSize::Size32, ScalarSize::Size32), + FPUOp1::Abs64 => ("fabs", ScalarSize::Size64, ScalarSize::Size64), + FPUOp1::Neg32 => ("fneg", ScalarSize::Size32, ScalarSize::Size32), + FPUOp1::Neg64 => ("fneg", ScalarSize::Size64, ScalarSize::Size64), + FPUOp1::Sqrt32 => ("fsqrt", ScalarSize::Size32, ScalarSize::Size32), + FPUOp1::Sqrt64 => ("fsqrt", ScalarSize::Size64, ScalarSize::Size64), + FPUOp1::Cvt32To64 => ("fcvt", ScalarSize::Size32, ScalarSize::Size64), + FPUOp1::Cvt64To32 => ("fcvt", ScalarSize::Size64, ScalarSize::Size32), + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, sizedest); + let rn = show_vreg_scalar(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let (op, size) = match fpu_op { + FPUOp2::Add32 => ("fadd", ScalarSize::Size32), + FPUOp2::Add64 => ("fadd", ScalarSize::Size64), + FPUOp2::Sub32 => ("fsub", ScalarSize::Size32), + FPUOp2::Sub64 => ("fsub", ScalarSize::Size64), + FPUOp2::Mul32 => ("fmul", ScalarSize::Size32), + FPUOp2::Mul64 => ("fmul", ScalarSize::Size64), + FPUOp2::Div32 => ("fdiv", ScalarSize::Size32), + FPUOp2::Div64 => ("fdiv", ScalarSize::Size64), + FPUOp2::Max32 => ("fmax", ScalarSize::Size32), + FPUOp2::Max64 => ("fmax", ScalarSize::Size64), + FPUOp2::Min32 => ("fmin", ScalarSize::Size32), + FPUOp2::Min64 => ("fmin", ScalarSize::Size64), + FPUOp2::Sqadd64 => ("sqadd", ScalarSize::Size64), + FPUOp2::Uqadd64 => ("uqadd", ScalarSize::Size64), + FPUOp2::Sqsub64 => ("sqsub", ScalarSize::Size64), + FPUOp2::Uqsub64 => ("uqsub", ScalarSize::Size64), + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size); + let rn = show_vreg_scalar(rn, mb_rru, size); + let rm = show_vreg_scalar(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::FpuRRI { fpu_op, rd, rn } => { + let (op, imm, vector) = match fpu_op { + FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true), + FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false), + FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true), + FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false), + }; + + let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector { + |reg, mb_rru| show_vreg_vector(reg, mb_rru, VectorSize::Size32x2) + } else { + |reg, mb_rru| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64) + }; + let rd = show_vreg_fn(rd.to_reg(), mb_rru); + let rn = show_vreg_fn(rn, mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm) + } + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let (op, size) = match fpu_op { + FPUOp3::MAdd32 => ("fmadd", ScalarSize::Size32), + FPUOp3::MAdd64 => ("fmadd", ScalarSize::Size64), + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size); + let rn = 
show_vreg_scalar(rn, mb_rru, size); + let rm = show_vreg_scalar(rm, mb_rru, size); + let ra = show_vreg_scalar(ra, mb_rru, size); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } + &Inst::FpuCmp32 { rn, rm } => { + let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size32); + let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size32); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuCmp64 { rn, rm } => { + let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64); + let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size64); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}ldr {}, {}", mem_str, rd, mem) + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}ldr {}, {}", mem_str, rd, mem) + } + &Inst::FpuLoad128 { rd, ref mem, .. } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}ldr {}, {}", mem_str, rd, mem) + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + let rd = show_vreg_scalar(rd, mb_rru, ScalarSize::Size32); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}str {}, {}", mem_str, rd, mem) + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + let rd = show_vreg_scalar(rd, mb_rru, ScalarSize::Size64); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}str {}, {}", mem_str, rd, mem) + } + &Inst::FpuStore128 { rd, ref mem, .. 
} => { + let rd = rd.show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}str {}, {}", mem_str, rd, mem) + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64); + format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, f64::from_bits(const_data)) + } + &Inst::LoadFpuConst128 { rd, const_data } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size128); + format!("ldr {}, pc+8 ; b 20 ; data.f128 0x{:032x}", rd, const_data) + } + &Inst::FpuToInt { op, rd, rn } => { + let (op, sizesrc, sizedest) = match op { + FpuToIntOp::F32ToI32 => ("fcvtzs", ScalarSize::Size32, OperandSize::Size32), + FpuToIntOp::F32ToU32 => ("fcvtzu", ScalarSize::Size32, OperandSize::Size32), + FpuToIntOp::F32ToI64 => ("fcvtzs", ScalarSize::Size32, OperandSize::Size64), + FpuToIntOp::F32ToU64 => ("fcvtzu", ScalarSize::Size32, OperandSize::Size64), + FpuToIntOp::F64ToI32 => ("fcvtzs", ScalarSize::Size64, OperandSize::Size32), + FpuToIntOp::F64ToU32 => ("fcvtzu", ScalarSize::Size64, OperandSize::Size32), + FpuToIntOp::F64ToI64 => ("fcvtzs", ScalarSize::Size64, OperandSize::Size64), + FpuToIntOp::F64ToU64 => ("fcvtzu", ScalarSize::Size64, OperandSize::Size64), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_vreg_scalar(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::IntToFpu { op, rd, rn } => { + let (op, sizesrc, sizedest) = match op { + IntToFpuOp::I32ToF32 => ("scvtf", OperandSize::Size32, ScalarSize::Size32), + IntToFpuOp::U32ToF32 => ("ucvtf", OperandSize::Size32, ScalarSize::Size32), + IntToFpuOp::I64ToF32 => ("scvtf", OperandSize::Size64, ScalarSize::Size32), + IntToFpuOp::U64ToF32 => ("ucvtf", OperandSize::Size64, ScalarSize::Size32), + IntToFpuOp::I32ToF64 => ("scvtf", OperandSize::Size32, ScalarSize::Size64), + IntToFpuOp::U32ToF64 => ("ucvtf", OperandSize::Size32, ScalarSize::Size64), + IntToFpuOp::I64ToF64 => ("scvtf", OperandSize::Size64, ScalarSize::Size64), + IntToFpuOp::U64ToF64 => ("ucvtf", OperandSize::Size64, ScalarSize::Size64), + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, sizedest); + let rn = show_ireg_sized(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32); + let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size32); + let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size32); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64); + let rn = show_vreg_scalar(rn, mb_rru, ScalarSize::Size64); + let rm = show_vreg_scalar(rm, mb_rru, ScalarSize::Size64); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuRound { op, rd, rn } => { + let (inst, size) = match op { + FpuRoundMode::Minus32 => ("frintm", ScalarSize::Size32), + FpuRoundMode::Minus64 => ("frintm", ScalarSize::Size64), + FpuRoundMode::Plus32 => ("frintp", ScalarSize::Size32), + FpuRoundMode::Plus64 => ("frintp", ScalarSize::Size64), + FpuRoundMode::Zero32 => ("frintz", ScalarSize::Size32), + FpuRoundMode::Zero64 => ("frintz", ScalarSize::Size64), + FpuRoundMode::Nearest32 => ("frintn", ScalarSize::Size32), + FpuRoundMode::Nearest64 => ("frintn", ScalarSize::Size64), + }; + 
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size); + let rn = show_vreg_scalar(rn, mb_rru, size); + format!("{} {}, {}", inst, rd, rn) + } + &Inst::MovToFpu { rd, rn, size } => { + let operand_size = size.operand_size(); + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, operand_size); + format!("fmov {}, {}", rd, rn) + } + &Inst::MovToVec { rd, rn, idx, size } => { + let rd = show_vreg_element(rd.to_reg(), mb_rru, idx, size); + let rn = show_ireg_sized(rn, mb_rru, size.operand_size()); + format!("mov {}, {}", rd, rn) + } + &Inst::MovFromVec { rd, rn, idx, size } => { + let op = match size { + VectorSize::Size8x16 => "umov", + VectorSize::Size16x8 => "umov", + VectorSize::Size32x4 => "mov", + VectorSize::Size64x2 => "mov", + _ => unimplemented!(), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size.operand_size()); + let rn = show_vreg_element(rn, mb_rru, idx, size); + format!("{} {}, {}", op, rd, rn) + } + &Inst::MovFromVecSigned { + rd, + rn, + idx, + size, + scalar_size, + } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, scalar_size); + let rn = show_vreg_element(rn, mb_rru, idx, size); + format!("smov {}, {}", rd, rn) + } + &Inst::VecDup { rd, rn, size } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size.operand_size()); + format!("dup {}, {}", rd, rn) + } + &Inst::VecDupFromFpu { rd, rn, size } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = show_vreg_element(rn, mb_rru, 0, size); + format!("dup {}, {}", rd, rn) + } + &Inst::VecDupImm { rd, imm, invert, size } => { + let imm = imm.show_rru(mb_rru); + let op = if invert { + "mvni" + } else { + "movi" + }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + + format!("{} {}, {}", op, rd, imm) + } + &Inst::VecExtend { t, rd, rn, high_half } => { + let (op, dest, src) = match (t, high_half) { + (VecExtendOp::Sxtl8, false) => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8), + (VecExtendOp::Sxtl8, true) => ("sxtl2", VectorSize::Size16x8, VectorSize::Size8x16), + (VecExtendOp::Sxtl16, false) => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4), + (VecExtendOp::Sxtl16, true) => ("sxtl2", VectorSize::Size32x4, VectorSize::Size16x8), + (VecExtendOp::Sxtl32, false) => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2), + (VecExtendOp::Sxtl32, true) => ("sxtl2", VectorSize::Size64x2, VectorSize::Size32x4), + (VecExtendOp::Uxtl8, false) => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8), + (VecExtendOp::Uxtl8, true) => ("uxtl2", VectorSize::Size16x8, VectorSize::Size8x16), + (VecExtendOp::Uxtl16, false) => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4), + (VecExtendOp::Uxtl16, true) => ("uxtl2", VectorSize::Size32x4, VectorSize::Size16x8), + (VecExtendOp::Uxtl32, false) => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2), + (VecExtendOp::Uxtl32, true) => ("uxtl2", VectorSize::Size64x2, VectorSize::Size32x4), + }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest); + let rn = show_vreg_vector(rn, mb_rru, src); + format!("{} {}, {}", op, rd, rn) + } + &Inst::VecMovElement { + rd, + rn, + dest_idx, + src_idx, + size, + } => { + let rd = show_vreg_element(rd.to_reg(), mb_rru, dest_idx, size); + let rn = show_vreg_element(rn, mb_rru, src_idx, size); + format!("mov {}, {}", rd, rn) + } + &Inst::VecMiscNarrow { op, rd, rn, size, high_half } => { + let dest_size = if high_half { + assert!(size.is_128bits()); + size + } else { + size.halve() + }; + let rd = show_vreg_vector(rd.to_reg(), 
mb_rru, dest_size); + let rn = show_vreg_vector(rn, mb_rru, size.widen()); + let op = match (op, high_half) { + (VecMiscNarrowOp::Xtn, false) => "xtn", + (VecMiscNarrowOp::Xtn, true) => "xtn2", + (VecMiscNarrowOp::Sqxtn, false) => "sqxtn", + (VecMiscNarrowOp::Sqxtn, true) => "sqxtn2", + (VecMiscNarrowOp::Sqxtun, false) => "sqxtun", + (VecMiscNarrowOp::Sqxtun, true) => "sqxtun2", + }; + format!("{} {}, {}", op, rd, rn) + } + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + size, + } => { + let (op, size) = match alu_op { + VecALUOp::Sqadd => ("sqadd", size), + VecALUOp::Uqadd => ("uqadd", size), + VecALUOp::Sqsub => ("sqsub", size), + VecALUOp::Uqsub => ("uqsub", size), + VecALUOp::Cmeq => ("cmeq", size), + VecALUOp::Cmge => ("cmge", size), + VecALUOp::Cmgt => ("cmgt", size), + VecALUOp::Cmhs => ("cmhs", size), + VecALUOp::Cmhi => ("cmhi", size), + VecALUOp::Fcmeq => ("fcmeq", size), + VecALUOp::Fcmgt => ("fcmgt", size), + VecALUOp::Fcmge => ("fcmge", size), + VecALUOp::And => ("and", VectorSize::Size8x16), + VecALUOp::Bic => ("bic", VectorSize::Size8x16), + VecALUOp::Orr => ("orr", VectorSize::Size8x16), + VecALUOp::Eor => ("eor", VectorSize::Size8x16), + VecALUOp::Bsl => ("bsl", VectorSize::Size8x16), + VecALUOp::Umaxp => ("umaxp", size), + VecALUOp::Add => ("add", size), + VecALUOp::Sub => ("sub", size), + VecALUOp::Mul => ("mul", size), + VecALUOp::Sshl => ("sshl", size), + VecALUOp::Ushl => ("ushl", size), + VecALUOp::Umin => ("umin", size), + VecALUOp::Smin => ("smin", size), + VecALUOp::Umax => ("umax", size), + VecALUOp::Smax => ("smax", size), + VecALUOp::Urhadd => ("urhadd", size), + VecALUOp::Fadd => ("fadd", size), + VecALUOp::Fsub => ("fsub", size), + VecALUOp::Fdiv => ("fdiv", size), + VecALUOp::Fmax => ("fmax", size), + VecALUOp::Fmin => ("fmin", size), + VecALUOp::Fmul => ("fmul", size), + VecALUOp::Addp => ("addp", size), + VecALUOp::Umlal => ("umlal", size), + VecALUOp::Zip1 => ("zip1", size), + VecALUOp::Smull => ("smull", size), + VecALUOp::Smull2 => ("smull2", size), + }; + let rd_size = match alu_op { + VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(), + _ => size + }; + let rn_size = match alu_op { + VecALUOp::Smull => size.halve(), + _ => size + }; + let rm_size = rn_size; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size); + let rn = show_vreg_vector(rn, mb_rru, rn_size); + let rm = show_vreg_vector(rm, mb_rru, rm_size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::VecMisc { op, rd, rn, size } => { + let is_shll = op == VecMisc2::Shll; + let suffix = match (is_shll, size) { + (true, VectorSize::Size8x8) => ", #8", + (true, VectorSize::Size16x4) => ", #16", + (true, VectorSize::Size32x2) => ", #32", + _ => "", + }; + + let (op, size) = match op { + VecMisc2::Not => ( + "mvn", + if size.is_128bits() { + VectorSize::Size8x16 + } else { + VectorSize::Size8x8 + }, + ), + VecMisc2::Neg => ("neg", size), + VecMisc2::Abs => ("abs", size), + VecMisc2::Fabs => ("fabs", size), + VecMisc2::Fneg => ("fneg", size), + VecMisc2::Fsqrt => ("fsqrt", size), + VecMisc2::Rev64 => ("rev64", size), + VecMisc2::Shll => ("shll", size), + VecMisc2::Fcvtzs => ("fcvtzs", size), + VecMisc2::Fcvtzu => ("fcvtzu", size), + VecMisc2::Scvtf => ("scvtf", size), + VecMisc2::Ucvtf => ("ucvtf", size), + VecMisc2::Frintn => ("frintn", size), + VecMisc2::Frintz => ("frintz", size), + VecMisc2::Frintm => ("frintm", size), + VecMisc2::Frintp => ("frintp", size), + }; + + let rd_size = if is_shll { size.widen() } else { size }; + + let rd = show_vreg_vector(rd.to_reg(), 
mb_rru, rd_size); + let rn = show_vreg_vector(rn, mb_rru, size); + format!("{} {}, {}{}", op, rd, rn, suffix) + } + &Inst::VecLanes { op, rd, rn, size } => { + let op = match op { + VecLanesOp::Uminv => "uminv", + VecLanesOp::Addv => "addv", + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size()); + let rn = show_vreg_vector(rn, mb_rru, size); + format!("{} {}, {}", op, rd, rn) + } + &Inst::VecShiftImm { op, rd, rn, size, imm } => { + let op = match op { + VecShiftImmOp::Shl => "shl", + VecShiftImmOp::Ushr => "ushr", + VecShiftImmOp::Sshr => "sshr", + }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = show_vreg_vector(rn, mb_rru, size); + format!("{} {}, {}, #{}", op, rd, rn, imm) + } + &Inst::VecExtract { rd, rn, rm, imm4 } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16); + let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16); + let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); + format!("ext {}, {}, {}, #{}", rd, rn, rm, imm4) + } + &Inst::VecTbl { + rd, + rn, + rm, + is_extension, + } => { + let op = if is_extension { "tbx" } else { "tbl" }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16); + let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16); + let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); + format!("{} {}, {{ {} }}, {}", op, rd, rn, rm) + } + &Inst::VecTbl2 { + rd, + rn, + rn2, + rm, + is_extension, + } => { + let op = if is_extension { "tbx" } else { "tbl" }; + let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16); + let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16); + let rn2 = show_vreg_vector(rn2, mb_rru, VectorSize::Size8x16); + let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); + format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm) + } + &Inst::VecLoadReplicate { rd, rn, size, .. } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = rn.show_rru(mb_rru); + + format!("ld1r {{ {} }}, [{}]", rd, rn) + } + &Inst::VecCSel { rd, rn, rm, cond } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16); + let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16); + let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); + let cond = cond.show_rru(mb_rru); + format!("vcsel {}, {}, {}, {} (if-then-else diamond)", rd, rn, rm, cond) + } + &Inst::MovToNZCV { rn } => { + let rn = rn.show_rru(mb_rru); + format!("msr nzcv, {}", rn) + } + &Inst::MovFromNZCV { rd } => { + let rd = rd.to_reg().show_rru(mb_rru); + format!("mrs {}, nzcv", rd) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + // Is the destination a 32-bit register? Corresponds to whether + // extend-to width is <= 32 bits, *unless* we have an unsigned + // 32-to-64-bit extension, which is implemented with a "mov" to a + // 32-bit (W-reg) dest, because this zeroes the top 32 bits. + let dest_size = if !signed && from_bits == 32 && to_bits == 64 { + OperandSize::Size32 + } else { + OperandSize::from_bits(to_bits) + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_bits(from_bits)); + let op = match (signed, from_bits, to_bits) { + (false, 8, 32) => "uxtb", + (true, 8, 32) => "sxtb", + (false, 16, 32) => "uxth", + (true, 16, 32) => "sxth", + (false, 8, 64) => "uxtb", + (true, 8, 64) => "sxtb", + (false, 16, 64) => "uxth", + (true, 16, 64) => "sxth", + (false, 32, 64) => "mov", // special case (see above). 
+ (true, 32, 64) => "sxtw", + _ => panic!("Unsupported Extend case: {:?}", self), + }; + format!("{} {}, {}", op, rd, rn) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + let dest_size = OperandSize::from_bits(to_bits); + let zr = if dest_size.is32() { "wzr" } else { "xzr" }; + let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32); + format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + .. + } if from_bits == 1 && !signed => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32); + let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32); + format!("and {}, {}, #1", rd, rn) + } + &Inst::Extend { .. } => { + panic!("Unsupported Extend case"); + } + &Inst::Call { .. } => format!("bl 0"), + &Inst::CallInd { ref info, .. } => { + let rn = info.rn.show_rru(mb_rru); + format!("blr {}", rn) + } + &Inst::Ret => "ret".to_string(), + &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), + &Inst::Jump { ref dest } => { + let dest = dest.show_rru(mb_rru); + format!("b {}", dest) + } + &Inst::CondBr { + ref taken, + ref not_taken, + ref kind, + } => { + let taken = taken.show_rru(mb_rru); + let not_taken = not_taken.show_rru(mb_rru); + match kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::Cond(c) => { + let c = c.show_rru(mb_rru); + format!("b.{} {} ; b {}", c, taken, not_taken) + } + } + } + &Inst::IndirectBr { rn, .. } => { + let rn = rn.show_rru(mb_rru); + format!("br {}", rn) + } + &Inst::Brk => "brk #0".to_string(), + &Inst::Udf { .. } => "udf".to_string(), + &Inst::TrapIf { ref kind, .. } => match kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, 8 ; udf", reg) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, 8 ; udf", reg) + } + &CondBrKind::Cond(c) => { + let c = c.invert().show_rru(mb_rru); + format!("b.{} 8 ; udf", c) + } + }, + &Inst::Adr { rd, off } => { + let rd = rd.show_rru(mb_rru); + format!("adr {}, pc+{}", rd, off) + } + &Inst::Word4 { data } => format!("data.i32 {}", data), + &Inst::Word8 { data } => format!("data.i64 {}", data), + &Inst::JTSequence { + ref info, + ridx, + rtmp1, + rtmp2, + .. + } => { + let ridx = ridx.show_rru(mb_rru); + let rtmp1 = rtmp1.show_rru(mb_rru); + let rtmp2 = rtmp2.show_rru(mb_rru); + let default_target = info.default_target.show_rru(mb_rru); + format!( + concat!( + "b.hs {} ; ", + "adr {}, pc+16 ; ", + "ldrsw {}, [{}, {}, LSL 2] ; ", + "add {}, {}, {} ; ", + "br {} ; ", + "jt_entries {:?}" + ), + default_target, + rtmp1, + rtmp2, + rtmp1, + ridx, + rtmp1, + rtmp1, + rtmp2, + rtmp1, + info.targets + ) + } + &Inst::LoadExtName { + rd, + ref name, + offset, + } => { + let rd = rd.show_rru(mb_rru); + format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset) + } + &Inst::LoadAddr { rd, ref mem } => { + // TODO: we really should find a better way to avoid duplication of + // this logic between `emit()` and `show_rru()` -- a separate 1-to-N + // expansion stage (i.e., legalization, but without the slow edit-in-place + // of the existing legalization framework). 
+ let (mem_insts, mem) = mem_finalize(0, mem, state); + let mut ret = String::new(); + for inst in mem_insts.into_iter() { + ret.push_str(&inst.show_rru(mb_rru)); + } + let (reg, offset) = match mem { + AMode::Unscaled(r, simm9) => (r, simm9.value()), + AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32), + _ => panic!("Unsupported case for LoadAddr: {:?}", mem), + }; + let abs_offset = if offset < 0 { + -offset as u64 + } else { + offset as u64 + }; + let alu_op = if offset < 0 { + ALUOp::Sub64 + } else { + ALUOp::Add64 + }; + + if offset == 0 { + let mov = Inst::mov(rd, reg); + ret.push_str(&mov.show_rru(mb_rru)); + } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) { + let add = Inst::AluRRImm12 { + alu_op, + rd, + rn: reg, + imm12, + }; + ret.push_str(&add.show_rru(mb_rru)); + } else { + let tmp = writable_spilltmp_reg(); + for inst in Inst::load_constant(tmp, abs_offset).into_iter() { + ret.push_str(&inst.show_rru(mb_rru)); + } + let add = Inst::AluRRR { + alu_op, + rd, + rn: reg, + rm: tmp.to_reg(), + }; + ret.push_str(&add.show_rru(mb_rru)); + } + ret + } + &Inst::VirtualSPOffsetAdj { offset } => { + state.virtual_sp_offset += offset; + format!("virtual_sp_offset_adjust {}", offset) + } + &Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space), + } + } +} + +//============================================================================= +// Label fixups and jump veneers. + +/// Different forms of label references for different instruction formats. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19 + /// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond. + Branch19, + /// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26 + /// signed bits, in bits 25:0. Used by b, bl. + Branch26, + /// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits, + /// in bits 23:5. + Ldr19, + /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is + /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29. + Adr21, + /// 32-bit PC relative constant offset (from address of constant itself), + /// signed. Used in jump tables. + PCRel32, +} + +impl MachInstLabelUse for LabelUse { + /// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned. + const ALIGN: CodeOffset = 4; + + /// Maximum PC-relative range (positive), inclusive. + fn max_pos_range(self) -> CodeOffset { + match self { + // 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20 + // from zero. Likewise for two other shifted cases below. + LabelUse::Branch19 => (1 << 20) - 1, + LabelUse::Branch26 => (1 << 27) - 1, + LabelUse::Ldr19 => (1 << 20) - 1, + // Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total + // range. + LabelUse::Adr21 => (1 << 20) - 1, + LabelUse::PCRel32 => 0x7fffffff, + } + } + + /// Maximum PC-relative range (negative). + fn max_neg_range(self) -> CodeOffset { + // All forms are twos-complement signed offsets, so negative limit is one more than + // positive limit. + self.max_pos_range() + 1 + } + + /// Size of window into code needed to do the patch. + fn patch_size(self) -> CodeOffset { + // Patch is on one instruction only for all of these label reference types. + 4 + } + + /// Perform the patch. 
+ fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let pc_rel = (label_offset as i64) - (use_offset as i64); + debug_assert!(pc_rel <= self.max_pos_range() as i64); + debug_assert!(pc_rel >= -(self.max_neg_range() as i64)); + let pc_rel = pc_rel as u32; + let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let mask = match self { + LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive + LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive + LabelUse::PCRel32 => 0xffffffff, + }; + let pc_rel_shifted = match self { + LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel, + _ => { + debug_assert!(pc_rel & 3 == 0); + pc_rel >> 2 + } + }; + let pc_rel_inserted = match self { + LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5, + LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff, + LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10, + LabelUse::PCRel32 => pc_rel_shifted, + }; + let is_add = match self { + LabelUse::PCRel32 => true, + _ => false, + }; + let insn_word = if is_add { + insn_word.wrapping_add(pc_rel_inserted) + } else { + (insn_word & !mask) | pc_rel_inserted + }; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + } + + /// Is a veneer supported for this label reference type? + fn supports_veneer(self) -> bool { + match self { + LabelUse::Branch19 => true, // veneer is a Branch26 + _ => false, + } + } + + /// How large is the veneer, if supported? + fn veneer_size(self) -> CodeOffset { + 4 + } + + /// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return + /// an offset and label-use for the veneer's use of the original label. + fn generate_veneer( + self, + buffer: &mut [u8], + veneer_offset: CodeOffset, + ) -> (CodeOffset, LabelUse) { + match self { + LabelUse::Branch19 => { + // veneer is a Branch26 (unconditional branch). Just encode directly here -- don't + // bother with constructing an Inst. + let insn_word = 0b000101 << 26; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + (veneer_offset, LabelUse::Branch26) + } + _ => panic!("Unsupported label-reference type for veneer generation!"), + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs new file mode 100644 index 0000000000..0b4babe04a --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/regs.rs @@ -0,0 +1,351 @@ +//! AArch64 ISA definitions: registers. + +use crate::isa::aarch64::inst::OperandSize; +use crate::isa::aarch64::inst::ScalarSize; +use crate::isa::aarch64::inst::VectorSize; +use crate::settings; + +use regalloc::{ + PrettyPrint, RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES, +}; + +use std::string::{String, ToString}; + +//============================================================================= +// Registers, the Universe thereof, and printing + +/// The pinned register on this architecture. +/// It must be the same as Spidermonkey's HeapReg, as found in this file. 
+/// https://searchfox.org/mozilla-central/source/js/src/jit/arm64/Assembler-arm64.h#103 +pub const PINNED_REG: u8 = 21; + +#[rustfmt::skip] +const XREG_INDICES: [u8; 31] = [ + // X0 - X7 + 32, 33, 34, 35, 36, 37, 38, 39, + // X8 - X15 + 40, 41, 42, 43, 44, 45, 46, 47, + // X16, X17 + 58, 59, + // X18 + 60, + // X19, X20 + 48, 49, + // X21, put aside because it's the pinned register. + 57, + // X22 - X28 + 50, 51, 52, 53, 54, 55, 56, + // X29 (FP) + 61, + // X30 (LR) + 62, +]; + +const ZERO_REG_INDEX: u8 = 63; + +const SP_REG_INDEX: u8 = 64; + +/// Get a reference to an X-register (integer register). +pub fn xreg(num: u8) -> Reg { + assert!(num < 31); + Reg::new_real( + RegClass::I64, + /* enc = */ num, + /* index = */ XREG_INDICES[num as usize], + ) +} + +/// Get a writable reference to an X-register. +pub fn writable_xreg(num: u8) -> Writable<Reg> { + Writable::from_reg(xreg(num)) +} + +/// Get a reference to a V-register (vector/FP register). +pub fn vreg(num: u8) -> Reg { + assert!(num < 32); + Reg::new_real(RegClass::V128, /* enc = */ num, /* index = */ num) +} + +/// Get a writable reference to a V-register. +pub fn writable_vreg(num: u8) -> Writable<Reg> { + Writable::from_reg(vreg(num)) +} + +/// Get a reference to the zero-register. +pub fn zero_reg() -> Reg { + // This should be the same as what xreg(31) returns, except that + // we use the special index into the register index space. + Reg::new_real( + RegClass::I64, + /* enc = */ 31, + /* index = */ ZERO_REG_INDEX, + ) +} + +/// Get a writable reference to the zero-register (this discards a result). +pub fn writable_zero_reg() -> Writable<Reg> { + Writable::from_reg(zero_reg()) +} + +/// Get a reference to the stack-pointer register. +pub fn stack_reg() -> Reg { + // XSP (stack) and XZR (zero) are logically different registers which have + // the same hardware encoding, and whose meaning, in real aarch64 + // instructions, is context-dependent. For convenience of + // universe-construction and for correct printing, we make them be two + // different real registers. + Reg::new_real( + RegClass::I64, + /* enc = */ 31, + /* index = */ SP_REG_INDEX, + ) +} + +/// Get a writable reference to the stack-pointer register. +pub fn writable_stack_reg() -> Writable<Reg> { + Writable::from_reg(stack_reg()) +} + +/// Get a reference to the link register (x30). +pub fn link_reg() -> Reg { + xreg(30) +} + +/// Get a writable reference to the link register. +pub fn writable_link_reg() -> Writable<Reg> { + Writable::from_reg(link_reg()) +} + +/// Get a reference to the frame pointer (x29). +pub fn fp_reg() -> Reg { + xreg(29) +} + +/// Get a writable reference to the frame pointer. +pub fn writable_fp_reg() -> Writable<Reg> { + Writable::from_reg(fp_reg()) +} + +/// Get a reference to the first temporary, sometimes "spill temporary", register. This register is +/// used to compute the address of a spill slot when a direct offset addressing mode from FP is not +/// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this +/// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how +/// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc. +/// +/// We use x16 for this (aka IP0 in the AArch64 ABI) because it's a scratch register but is +/// slightly special (used for linker veneers). We're free to use it as long as we don't expect it +/// to live through call instructions. 
+pub fn spilltmp_reg() -> Reg { + xreg(16) +} + +/// Get a writable reference to the spilltmp reg. +pub fn writable_spilltmp_reg() -> Writable<Reg> { + Writable::from_reg(spilltmp_reg()) +} + +/// Get a reference to the second temp register. We need this in some edge cases +/// where we need both the spilltmp and another temporary. +/// +/// We use x17 (aka IP1), the other "interprocedural"/linker-veneer scratch reg that is +/// free to use otherwise. +pub fn tmp2_reg() -> Reg { + xreg(17) +} + +/// Get a writable reference to the tmp2 reg. +pub fn writable_tmp2_reg() -> Writable<Reg> { + Writable::from_reg(tmp2_reg()) +} + +/// Create the register universe for AArch64. +pub fn create_reg_universe(flags: &settings::Flags) -> RealRegUniverse { + let mut regs = vec![]; + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + // Numbering Scheme: we put V-regs first, then X-regs. The X-regs exclude several registers: + // x18 (globally reserved for platform-specific purposes), x29 (frame pointer), x30 (link + // register), x31 (stack pointer or zero register, depending on context). + + let v_reg_base = 0u8; // in contiguous real-register index space + let v_reg_count = 32; + for i in 0u8..v_reg_count { + let reg = Reg::new_real( + RegClass::V128, + /* enc = */ i, + /* index = */ v_reg_base + i, + ) + .to_real_reg(); + let name = format!("v{}", i); + regs.push((reg, name)); + } + let v_reg_last = v_reg_base + v_reg_count - 1; + + // Add the X registers. N.B.: the order here must match the order implied + // by XREG_INDICES, ZERO_REG_INDEX, and SP_REG_INDEX above. + + let x_reg_base = 32u8; // in contiguous real-register index space + let mut x_reg_count = 0; + + let uses_pinned_reg = flags.enable_pinned_reg(); + + for i in 0u8..32u8 { + // See above for excluded registers. + if i == 16 || i == 17 || i == 18 || i == 29 || i == 30 || i == 31 || i == PINNED_REG { + continue; + } + let reg = Reg::new_real( + RegClass::I64, + /* enc = */ i, + /* index = */ x_reg_base + x_reg_count, + ) + .to_real_reg(); + let name = format!("x{}", i); + regs.push((reg, name)); + x_reg_count += 1; + } + let x_reg_last = x_reg_base + x_reg_count - 1; + + allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo { + first: x_reg_base as usize, + last: x_reg_last as usize, + suggested_scratch: Some(XREG_INDICES[19] as usize), + }); + allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo { + first: v_reg_base as usize, + last: v_reg_last as usize, + suggested_scratch: Some(/* V31: */ 31), + }); + + // Other regs, not available to the allocator. + let allocable = if uses_pinned_reg { + // The pinned register is not allocatable in this case, so record the length before adding + // it. + let len = regs.len(); + regs.push((xreg(PINNED_REG).to_real_reg(), "x21/pinned_reg".to_string())); + len + } else { + regs.push((xreg(PINNED_REG).to_real_reg(), "x21".to_string())); + regs.len() + }; + + regs.push((xreg(16).to_real_reg(), "x16".to_string())); + regs.push((xreg(17).to_real_reg(), "x17".to_string())); + regs.push((xreg(18).to_real_reg(), "x18".to_string())); + regs.push((fp_reg().to_real_reg(), "fp".to_string())); + regs.push((link_reg().to_real_reg(), "lr".to_string())); + regs.push((zero_reg().to_real_reg(), "xzr".to_string())); + regs.push((stack_reg().to_real_reg(), "sp".to_string())); + + // FIXME JRS 2020Feb06: unfortunately this pushes the number of real regs + // to 65, which is potentially inconvenient from a compiler performance + // standpoint. 
We could possibly drop back to 64 by "losing" a vector + // register in future. + + // Assert sanity: the indices in the register structs must match their + // actual indices in the array. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} + +/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show +/// its name at the 32-bit size. +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: OperandSize) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::I64 || !size.is32() { + // We can't do any better. + return s; + } + + if reg.is_real() { + // Change (eg) "x42" into "w42" as appropriate + if reg.get_class() == RegClass::I64 && size.is32() && s.starts_with("x") { + s = "w".to_string() + &s[1..]; + } + } else { + // Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role + if reg.get_class() == RegClass::I64 && size.is32() { + s.push('w'); + } + } + s +} + +/// Show a vector register used in a scalar context. +pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: ScalarSize) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::V128 { + // We can't do any better. + return s; + } + + if reg.is_real() { + // Change (eg) "v0" into "d0". + if s.starts_with("v") { + let replacement = match size { + ScalarSize::Size8 => "b", + ScalarSize::Size16 => "h", + ScalarSize::Size32 => "s", + ScalarSize::Size64 => "d", + ScalarSize::Size128 => "q", + }; + s.replace_range(0..1, replacement); + } + } else { + // Add a "d" suffix to RegClass::V128 vregs. + if reg.get_class() == RegClass::V128 { + s.push('d'); + } + } + s +} + +/// Show a vector register. +pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: VectorSize) -> String { + assert_eq!(RegClass::V128, reg.get_class()); + let mut s = reg.show_rru(mb_rru); + + let suffix = match size { + VectorSize::Size8x8 => ".8b", + VectorSize::Size8x16 => ".16b", + VectorSize::Size16x4 => ".4h", + VectorSize::Size16x8 => ".8h", + VectorSize::Size32x2 => ".2s", + VectorSize::Size32x4 => ".4s", + VectorSize::Size64x2 => ".2d", + }; + + s.push_str(suffix); + s +} + +/// Show an indexed vector element. 
+pub fn show_vreg_element( + reg: Reg, + mb_rru: Option<&RealRegUniverse>, + idx: u8, + size: VectorSize, +) -> String { + assert_eq!(RegClass::V128, reg.get_class()); + let mut s = reg.show_rru(mb_rru); + + let suffix = match size { + VectorSize::Size8x8 => "b", + VectorSize::Size8x16 => "b", + VectorSize::Size16x4 => "h", + VectorSize::Size16x8 => "h", + VectorSize::Size32x2 => "s", + VectorSize::Size32x4 => "s", + VectorSize::Size64x2 => "d", + }; + + s.push_str(&format!(".{}[{}]", suffix, idx)); + s +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs new file mode 100644 index 0000000000..698e094795 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind.rs @@ -0,0 +1,201 @@ +use super::*; +use crate::isa::aarch64::inst::{args::PairAMode, imms::Imm12, regs, ALUOp, Inst}; +use crate::isa::unwind::input::{UnwindCode, UnwindInfo}; +use crate::machinst::UnwindInfoContext; +use crate::result::CodegenResult; +use alloc::vec::Vec; +use regalloc::Reg; + +#[cfg(feature = "unwind")] +pub(crate) mod systemv; + +pub struct AArch64UnwindInfo; + +impl UnwindInfoGenerator<Inst> for AArch64UnwindInfo { + fn create_unwind_info( + context: UnwindInfoContext<Inst>, + ) -> CodegenResult<Option<UnwindInfo<Reg>>> { + let word_size = 8u8; + let pair_size = word_size * 2; + let mut codes = Vec::new(); + + for i in context.prologue.clone() { + let i = i as usize; + let inst = &context.insts[i]; + let offset = context.insts_layout[i]; + + match inst { + Inst::StoreP64 { + rt, + rt2, + mem: PairAMode::PreIndexed(rn, imm7), + .. + } if *rt == regs::fp_reg() + && *rt2 == regs::link_reg() + && *rn == regs::writable_stack_reg() + && imm7.value == -(pair_size as i16) => + { + // stp fp (x29), lr (x30), [sp, #-16]! + codes.push(( + offset, + UnwindCode::StackAlloc { + size: pair_size as u32, + }, + )); + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *rt, + stack_offset: 0, + }, + )); + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *rt2, + stack_offset: word_size as u32, + }, + )); + } + Inst::StoreP64 { + rt, + rt2, + mem: PairAMode::PreIndexed(rn, imm7), + .. + } if rn.to_reg() == regs::stack_reg() && imm7.value % (pair_size as i16) == 0 => { + // stp r1, r2, [sp, #(i * #16)] + let stack_offset = imm7.value as u32; + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *rt, + stack_offset, + }, + )); + if *rt2 != regs::zero_reg() { + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *rt2, + stack_offset: stack_offset + word_size as u32, + }, + )); + } + } + Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd, + rn, + imm12: + Imm12 { + bits: 0, + shift12: false, + }, + } if *rd == regs::writable_fp_reg() && *rn == regs::stack_reg() => { + // mov fp (x29), sp. 
+ codes.push((offset, UnwindCode::SetFramePointer { reg: rd.to_reg() })); + } + Inst::VirtualSPOffsetAdj { offset: adj } if offset > 0 => { + codes.push((offset, UnwindCode::StackAlloc { size: *adj as u32 })); + } + _ => {} + } + } + + // TODO epilogues + + let prologue_size = if context.prologue.is_empty() { + 0 + } else { + context.insts_layout[context.prologue.end as usize - 1] + }; + + Ok(Some(UnwindInfo { + prologue_size, + prologue_unwind_codes: codes, + epilogues_unwind_codes: vec![], + function_size: context.len, + word_size, + initial_sp_offset: 0, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ExternalName, Function, InstBuilder, Signature, StackSlotData, StackSlotKind}; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + fn test_simple_func() { + let isa = lookup(triple!("aarch64")) + .expect("expect aarch64 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let result = context.mach_compile_result.unwrap(); + let unwind_info = result.unwind_info.unwrap(); + + assert_eq!( + unwind_info, + UnwindInfo { + prologue_size: 12, + prologue_unwind_codes: vec![ + (4, UnwindCode::StackAlloc { size: 16 }), + ( + 4, + UnwindCode::SaveRegister { + reg: regs::fp_reg(), + stack_offset: 0 + } + ), + ( + 4, + UnwindCode::SaveRegister { + reg: regs::link_reg(), + stack_offset: 8 + } + ), + ( + 8, + UnwindCode::SetFramePointer { + reg: regs::fp_reg() + } + ) + ], + epilogues_unwind_codes: vec![], + function_size: 24, + word_size: 8, + initial_sp_offset: 0, + } + ); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs new file mode 100644 index 0000000000..b988314b1b --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/inst/unwind/systemv.rs @@ -0,0 +1,158 @@ +//! Unwind information for System V ABI (Aarch64). + +use crate::isa::aarch64::inst::regs; +use crate::isa::unwind::input; +use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo}; +use crate::result::CodegenResult; +use gimli::{write::CommonInformationEntry, Encoding, Format, Register}; +use regalloc::{Reg, RegClass}; + +/// Creates a new aarch64 common information entry (CIE). 
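+///
+/// The code alignment factor is 4 (every AArch64 instruction is 4 bytes), the
+/// data alignment factor is -8 (the stack grows down in 8-byte slots), and the
+/// return-address column is the link register (x30).
+///
+/// A minimal sketch of how a consumer might register this CIE with gimli
+/// (illustrative only; assumes `gimli::write::FrameTable` is available):
+///
+/// ```ignore
+/// let mut table = gimli::write::FrameTable::default();
+/// let cie_id = table.add_cie(create_cie());
+/// // Per-function FDEs are then added under `cie_id`.
+/// ```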
+pub fn create_cie() -> CommonInformationEntry { + use gimli::write::CallFrameInstruction; + + let mut entry = CommonInformationEntry::new( + Encoding { + address_size: 8, + format: Format::Dwarf32, + version: 1, + }, + 4, // Code alignment factor + -8, // Data alignment factor + Register(regs::link_reg().get_hw_encoding().into()), + ); + + // Every frame will start with the call frame address (CFA) at SP + let sp = Register(regs::stack_reg().get_hw_encoding().into()); + entry.add_instruction(CallFrameInstruction::Cfa(sp, 0)); + + entry +} + +/// Map Cranelift registers to their corresponding Gimli registers. +pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> { + match reg.get_class() { + RegClass::I64 => Ok(Register(reg.get_hw_encoding().into())), + _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")), + } +} + +pub(crate) fn create_unwind_info( + unwind: input::UnwindInfo<Reg>, +) -> CodegenResult<Option<UnwindInfo>> { + struct RegisterMapper; + impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper { + fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> { + Ok(map_reg(reg)?.0) + } + fn sp(&self) -> u16 { + regs::stack_reg().get_hw_encoding().into() + } + } + let map = RegisterMapper; + Ok(Some(UnwindInfo::build(unwind, &map)?)) +} + +#[cfg(test)] +mod tests { + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ + types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData, + StackSlotKind, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use gimli::write::Address; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + fn test_simple_func() { + let isa = lookup(triple!("aarch64")) + .expect("expect aarch64 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(1234)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 24, lsda: None, instructions: [(4, CfaOffset(16)), (4, Offset(Register(29), -16)), (4, Offset(Register(30), -8)), (8, CfaRegister(Register(29)))] }"); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } + + #[test] + fn test_multi_return_func() { + let isa = lookup(triple!("aarch64")) + .expect("expect aarch64 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(4321)) + } + _ => panic!("expected unwind information"), + }; + + 
assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 40, lsda: None, instructions: [(4, CfaOffset(16)), (4, Offset(Register(29), -16)), (4, Offset(Register(30), -8)), (8, CfaRegister(Register(29)))] }"); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = func.dfg.make_block(); + let block2 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().brnz(v0, block2, &[]); + pos.ins().jump(block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs new file mode 100644 index 0000000000..17555c1bd2 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower.rs @@ -0,0 +1,1196 @@ +//! Lowering rules for AArch64. +//! +//! TODO: opportunities for better code generation: +//! +//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize +//! pre/post-index opportunities. +//! +//! - Floating-point immediates (FIMM instruction). + +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::{Opcode, Type}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::CodegenResult; + +use crate::isa::aarch64::inst::*; +use crate::isa::aarch64::AArch64Backend; + +use super::lower_inst; + +use crate::data_value::DataValue; +use log::{debug, trace}; +use regalloc::{Reg, RegClass, Writable}; +use smallvec::SmallVec; + +//============================================================================ +// Result enum types. +// +// Lowering of a given value results in one of these enums, depending on the +// modes in which we can accept the value. + +/// A lowering result: register, register-shift. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRS { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), +} + +/// A lowering result: register, register-shift, register-extend. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRSE { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + RegExtend(Reg, ExtendOp), +} + +impl ResultRSE { + fn from_rs(rs: ResultRS) -> ResultRSE { + match rs { + ResultRS::Reg(r) => ResultRSE::Reg(r), + ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s), + } + } +} + +/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form. +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. 
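+///
+/// For example (illustrative): the constant operand of `iadd x, 42` lowers to
+/// `Imm12(42)`, an operand produced by `ishl y, 2` lowers to the
+/// register-shift form, and a plain value falls back to `Reg`.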
+#[derive(Clone, Debug)] +pub(crate) enum ResultRSEImm12 { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + RegExtend(Reg, ExtendOp), + Imm12(Imm12), +} + +impl ResultRSEImm12 { + fn from_rse(rse: ResultRSE) -> ResultRSEImm12 { + match rse { + ResultRSE::Reg(r) => ResultRSEImm12::Reg(r), + ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s), + ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e), + } + } +} + +/// A lowering result: register, register-shift, or logical immediate form. +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. +#[derive(Clone, Debug)] +pub(crate) enum ResultRSImmLogic { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + ImmLogic(ImmLogic), +} + +impl ResultRSImmLogic { + fn from_rs(rse: ResultRS) -> ResultRSImmLogic { + match rse { + ResultRS::Reg(r) => ResultRSImmLogic::Reg(r), + ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s), + } + } +} + +/// A lowering result: register or immediate shift amount (arg to a shift op). +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. +#[derive(Clone, Debug)] +pub(crate) enum ResultRegImmShift { + Reg(Reg), + ImmShift(ImmShift), +} + +//============================================================================ +// Lowering: convert instruction inputs to forms that we can use. + +/// Lower an instruction input to a 64-bit constant, if possible. +pub(crate) fn input_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, input: InsnInput) -> Option<u64> { + let input = ctx.get_input(input.insn, input.input); + input.constant +} + +/// Lower an instruction input to a constant register-shift amount, if possible. +pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, +) -> Option<ShiftOpShiftImm> { + input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift) +} + +pub(crate) fn const_param_to_u128<C: LowerCtx<I = Inst>>( + ctx: &mut C, + inst: IRInst, +) -> Option<u128> { + match ctx.get_immediate(inst) { + Some(DataValue::V128(bytes)) => Some(u128::from_le_bytes(bytes)), + _ => None, + } +} + +/// How to handle narrow values loaded into registers; see note on `narrow_mode` +/// parameter to `put_input_in_*` below. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum NarrowValueMode { + None, + /// Zero-extend to 32 bits if original is < 32 bits. + ZeroExtend32, + /// Sign-extend to 32 bits if original is < 32 bits. + SignExtend32, + /// Zero-extend to 64 bits if original is < 64 bits. + ZeroExtend64, + /// Sign-extend to 64 bits if original is < 64 bits. + SignExtend64, +} + +impl NarrowValueMode { + fn is_32bit(&self) -> bool { + match self { + NarrowValueMode::None => false, + NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true, + NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false, + } + } +} + +/// Lower an instruction input to a reg. +/// +/// The given register will be extended appropriately, according to +/// `narrow_mode` and the input's type. If extended, the value is +/// always extended to 64 bits, for simplicity. 
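+///
+/// A minimal sketch (illustrative; `ctx` and `inputs` follow the surrounding
+/// lowering code): lowering an `i8` input with `NarrowValueMode::ZeroExtend32`
+/// returns a register whose low 32 bits hold the zero-extended value, so a
+/// following 32-bit compare sees no stale high bits.
+///
+/// ```ignore
+/// let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend32);
+/// ```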
+pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> Reg { + debug!("put_input_in_reg: input {:?}", input); + let ty = ctx.input_ty(input.insn, input.input); + let from_bits = ty_bits(ty) as u8; + let inputs = ctx.get_input(input.insn, input.input); + let in_reg = if let Some(c) = inputs.constant { + // Generate constants fresh at each use to minimize long-range register pressure. + let masked = if from_bits < 64 { + c & ((1u64 << from_bits) - 1) + } else { + c + }; + let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(to_reg, masked, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + ctx.emit(inst); + } + to_reg.to_reg() + } else { + ctx.use_input_reg(inputs); + inputs.reg + }; + + match (narrow_mode, from_bits) { + (NarrowValueMode::None, _) => in_reg, + (NarrowValueMode::ZeroExtend32, n) if n < 32 => { + let tmp = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend32, n) if n < 32 => { + let tmp = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, + + (NarrowValueMode::ZeroExtend64, n) if n < 64 => { + if inputs.constant.is_some() { + // Constants are zero-extended to full 64-bit width on load already. + in_reg + } else { + let tmp = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + } + (NarrowValueMode::SignExtend64, n) if n < 64 => { + let tmp = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + (_, 64) => in_reg, + (_, 128) => in_reg, + + _ => panic!( + "Unsupported input width: input ty {} bits {} mode {:?}", + ty, from_bits, narrow_mode + ), + } +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// +/// The `narrow_mode` flag indicates whether the consumer of this value needs +/// the high bits clear. For many operations, such as an add/sub/mul or any +/// bitwise logical operation, the low-bit results depend only on the low-bit +/// inputs, so e.g. we can do an 8 bit add on 32 bit registers where the 8-bit +/// value is stored in the low 8 bits of the register and the high 24 bits are +/// undefined. If the op truly needs the high N bits clear (such as for a +/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be +/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting +/// register will be provided the extended value. +fn put_input_in_rs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRS { + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { + let op = ctx.data(insn).opcode(); + + if op == Opcode::Ishl { + let shiftee = InsnInput { insn, input: 0 }; + let shift_amt = InsnInput { insn, input: 1 }; + + // Can we get the shift amount as an immediate? 
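+            // For example (illustrative): an `ishl x, 3` feeding this operand folds
+            // into the consumer as a shifted register, so an add becomes
+            //   add rd, rn, rm, LSL #3
+            // instead of a separate shift followed by an add.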
+ if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) { + let shiftee_bits = ty_bits(ctx.input_ty(insn, 0)); + if shiftee_bits <= std::u8::MAX as usize { + let shiftimm = shiftimm.mask(shiftee_bits as u8); + let reg = put_input_in_reg(ctx, shiftee, narrow_mode); + return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); + } + } + } + } + + ResultRS::Reg(put_input_in_reg(ctx, input, narrow_mode)) +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// This does not actually codegen the source instruction; it just uses the +/// vreg into which the source instruction will generate its value. +/// +/// See note on `put_input_in_rs` for a description of `narrow_mode`. +fn put_input_in_rse<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSE { + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { + let op = ctx.data(insn).opcode(); + let out_ty = ctx.output_ty(insn, 0); + let out_bits = ty_bits(out_ty); + + // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator? + if op == Opcode::Uextend || op == Opcode::Sextend { + let sign_extend = op == Opcode::Sextend; + let inner_ty = ctx.input_ty(insn, 0); + let inner_bits = ty_bits(inner_ty); + assert!(inner_bits < out_bits); + if match (sign_extend, narrow_mode) { + // A single zero-extend or sign-extend is equal to itself. + (_, NarrowValueMode::None) => true, + // Two zero-extends or sign-extends in a row is equal to a single zero-extend or sign-extend. + (false, NarrowValueMode::ZeroExtend32) | (false, NarrowValueMode::ZeroExtend64) => { + true + } + (true, NarrowValueMode::SignExtend32) | (true, NarrowValueMode::SignExtend64) => { + true + } + // A zero-extend and a sign-extend in a row is not equal to a single zero-extend or sign-extend + (false, NarrowValueMode::SignExtend32) | (false, NarrowValueMode::SignExtend64) => { + false + } + (true, NarrowValueMode::ZeroExtend32) | (true, NarrowValueMode::ZeroExtend64) => { + false + } + } { + let extendop = match (sign_extend, inner_bits) { + (true, 8) => ExtendOp::SXTB, + (false, 8) => ExtendOp::UXTB, + (true, 16) => ExtendOp::SXTH, + (false, 16) => ExtendOp::UXTH, + (true, 32) => ExtendOp::SXTW, + (false, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + let reg = + put_input_in_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None); + return ResultRSE::RegExtend(reg, extendop); + } + } + + // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend, + // then get the result into a register and return an Extend-mode operand on + // that register. 
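+        // For example (illustrative): an `i8` operand consumed with
+        // `NarrowValueMode::ZeroExtend64` is returned as `RegExtend(reg, UXTB)`,
+        // so the consuming add/sub performs the extension for free.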
+ if narrow_mode != NarrowValueMode::None + && ((narrow_mode.is_32bit() && out_bits < 32) + || (!narrow_mode.is_32bit() && out_bits < 64)) + { + let reg = put_input_in_reg(ctx, input, NarrowValueMode::None); + let extendop = match (narrow_mode, out_bits) { + (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => { + ExtendOp::SXTH + } + (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => { + ExtendOp::UXTH + } + (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW, + (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + return ResultRSE::RegExtend(reg, extendop); + } + } + + ResultRSE::from_rs(put_input_in_rs(ctx, input, narrow_mode)) +} + +pub(crate) fn put_input_in_rse_imm12<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSEImm12 { + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + return ResultRSEImm12::Imm12(i); + } + } + + ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)) +} + +/// Like `put_input_in_rse_imm12` above, except is allowed to negate the +/// argument (assuming a two's-complement representation with the given bit +/// width) if this allows use of 12-bit immediate. Used to flip `add`s with +/// negative immediates to `sub`s (and vice-versa). 
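+///
+/// Worked example (illustrative): for `iadd v, -3` on an `i32`, the constant
+/// `-3` does not fit an unsigned 12-bit immediate, but its negation `3` does,
+/// so this returns `(Imm12(3), true)` and the caller emits a `sub` instead of
+/// an `add`.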
+pub(crate) fn put_input_in_rse_imm12_maybe_negated<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + twos_complement_bits: usize, + narrow_mode: NarrowValueMode, +) -> (ResultRSEImm12, bool) { + assert!(twos_complement_bits <= 64); + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + return (ResultRSEImm12::Imm12(i), false); + } + let sign_extended = + ((imm_value as i64) << (64 - twos_complement_bits)) >> (64 - twos_complement_bits); + let inverted = sign_extended.wrapping_neg(); + if let Some(i) = Imm12::maybe_from_u64(inverted as u64) { + return (ResultRSEImm12::Imm12(i), true); + } + } + + ( + ResultRSEImm12::from_rse(put_input_in_rse(ctx, input, narrow_mode)), + false, + ) +} + +pub(crate) fn put_input_in_rs_immlogic<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSImmLogic { + if let Some(imm_value) = input_to_const(ctx, input) { + let ty = ctx.input_ty(input.insn, input.input); + let ty = if ty_bits(ty) < 32 { I32 } else { ty }; + if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { + return ResultRSImmLogic::ImmLogic(i); + } + } + + ResultRSImmLogic::from_rs(put_input_in_rs(ctx, input, narrow_mode)) +} + +pub(crate) fn put_input_in_reg_immshift<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + shift_width_bits: usize, +) -> ResultRegImmShift { + if let Some(imm_value) = input_to_const(ctx, input) { + let imm_value = imm_value & ((shift_width_bits - 1) as u64); + if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { + return ResultRegImmShift::ImmShift(immshift); + } + } + + ResultRegImmShift::Reg(put_input_in_reg(ctx, input, NarrowValueMode::None)) +} + +//============================================================================ +// ALU instruction constructors. + +pub(crate) fn alu_inst_imm12(op: ALUOp, rd: Writable<Reg>, rn: Reg, rm: ResultRSEImm12) -> Inst { + match rm { + ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 { + alu_op: op, + rd, + rn, + imm12, + }, + ResultRSEImm12::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend { + alu_op: op, + rd, + rn, + rm, + extendop, + }, + } +} + +pub(crate) fn alu_inst_immlogic( + op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: ResultRSImmLogic, +) -> Inst { + match rm { + ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic { + alu_op: op, + rd, + rn, + imml, + }, + ResultRSImmLogic::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + } +} + +pub(crate) fn alu_inst_immshift( + op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: ResultRegImmShift, +) -> Inst { + match rm { + ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift { + alu_op: op, + rd, + rn, + immshift, + }, + ResultRegImmShift::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + } +} + +//============================================================================ +// Lowering: addressing mode support. Takes instruction directly, rather +// than an `InsnInput`, to do more introspection. + +/// 32-bit addends that make up an address: an input, and an extension mode on that +/// input. 
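+///
+/// For example (illustrative): an address computed as
+/// `iadd(base_i64, uextend(index_i32))` contributes `base_i64` to the 64-bit
+/// addend list and `(index_i32, ExtendOp::UXTW)` to this list, which can later
+/// fold into a `RegExtended` addressing mode.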
+type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>; +/// 64-bit addends that make up an address: just an input. +type AddressAddend64List = SmallVec<[Reg; 4]>; + +/// Collect all addends that feed into an address computation, with extend-modes +/// on each. Note that a load/store may have multiple address components (and +/// the CLIF semantics are that these components are added to form the final +/// address), but sometimes the CLIF that we receive still has arguments that +/// refer to `iadd` instructions. We also want to handle uextend/sextend below +/// the add(s). +/// +/// We match any 64-bit add (and descend into its inputs), and we match any +/// 32-to-64-bit sign or zero extension. The returned addend-list will use +/// NarrowValueMode values to indicate how to extend each input: +/// +/// - NarrowValueMode::None: the associated input is 64 bits wide; no extend. +/// - NarrowValueMode::SignExtend64: the associated input is 32 bits wide; +/// do a sign-extension. +/// - NarrowValueMode::ZeroExtend64: the associated input is 32 bits wide; +/// do a zero-extension. +/// +/// We do not descend further into the inputs of extensions (unless it is a constant), +/// because supporting (e.g.) a 32-bit add that is later extended would require +/// additional masking of high-order bits, which is too complex. So, in essence, we +/// descend any number of adds from the roots, collecting all 64-bit address addends; +/// then possibly support extensions at these leaves. +fn collect_address_addends<C: LowerCtx<I = Inst>>( + ctx: &mut C, + roots: &[InsnInput], +) -> (AddressAddend64List, AddressAddend32List, i64) { + let mut result32: AddressAddend32List = SmallVec::new(); + let mut result64: AddressAddend64List = SmallVec::new(); + let mut offset: i64 = 0; + + let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect(); + + while let Some(input) = workqueue.pop() { + debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64); + if let Some((op, insn)) = maybe_input_insn_multi( + ctx, + input, + &[ + Opcode::Uextend, + Opcode::Sextend, + Opcode::Iadd, + Opcode::Iconst, + ], + ) { + match op { + Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => { + let extendop = if op == Opcode::Uextend { + ExtendOp::UXTW + } else { + ExtendOp::SXTW + }; + let extendee_input = InsnInput { insn, input: 0 }; + // If the input is a zero-extension of a constant, add the value to the known + // offset. + // Only do this for zero-extension, as generating a sign-extended + // constant may be more instructions than using the 'SXTW' addressing mode. 
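+                    // For example (illustrative): `uextend(iconst 0x20)` simply adds
+                    // 0x20 to `offset` here instead of occupying an addend slot.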
+ if let (Some(insn), ExtendOp::UXTW) = ( + maybe_input_insn(ctx, extendee_input, Opcode::Iconst), + extendop, + ) { + let value = (ctx.get_constant(insn).unwrap() & 0xFFFF_FFFF_u64) as i64; + offset += value; + } else { + let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None); + result32.push((reg, extendop)); + } + } + Opcode::Uextend | Opcode::Sextend => { + let reg = put_input_in_reg(ctx, input, NarrowValueMode::None); + result64.push(reg); + } + Opcode::Iadd => { + for input in 0..ctx.num_inputs(insn) { + let addend = InsnInput { insn, input }; + workqueue.push(addend); + } + } + Opcode::Iconst => { + let value: i64 = ctx.get_constant(insn).unwrap() as i64; + offset += value; + } + _ => panic!("Unexpected opcode from maybe_input_insn_multi"), + } + } else { + let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64); + result64.push(reg); + } + } + + (result64, result32, offset) +} + +/// Lower the address of a load or store. +pub(crate) fn lower_address<C: LowerCtx<I = Inst>>( + ctx: &mut C, + elem_ty: Type, + roots: &[InsnInput], + offset: i32, +) -> AMode { + // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or + // mul instructions (Load/StoreComplex don't include scale factors). + + // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero + // extends and addition ops. We update these as we consume address + // components, so they represent the remaining addends not yet handled. + let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots); + let mut offset = args_offset + (offset as i64); + + trace!( + "lower_address: addends64 {:?}, addends32 {:?}, offset {}", + addends64, + addends32, + offset + ); + + // First, decide what the `AMode` will be. Take one extendee and one 64-bit + // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension, + // or some other combination as appropriate. + let memarg = if addends64.len() > 0 { + if addends32.len() > 0 { + let (reg32, extendop) = addends32.pop().unwrap(); + let reg64 = addends64.pop().unwrap(); + AMode::RegExtended(reg64, reg32, extendop) + } else if offset > 0 && offset < 0x1000 { + let reg64 = addends64.pop().unwrap(); + let off = offset; + offset = 0; + AMode::RegOffset(reg64, off, elem_ty) + } else if addends64.len() >= 2 { + let reg1 = addends64.pop().unwrap(); + let reg2 = addends64.pop().unwrap(); + AMode::RegReg(reg1, reg2) + } else { + let reg1 = addends64.pop().unwrap(); + AMode::reg(reg1) + } + } else + /* addends64.len() == 0 */ + { + if addends32.len() > 0 { + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let (reg1, extendop) = addends32.pop().unwrap(); + let signed = match extendop { + ExtendOp::SXTW => true, + ExtendOp::UXTW => false, + _ => unreachable!(), + }; + ctx.emit(Inst::Extend { + rd: tmp, + rn: reg1, + signed, + from_bits: 32, + to_bits: 64, + }); + if let Some((reg2, extendop)) = addends32.pop() { + AMode::RegExtended(tmp.to_reg(), reg2, extendop) + } else { + AMode::reg(tmp.to_reg()) + } + } else + /* addends32.len() == 0 */ + { + let off_reg = ctx.alloc_tmp(RegClass::I64, I64); + lower_constant_u64(ctx, off_reg, offset as u64); + offset = 0; + AMode::reg(off_reg.to_reg()) + } + }; + + // At this point, if we have any remaining components, we need to allocate a + // temp, replace one of the registers in the AMode with the temp, and emit + // instructions to add together the remaining components. Return immediately + // if this is *not* the case. 
+ if offset == 0 && addends32.len() == 0 && addends64.len() == 0 { + return memarg; + } + + // Allocate the temp and shoehorn it into the AMode. + let addr = ctx.alloc_tmp(RegClass::I64, I64); + let (reg, memarg) = match memarg { + AMode::RegExtended(r1, r2, extendop) => { + (r1, AMode::RegExtended(addr.to_reg(), r2, extendop)) + } + AMode::RegOffset(r, off, ty) => (r, AMode::RegOffset(addr.to_reg(), off, ty)), + AMode::RegReg(r1, r2) => (r2, AMode::RegReg(addr.to_reg(), r1)), + AMode::UnsignedOffset(r, imm) => (r, AMode::UnsignedOffset(addr.to_reg(), imm)), + _ => unreachable!(), + }; + + // If there is any offset, load that first into `addr`, and add the `reg` + // that we kicked out of the `AMode`; otherwise, start with that reg. + if offset != 0 { + // If we can fit offset or -offset in an imm12, use an add-imm + // to combine the reg and offset. Otherwise, load value first then add. + if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: addr, + rn: reg, + imm12, + }); + } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: addr, + rn: reg, + imm12, + }); + } else { + lower_constant_u64(ctx, addr, offset as u64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: addr, + rn: addr.to_reg(), + rm: reg, + }); + } + } else { + ctx.emit(Inst::gen_move(addr, reg, I64)); + } + + // Now handle reg64 and reg32-extended components. + for reg in addends64 { + // If the register is the stack reg, we must move it to another reg + // before adding it. + let reg = if reg == stack_reg() { + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + ctx.emit(Inst::gen_move(tmp, stack_reg(), I64)); + tmp.to_reg() + } else { + reg + }; + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: addr, + rn: addr.to_reg(), + rm: reg, + }); + } + for (reg, extendop) in addends32 { + assert!(reg != stack_reg()); + ctx.emit(Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: addr, + rn: addr.to_reg(), + rm: reg, + extendop, + }); + } + + memarg +} + +pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + value: u64, +) { + for inst in Inst::load_constant(rd, value) { + ctx.emit(inst); + } +} + +pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + value: f32, +) { + let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + + for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) { + ctx.emit(inst); + } +} + +pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + value: f64, +) { + let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + + for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) { + ctx.emit(inst); + } +} + +pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + value: u128, +) { + if value == 0 { + // Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`, + // is potentially expensive. 
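+        // Illustrative note: this emits a single `movi vN.16b, #0`, which zeroes
+        // the whole 128-bit register in one instruction.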
+ ctx.emit(Inst::VecDupImm { + rd, + imm: ASIMDMovModImm::zero(), + invert: false, + size: VectorSize::Size8x16, + }); + } else { + let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) { + ctx.emit(inst); + } + } +} + +pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + value: u64, + size: VectorSize, +) { + let (value, narrow_size) = match size.lane_size() { + ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128), + ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8), + ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16), + ScalarSize::Size64 => (value, ScalarSize::Size32), + _ => unreachable!(), + }; + let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) { + Some((value, lane_size)) => ( + value, + VectorSize::from_lane_size(lane_size, size.is_128bits()), + ), + None => (value, size), + }; + let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty); + + for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) { + ctx.emit(inst); + } +} + +pub(crate) fn lower_condcode(cc: IntCC) -> Cond { + match cc { + IntCC::Equal => Cond::Eq, + IntCC::NotEqual => Cond::Ne, + IntCC::SignedGreaterThanOrEqual => Cond::Ge, + IntCC::SignedGreaterThan => Cond::Gt, + IntCC::SignedLessThanOrEqual => Cond::Le, + IntCC::SignedLessThan => Cond::Lt, + IntCC::UnsignedGreaterThanOrEqual => Cond::Hs, + IntCC::UnsignedGreaterThan => Cond::Hi, + IntCC::UnsignedLessThanOrEqual => Cond::Ls, + IntCC::UnsignedLessThan => Cond::Lo, + IntCC::Overflow => Cond::Vs, + IntCC::NotOverflow => Cond::Vc, + } +} + +pub(crate) fn lower_fp_condcode(cc: FloatCC) -> Cond { + // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs. + // The FCMP instruction sets: + // NZCV + // - PCSR.NZCV = 0011 on UN (unordered), + // 0110 on EQ, + // 1000 on LT, + // 0010 on GT. + match cc { + // EQ | LT | GT. Vc => V clear. + FloatCC::Ordered => Cond::Vc, + // UN. Vs => V set. + FloatCC::Unordered => Cond::Vs, + // EQ. Eq => Z set. + FloatCC::Equal => Cond::Eq, + // UN | LT | GT. Ne => Z clear. + FloatCC::NotEqual => Cond::Ne, + // LT | GT. + FloatCC::OrderedNotEqual => unimplemented!(), + // UN | EQ + FloatCC::UnorderedOrEqual => unimplemented!(), + // LT. Mi => N set. + FloatCC::LessThan => Cond::Mi, + // LT | EQ. Ls => C clear or Z set. + FloatCC::LessThanOrEqual => Cond::Ls, + // GT. Gt => Z clear, N = V. + FloatCC::GreaterThan => Cond::Gt, + // GT | EQ. Ge => N = V. + FloatCC::GreaterThanOrEqual => Cond::Ge, + // UN | LT + FloatCC::UnorderedOrLessThan => unimplemented!(), + // UN | LT | EQ + FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(), + // UN | GT + FloatCC::UnorderedOrGreaterThan => unimplemented!(), + // UN | GT | EQ + FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(), + } +} + +pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>( + ctx: &mut C, + rd: Writable<Reg>, + mut rn: Reg, + mut rm: Reg, + ty: Type, + cond: Cond, +) -> CodegenResult<()> { + let is_float = match ty { + F32X4 | F64X2 => true, + _ => false, + }; + let size = VectorSize::from_ty(ty); + // 'Less than' operations are implemented by swapping + // the order of operands and using the 'greater than' + // instructions. + // 'Not equal' is implemented with 'equal' and inverting + // the result. 
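+    // For example (illustrative): an integer less-than compare becomes
+    //   cmgt rd, rm, rn        ; operands swapped
+    // and a not-equal compare becomes
+    //   cmeq rd, rn, rm
+    //   mvn  rd.16b, rd.16b    ; invert the result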
+ let (alu_op, swap) = match (is_float, cond) { + (false, Cond::Eq) => (VecALUOp::Cmeq, false), + (false, Cond::Ne) => (VecALUOp::Cmeq, false), + (false, Cond::Ge) => (VecALUOp::Cmge, false), + (false, Cond::Gt) => (VecALUOp::Cmgt, false), + (false, Cond::Le) => (VecALUOp::Cmge, true), + (false, Cond::Lt) => (VecALUOp::Cmgt, true), + (false, Cond::Hs) => (VecALUOp::Cmhs, false), + (false, Cond::Hi) => (VecALUOp::Cmhi, false), + (false, Cond::Ls) => (VecALUOp::Cmhs, true), + (false, Cond::Lo) => (VecALUOp::Cmhi, true), + (true, Cond::Eq) => (VecALUOp::Fcmeq, false), + (true, Cond::Ne) => (VecALUOp::Fcmeq, false), + (true, Cond::Mi) => (VecALUOp::Fcmgt, true), + (true, Cond::Ls) => (VecALUOp::Fcmge, true), + (true, Cond::Ge) => (VecALUOp::Fcmge, false), + (true, Cond::Gt) => (VecALUOp::Fcmgt, false), + _ => unreachable!(), + }; + + if swap { + std::mem::swap(&mut rn, &mut rm); + } + + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm, + size, + }); + + if cond == Cond::Ne { + ctx.emit(Inst::VecMisc { + op: VecMisc2::Not, + rd, + rn: rd.to_reg(), + size, + }); + } + + Ok(()) +} + +/// Determines whether this condcode interprets inputs as signed or unsigned. See the +/// documentation for the `icmp` instruction in cranelift-codegen/meta/src/shared/instructions.rs +/// for further insights into this. +pub(crate) fn condcode_is_signed(cc: IntCC) -> bool { + match cc { + IntCC::Equal + | IntCC::UnsignedGreaterThanOrEqual + | IntCC::UnsignedGreaterThan + | IntCC::UnsignedLessThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::NotEqual => false, + IntCC::SignedGreaterThanOrEqual + | IntCC::SignedGreaterThan + | IntCC::SignedLessThanOrEqual + | IntCC::SignedLessThan + | IntCC::Overflow + | IntCC::NotOverflow => true, + } +} + +//============================================================================= +// Helpers for instruction lowering. + +pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T { + let bits = ty_bits(ty); + if bits <= 32 { + op32 + } else if bits == 64 { + op64 + } else { + panic!("choose_32_64 on > 64 bits!") + } +} + +/// Checks for an instance of `op` feeding the given input. +pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>( + c: &mut C, + input: InsnInput, + op: Opcode, +) -> Option<IRInst> { + let inputs = c.get_input(input.insn, input.input); + debug!( + "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}", + input, inputs, op + ); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + debug!(" -> input inst {:?}", data); + if data.opcode() == op { + return Some(src_inst); + } + } + None +} + +/// Checks for an instance of any one of `ops` feeding the given input. +pub(crate) fn maybe_input_insn_multi<C: LowerCtx<I = Inst>>( + c: &mut C, + input: InsnInput, + ops: &[Opcode], +) -> Option<(Opcode, IRInst)> { + for &op in ops { + if let Some(inst) = maybe_input_insn(c, input, op) { + return Some((op, inst)); + } + } + None +} + +/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g., +/// Bint or a bitcast). +/// +/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it +/// a bit more generic. 
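+///
+/// For example (illustrative): `maybe_input_insn_via_conv(ctx, flag_input,
+/// Opcode::Icmp, Opcode::Bint)` finds an `icmp` that reaches this input either
+/// directly or through a `bint` conversion.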
+pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>( + c: &mut C, + input: InsnInput, + op: Opcode, + conv: Opcode, +) -> Option<IRInst> { + let inputs = c.get_input(input.insn, input.input); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } + if data.opcode() == conv { + let inputs = c.get_input(src_inst, 0); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } + } + } + } + None +} + +pub(crate) fn lower_icmp_or_ifcmp_to_flags<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, + is_signed: bool, +) { + debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn); + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let ty = ctx.input_ty(insn, 0); + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode); + debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm); + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); +} + +pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + match bits { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Unknown float size"), + } +} + +/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0 +/// / -1 (all-ones) result as expected for bool operations. +pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, + rd: Writable<Reg>, +) { + // A boolean is 0 / -1; if output width is > 1, negate. + if ty_bits(ctx.output_ty(insn, 0)) > 1 { + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm: rd.to_reg(), + }); + } +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for AArch64Backend { + type MInst = Inst; + + fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_inst::lower_insn_to_regs(ctx, ir_inst) + } + + fn lower_branch_group<C: LowerCtx<I = Inst>>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + lower_inst::lower_branch(ctx, branches, targets, fallthrough) + } + + fn maybe_pinned_reg(&self) -> Option<Reg> { + Some(xreg(PINNED_REG)) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs new file mode 100644 index 0000000000..faa89d3b98 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/lower_inst.rs @@ -0,0 +1,3409 @@ +//! Lower a single Cranelift instruction into vcode. 
+ +use crate::binemit::CodeOffset; +use crate::ir::condcodes::FloatCC; +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::{InstructionData, Opcode, TrapCode}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::{CodegenError, CodegenResult}; + +use crate::isa::aarch64::abi::*; +use crate::isa::aarch64::inst::*; + +use regalloc::{RegClass, Writable}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::convert::TryFrom; +use smallvec::SmallVec; + +use super::lower::*; + +/// This is target-word-size dependent. And it excludes booleans and reftypes. +fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + I8 | I16 | I32 | I64 => true, + _ => false, + } +} + +/// Actually codegen an instruction's results into registers. +pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, +) -> CodegenResult<()> { + let op = ctx.data(insn).opcode(); + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = ctx.get_constant(insn).unwrap(); + // Sign extend constant if necessary + let value = match ty.unwrap() { + I8 => (((value as i64) << 56) >> 56) as u64, + I16 => (((value as i64) << 48) >> 48) as u64, + I32 => (((value as i64) << 32) >> 32) as u64, + I64 | R64 => value, + ty if ty.is_bool() => value, + ty => unreachable!("Unknown type for const: {}", ty), + }; + let rd = get_output_reg(ctx, outputs[0]); + lower_constant_u64(ctx, rd, value); + } + Opcode::F32const => { + let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32); + let rd = get_output_reg(ctx, outputs[0]); + lower_constant_f32(ctx, rd, value); + } + Opcode::F64const => { + let value = f64::from_bits(ctx.get_constant(insn).unwrap()); + let rd = get_output_reg(ctx, outputs[0]); + lower_constant_f64(ctx, rd, value); + } + Opcode::Iadd => { + let rd = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if !ty.is_vector() { + let mul_insn = + if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) { + Some((mul_insn, 0)) + } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) { + Some((mul_insn, 1)) + } else { + None + }; + // If possible combine mul + add into madd. 
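+                // For example (illustrative; register names are placeholders):
+                // `iadd(imul(a, b), c)` lowers to a single
+                //   madd rd, ra, rb, rc
+                // rather than a `mul` followed by an `add`.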
+ if let Some((insn, addend_idx)) = mul_insn { + let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); + let rn_input = InsnInput { insn, input: 0 }; + let rm_input = InsnInput { insn, input: 1 }; + + let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None); + let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None); + let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None); + + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + }); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let (rm, negated) = put_input_in_rse_imm12_maybe_negated( + ctx, + inputs[1], + ty_bits(ty), + NarrowValueMode::None, + ); + let alu_op = if !negated { + choose_32_64(ty, ALUOp::Add32, ALUOp::Add64) + } else { + choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64) + }; + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + } else { + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::VecRRR { + rd, + rn, + rm, + alu_op: VecALUOp::Add, + size: VectorSize::from_ty(ty), + }); + } + } + Opcode::Isub => { + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + if !ty.is_vector() { + let (rm, negated) = put_input_in_rse_imm12_maybe_negated( + ctx, + inputs[1], + ty_bits(ty), + NarrowValueMode::None, + ); + let alu_op = if !negated { + choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64) + } else { + choose_32_64(ty, ALUOp::Add32, ALUOp::Add64) + }; + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } else { + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + ctx.emit(Inst::VecRRR { + rd, + rn, + rm, + alu_op: VecALUOp::Sub, + size: VectorSize::from_ty(ty), + }); + } + } + Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => { + // We use the scalar SIMD & FP saturating additions and subtractions + // (SQADD / UQADD / SQSUB / UQSUB), which require scalar FP registers. 
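+            // Illustrative scalar sequence for `uadd_sat` (register names are
+            // placeholders):
+            //   fmov d0, x0        ; move LHS into a SIMD/FP register
+            //   fmov d1, x1        ; move RHS into a SIMD/FP register
+            //   uqadd d0, d0, d1   ; saturating add
+            //   mov  x0, v0.d[0]   ; move the result back to a GPR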
+ let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat; + let ty = ty.unwrap(); + let rd = get_output_reg(ctx, outputs[0]); + if !ty.is_vector() { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let fpu_op = match op { + Opcode::UaddSat => FPUOp2::Uqadd64, + Opcode::SaddSat => FPUOp2::Sqadd64, + Opcode::UsubSat => FPUOp2::Uqsub64, + Opcode::SsubSat => FPUOp2::Sqsub64, + _ => unreachable!(), + }; + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); + let ra = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rb = put_input_in_reg(ctx, inputs[1], narrow_mode); + ctx.emit(Inst::MovToFpu { + rd: va, + rn: ra, + size: ScalarSize::Size64, + }); + ctx.emit(Inst::MovToFpu { + rd: vb, + rn: rb, + size: ScalarSize::Size64, + }); + ctx.emit(Inst::FpuRRR { + fpu_op, + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + }); + ctx.emit(Inst::MovFromVec { + rd, + rn: va.to_reg(), + idx: 0, + size: VectorSize::Size64x2, + }); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + + let alu_op = match op { + Opcode::UaddSat => VecALUOp::Uqadd, + Opcode::SaddSat => VecALUOp::Sqadd, + Opcode::UsubSat => VecALUOp::Uqsub, + Opcode::SsubSat => VecALUOp::Sqsub, + _ => unreachable!(), + }; + + ctx.emit(Inst::VecRRR { + rd, + rn, + rm, + alu_op, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Ineg => { + let rd = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if !ty.is_vector() { + let rn = zero_reg(); + let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None); + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::VecMisc { + op: VecMisc2::Neg, + rd, + rn, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Imul => { + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + if !ty.is_vector() { + let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); + } else { + if ty == I64X2 { + let tmp1 = ctx.alloc_tmp(RegClass::V128, I64X2); + let tmp2 = ctx.alloc_tmp(RegClass::V128, I64X2); + + // This I64X2 multiplication is performed with several 32-bit + // operations. + + // 64-bit numbers x and y, can be represented as: + // x = a + 2^32(b) + // y = c + 2^32(d) + + // A 64-bit multiplication is: + // x * y = ac + 2^32(ad + bc) + 2^64(bd) + // note: `2^64(bd)` can be ignored, the value is too large to fit in + // 64 bits. + + // This sequence implements a I64X2 multiply, where the registers + // `rn` and `rm` are split up into 32-bit components: + // rn = |d|c|b|a| + // rm = |h|g|f|e| + // + // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)| + // + // The sequence is: + // rev64 rd.4s, rm.4s + // mul rd.4s, rd.4s, rn.4s + // xtn tmp1.2s, rn.2d + // addp rd.4s, rd.4s, rd.4s + // xtn tmp2.2s, rm.2d + // shll rd.2d, rd.2s, #32 + // umlal rd.2d, tmp2.2s, tmp1.2s + + // Reverse the 32-bit elements in the 64-bit words. + // rd = |g|h|e|f| + ctx.emit(Inst::VecMisc { + op: VecMisc2::Rev64, + rd, + rn: rm, + size: VectorSize::Size32x4, + }); + + // Calculate the high half components. 
+ // rd = |dg|ch|be|af| + // + // Note that this 32-bit multiply of the high half + // discards the bits that would overflow, same as + // if 64-bit operations were used. Also the Shll + // below would shift out the overflow bits anyway. + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd, + rn: rd.to_reg(), + rm: rn, + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rn. + // tmp1 = |c|a| + ctx.emit(Inst::VecMiscNarrow { + op: VecMiscNarrowOp::Xtn, + rd: tmp1, + rn, + size: VectorSize::Size32x2, + high_half: false, + }); + + // Sum the respective high half components. + // rd = |dg+ch|be+af||dg+ch|be+af| + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: rd, + rn: rd.to_reg(), + rm: rd.to_reg(), + size: VectorSize::Size32x4, + }); + + // Extract the low half components of rm. + // tmp2 = |g|e| + ctx.emit(Inst::VecMiscNarrow { + op: VecMiscNarrowOp::Xtn, + rd: tmp2, + rn: rm, + size: VectorSize::Size32x2, + high_half: false, + }); + + // Shift the high half components, into the high half. + // rd = |dg+ch << 32|be+af << 32| + ctx.emit(Inst::VecMisc { + op: VecMisc2::Shll, + rd, + rn: rd.to_reg(), + size: VectorSize::Size32x2, + }); + + // Multiply the low components together, and accumulate with the high + // half. + // rd = |rd[1] + cg|rd[0] + ae| + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Umlal, + rd, + rn: tmp2.to_reg(), + rm: tmp1.to_reg(), + size: VectorSize::Size32x2, + }); + } else { + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Mul, + rd, + rn, + rm, + size: VectorSize::from_ty(ty), + }); + } + } + } + + Opcode::Umulhi | Opcode::Smulhi => { + let rd = get_output_reg(ctx, outputs[0]); + let is_signed = op == Opcode::Smulhi; + let input_ty = ctx.input_ty(insn, 0); + assert!(ctx.input_ty(insn, 1) == input_ty); + assert!(ctx.output_ty(insn, 0) == input_ty); + + match input_ty { + I64 => { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let alu_op = if is_signed { + ALUOp::SMulH + } else { + ALUOp::UMulH + }; + ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm }); + } + I32 | I16 | I8 => { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rm = put_input_in_reg(ctx, inputs[1], narrow_mode); + let ra = zero_reg(); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd, + rn, + rm, + ra, + }); + let shift_op = if is_signed { + ALUOp::Asr64 + } else { + ALUOp::Lsr64 + }; + let shift_amt = match input_ty { + I32 => 32, + I16 => 16, + I8 => 8, + _ => unreachable!(), + }; + ctx.emit(Inst::AluRRImmShift { + alu_op: shift_op, + rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(), + }); + } + _ => { + panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty); + } + } + } + + Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => { + let is_signed = match op { + Opcode::Udiv | Opcode::Urem => false, + Opcode::Sdiv | Opcode::Srem => true, + _ => unreachable!(), + }; + let is_rem = match op { + Opcode::Udiv | Opcode::Sdiv => false, + Opcode::Urem | Opcode::Srem => true, + _ => unreachable!(), + }; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + // TODO: Add SDiv32 to implement 32-bit directly, rather + // than extending the input. 
+ let div_op = if is_signed { + ALUOp::SDiv64 + } else { + ALUOp::UDiv64 + }; + + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rm = put_input_in_reg(ctx, inputs[1], narrow_mode); + // The div instruction does not trap on divide by zero or signed overflow + // so checks are inserted below. + // + // div rd, rn, rm + ctx.emit(Inst::AluRRR { + alu_op: div_op, + rd, + rn, + rm, + }); + + if is_rem { + // Remainder (rn % rm) is implemented as: + // + // tmp = rn / rm + // rd = rn - (tmp*rm) + // + // use 'rd' for tmp and you have: + // + // div rd, rn, rm ; rd = rn / rm + // cbnz rm, #8 ; branch over trap + // udf ; divide by zero + // msub rd, rd, rm, rn ; rd = rn - rd * rm + + // Check for divide by 0. + let trap_code = TrapCode::IntegerDivisionByZero; + ctx.emit(Inst::TrapIf { + trap_code, + kind: CondBrKind::Zero(rm), + }); + + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MSub64, + rd: rd, + rn: rd.to_reg(), + rm: rm, + ra: rn, + }); + } else { + if div_op == ALUOp::SDiv64 { + // cbnz rm, #8 + // udf ; divide by zero + // cmn rm, 1 + // ccmp rn, 1, #nzcv, eq + // b.vc #8 + // udf ; signed overflow + + // Check for divide by 0. + let trap_code = TrapCode::IntegerDivisionByZero; + ctx.emit(Inst::TrapIf { + trap_code, + kind: CondBrKind::Zero(rm), + }); + + // Check for signed overflow. The only case is min_value / -1. + let ty = ty.unwrap(); + // The following checks must be done in 32-bit or 64-bit, depending + // on the input type. Even though the initial div instruction is + // always done in 64-bit currently. + let size = OperandSize::from_ty(ty); + // Check RHS is -1. + ctx.emit(Inst::AluRRImm12 { + alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), + rd: writable_zero_reg(), + rn: rm, + imm12: Imm12::maybe_from_u64(1).unwrap(), + }); + // Check LHS is min_value, by subtracting 1 and branching if + // there is overflow. + ctx.emit(Inst::CCmpImm { + size, + rn, + imm: UImm5::maybe_from_u8(1).unwrap(), + nzcv: NZCV::new(false, false, false, false), + cond: Cond::Eq, + }); + let trap_code = TrapCode::IntegerOverflow; + ctx.emit(Inst::TrapIf { + trap_code, + kind: CondBrKind::Cond(Cond::Vs), + }); + } else { + // cbnz rm, #8 + // udf ; divide by zero + + // Check for divide by 0. + let trap_code = TrapCode::IntegerDivisionByZero; + ctx.emit(Inst::TrapIf { + trap_code, + kind: CondBrKind::Zero(rm), + }); + } + } + } + + Opcode::Uextend | Opcode::Sextend => { + let output_ty = ty.unwrap(); + let input_ty = ctx.input_ty(insn, 0); + let from_bits = ty_bits(input_ty) as u8; + let to_bits = ty_bits(output_ty) as u8; + let to_bits = std::cmp::max(32, to_bits); + assert!(from_bits <= to_bits); + if from_bits < to_bits { + let signed = op == Opcode::Sextend; + let rd = get_output_reg(ctx, outputs[0]); + + if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) { + let idx = + if let InstructionData::BinaryImm8 { imm, .. 
} = ctx.data(extract_insn) { + *imm + } else { + unreachable!(); + }; + let input = InsnInput { + insn: extract_insn, + input: 0, + }; + let rn = put_input_in_reg(ctx, input, NarrowValueMode::None); + let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0)); + + if signed { + let scalar_size = OperandSize::from_ty(output_ty); + + ctx.emit(Inst::MovFromVecSigned { + rd, + rn, + idx, + size, + scalar_size, + }); + } else { + ctx.emit(Inst::MovFromVec { rd, rn, idx, size }); + } + } else { + // If we reach this point, we weren't able to incorporate the extend as + // a register-mode on another instruction, so we have a 'None' + // narrow-value/extend mode here, and we emit the explicit instruction. + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + }); + } + } + } + + Opcode::Bnot => { + let rd = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if !ty.is_vector() { + let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None); + let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64); + // NOT rd, rm ==> ORR_NOT rd, zero, rm + ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm)); + } else { + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::VecMisc { + op: VecMisc2::Not, + rd, + rn: rm, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Band + | Opcode::Bor + | Opcode::Bxor + | Opcode::BandNot + | Opcode::BorNot + | Opcode::BxorNot => { + let rd = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if !ty.is_vector() { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None); + let alu_op = match op { + Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64), + Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64), + Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64), + Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64), + Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm)); + } else { + let alu_op = match op { + Opcode::Band => VecALUOp::And, + Opcode::BandNot => VecALUOp::Bic, + Opcode::Bor => VecALUOp::Orr, + Opcode::Bxor => VecALUOp::Eor, + _ => unreachable!(), + }; + + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { + let ty = ty.unwrap(); + let rd = get_output_reg(ctx, outputs[0]); + if !ty.is_vector() { + let size = OperandSize::from_bits(ty_bits(ty)); + let narrow_mode = match (op, size) { + (Opcode::Ishl, _) => NarrowValueMode::None, + (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64, + (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32, + (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64, + (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32, + _ => unreachable!(), + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty)); + let alu_op = match op { + Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, 
ALUOp::Lsl64), + Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm)); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let size = VectorSize::from_ty(ty); + let (alu_op, is_right_shift) = match op { + Opcode::Ishl => (VecALUOp::Sshl, false), + Opcode::Ushr => (VecALUOp::Ushl, true), + Opcode::Sshr => (VecALUOp::Sshl, true), + _ => unreachable!(), + }; + + let rm = if is_right_shift { + // Right shifts are implemented with a negative left shift. + let tmp = ctx.alloc_tmp(RegClass::I64, I32); + let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let rn = zero_reg(); + ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp, rn, rm)); + tmp.to_reg() + } else { + put_input_in_reg(ctx, inputs[1], NarrowValueMode::None) + }; + + ctx.emit(Inst::VecDup { rd, rn: rm, size }); + + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm: rd.to_reg(), + size, + }); + } + } + + Opcode::Rotr | Opcode::Rotl => { + // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is + // effectively a right rotation of N - K places, if N is the integer's bit size. We + // implement left rotations with this trick. + // + // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly. + // + // For a < 32-bit rotate-right, we synthesize this as: + // + // rotr rd, rn, rm + // + // => + // + // zero-extend rn, <32-or-64> + // and tmp_masked_rm, rm, <bitwidth - 1> + // sub tmp1, tmp_masked_rm, <bitwidth> + // sub tmp1, zero, tmp1 ; neg + // lsr tmp2, rn, tmp_masked_rm + // lsl rd, rn, tmp1 + // orr rd, rd, tmp2 + // + // For a constant amount, we can instead do: + // + // zero-extend rn, <32-or-64> + // lsr tmp2, rn, #<shiftimm> + // lsl rd, rn, <bitwidth - shiftimm> + // orr rd, rd, tmp2 + + let is_rotl = op == Opcode::Rotl; + + let ty = ty.unwrap(); + let ty_bits_size = ty_bits(ty) as u8; + + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg( + ctx, + inputs[0], + if ty_bits_size <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }, + ); + let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty)); + + if ty_bits_size == 32 || ty_bits_size == 64 { + let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64); + match rm { + ResultRegImmShift::ImmShift(mut immshift) => { + if is_rotl { + immshift.imm = ty_bits_size.wrapping_sub(immshift.value()); + } + immshift.imm &= ty_bits_size - 1; + ctx.emit(Inst::AluRRImmShift { + alu_op, + rd, + rn, + immshift, + }); + } + + ResultRegImmShift::Reg(rm) => { + let rm = if is_rotl { + // Really ty_bits_size - rn, but the upper bits of the result are + // ignored (because of the implicit masking done by the instruction), + // so this is equivalent to negating the input. + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::AluRRR { + alu_op, + rd: tmp, + rn: zero_reg(), + rm, + }); + tmp.to_reg() + } else { + rm + }; + ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm }); + } + } + } else { + debug_assert!(ty_bits_size < 32); + + match rm { + ResultRegImmShift::Reg(reg) => { + let reg = if is_rotl { + // Really ty_bits_size - rn, but the upper bits of the result are + // ignored (because of the implicit masking done by the instruction), + // so this is equivalent to negating the input. 
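+ // As an illustrative example of the narrow rotate computed by the rest
+ // of this sequence: an n-bit rotate-right of the zero-extended value x
+ // by the masked amount k is
+ //
+ //     (x >> k) | (x << (n - k))
+ //
+ // evaluated in a 32-bit register, so only the low n bits of the result
+ // are meaningful. For n = 8, x = 0x81 and k = 1 this leaves 0xc0 in the
+ // low byte.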
+ let tmp = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp, + rn: zero_reg(), + rm: reg, + }); + tmp.to_reg() + } else { + reg + }; + + // Explicitly mask the rotation count. + let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And32, + rd: tmp_masked_rm, + rn: reg, + imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(), + }); + let tmp_masked_rm = tmp_masked_rm.to_reg(); + + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: tmp_masked_rm, + imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: zero_reg(), + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: tmp2, + rn, + rm: tmp_masked_rm, + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd, + rn, + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd, + rn: rd.to_reg(), + rm: tmp2.to_reg(), + }); + } + + ResultRegImmShift::ImmShift(mut immshift) => { + if is_rotl { + immshift.imm = ty_bits_size.wrapping_sub(immshift.value()); + } + immshift.imm &= ty_bits_size - 1; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: tmp1, + rn, + immshift: immshift.clone(), + }); + + let amount = immshift.value() & (ty_bits_size - 1); + let opp_shift = + ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap(); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd, + rn, + immshift: opp_shift, + }); + + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd, + rn: rd.to_reg(), + rm: tmp1.to_reg(), + }); + } + } + } + } + + Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => { + let rd = get_output_reg(ctx, outputs[0]); + let needs_zext = match op { + Opcode::Bitrev | Opcode::Ctz => false, + Opcode::Clz | Opcode::Cls => true, + _ => unreachable!(), + }; + let ty = ty.unwrap(); + let narrow_mode = if needs_zext && ty_bits(ty) == 64 { + NarrowValueMode::ZeroExtend64 + } else if needs_zext { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::None + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let op_ty = match ty { + I8 | I16 | I32 => I32, + I64 => I64, + _ => panic!("Unsupported type for Bitrev/Clz/Cls"), + }; + let bitop = match op { + Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)), + Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)), + _ => unreachable!(), + }; + ctx.emit(Inst::BitRR { rd, rn, op: bitop }); + + // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem + // to a clz, and bitrev as the main operation. + if op == Opcode::Bitrev || op == Opcode::Ctz { + // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place + // the reversed result in the highest n bits, so we need to shift them down into + // place. 
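+ // Illustrative example: rbit of the i8 value 0x01 in a 32-bit register
+ // produces 0x8000_0000; the lsr #24 selected below moves that down to
+ // 0x80, which is the correct 8-bit reversal.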
+ let right_shift = match ty { + I8 => Some(24), + I16 => Some(16), + I32 => None, + I64 => None, + _ => panic!("Unsupported type for Bitrev"), + }; + if let Some(s) = right_shift { + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(s).unwrap(), + }); + } + } + + if op == Opcode::Ctz { + ctx.emit(Inst::BitRR { + op: BitOp::from((Opcode::Clz, op_ty)), + rd, + rn: rd.to_reg(), + }); + } + } + + Opcode::Popcnt => { + // Lower popcount using the following algorithm: + // + // x -= (x >> 1) & 0x5555555555555555 + // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333) + // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f + // x += x << 8 + // x += x << 16 + // x += x << 32 + // x >> 56 + let ty = ty.unwrap(); + let rd = get_output_reg(ctx, outputs[0]); + // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits, + // and fix the sequence below to work properly for this. + let narrow_mode = NarrowValueMode::ZeroExtend64; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + + // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then + // the rest of the code is identical to the 64-bit version. + // lsr [wx]d, [wx]n, #1 + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + immshift: ImmShift::maybe_from_u64(1).unwrap(), + }); + + // and xd, xd, #0x5555555555555555 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(), + }); + + // sub xd, xn, xd + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: rd, + rn: rn, + rm: rd.to_reg(), + }); + + // and xt, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // lsr xd, xd, #2 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(2).unwrap(), + }); + + // and xd, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // add xt, xd, xt + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: tmp, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + + // add xt, xt, xt LSR #4 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSR, + ShiftOpShiftImm::maybe_from_shift(4).unwrap(), + ), + }); + + // and xt, xt, #0x0f0f0f0f0f0f0f0f + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: tmp.to_reg(), + imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(), + }); + + // add xt, xt, xt, LSL #8 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(8).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #16 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(16).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #32 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: 
ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(32).unwrap(), + ), + }); + + // lsr xd, xt, #56 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: tmp.to_reg(), + immshift: ImmShift::maybe_from_u64(56).unwrap(), + }); + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex + | Opcode::Sload8x8 + | Opcode::Uload8x8 + | Opcode::Sload16x4 + | Opcode::Uload16x4 + | Opcode::Sload32x2 + | Opcode::Uload32x2 => { + let off = ctx.data(insn).load_store_offset().unwrap(); + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + Opcode::Sload8x8 | Opcode::Uload8x8 => I8X8, + Opcode::Sload16x4 | Opcode::Uload16x4 => I16X4, + Opcode::Sload32x2 | Opcode::Uload32x2 => I32X2, + _ => unreachable!(), + }; + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + let is_float = ty_has_float_or_vec_representation(elem_ty); + + let mem = lower_address(ctx, elem_ty, &inputs[..], off); + let rd = get_output_reg(ctx, outputs[0]); + let flags = ctx + .memflags(insn) + .expect("Load instruction should have memflags"); + + ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { + (1, _, _) => Inst::ULoad8 { rd, mem, flags }, + (8, false, _) => Inst::ULoad8 { rd, mem, flags }, + (8, true, _) => Inst::SLoad8 { rd, mem, flags }, + (16, false, _) => Inst::ULoad16 { rd, mem, flags }, + (16, true, _) => Inst::SLoad16 { rd, mem, flags }, + (32, false, false) => Inst::ULoad32 { rd, mem, flags }, + (32, true, false) => Inst::SLoad32 { rd, mem, flags }, + (32, _, true) => Inst::FpuLoad32 { rd, mem, flags }, + (64, _, false) => Inst::ULoad64 { rd, mem, flags }, + // Note that we treat some of the vector loads as scalar floating-point loads, + // which is correct in a little endian environment. 
+ (64, _, true) => Inst::FpuLoad64 { rd, mem, flags }, + (128, _, _) => Inst::FpuLoad128 { rd, mem, flags }, + _ => panic!("Unsupported size in load"), + }); + + let vec_extend = match op { + Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), + Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), + Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), + Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), + Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), + Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), + _ => None, + }; + + if let Some(t) = vec_extend { + ctx.emit(Inst::VecExtend { + t, + rd, + rn: rd.to_reg(), + high_half: false, + }); + } + } + + Opcode::LoadSplat => { + let off = ctx.data(insn).load_store_offset().unwrap(); + let ty = ty.unwrap(); + let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off); + let rd = get_output_reg(ctx, outputs[0]); + let size = VectorSize::from_ty(ty); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + + ctx.emit(Inst::LoadAddr { rd: tmp, mem }); + ctx.emit(Inst::VecLoadReplicate { + rd, + rn: tmp.to_reg(), + size, + }); + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let off = ctx.data(insn).load_store_offset().unwrap(); + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => I8, + Opcode::Istore16 | Opcode::Istore16Complex => I16, + Opcode::Istore32 | Opcode::Istore32Complex => I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + let is_float = ty_has_float_or_vec_representation(elem_ty); + let flags = ctx + .memflags(insn) + .expect("Store instruction should have memflags"); + + let mem = lower_address(ctx, elem_ty, &inputs[1..], off); + let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + + ctx.emit(match (ty_bits(elem_ty), is_float) { + (1, _) | (8, _) => Inst::Store8 { rd, mem, flags }, + (16, _) => Inst::Store16 { rd, mem, flags }, + (32, false) => Inst::Store32 { rd, mem, flags }, + (32, true) => Inst::FpuStore32 { rd, mem, flags }, + (64, false) => Inst::Store64 { rd, mem, flags }, + (64, true) => Inst::FpuStore64 { rd, mem, flags }, + (128, _) => Inst::FpuStore128 { rd, mem, flags }, + _ => panic!("Unsupported size in store"), + }); + } + + Opcode::StackAddr => { + let (stack_slot, offset) = match *ctx.data(insn) { + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot, + offset, + } => (stack_slot, offset), + _ => unreachable!(), + }; + let rd = get_output_reg(ctx, outputs[0]); + let offset: i32 = offset.into(); + let inst = ctx + .abi() + .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd); + ctx.emit(inst); + } + + Opcode::AtomicRmw => { + let r_dst = get_output_reg(ctx, outputs[0]); + let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + // Make sure that both args are in virtual regs, since in effect + // we have to do a parallel copy to get them safely to the AtomicRMW input + // regs, and that's not guaranteed safe if either is in a real reg. 
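+ // Sketch of the fixed-register convention relied on here (see the moves
+ // that follow): x25 carries the address, x26 the second operand, and the
+ // expansion leaves its result in x27, with x24 and x28 used as scratch.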
+ r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_arg2 = ctx.ensure_in_vreg(r_arg2, I64); + // Move the args to the preordained AtomicRMW input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64)); + // Now the AtomicRMW insn itself + let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap()); + ctx.emit(Inst::AtomicRMW { ty: ty_access, op }); + // And finally, copy the preordained AtomicRMW output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the AtomicRmw case. Note + // that the AtomicCAS sequence does its own masking, so we don't need to worry + // about zero-extending narrow (I8/I16/I32) values here. + let r_dst = get_output_reg(ctx, outputs[0]); + let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + // Make sure that all three args are in virtual regs. See corresponding comment + // for `Opcode::AtomicRmw` above. + r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_expected = ctx.ensure_in_vreg(r_expected, I64); + r_replacement = ctx.ensure_in_vreg(r_replacement, I64); + // Move the args to the preordained AtomicCAS input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(26)), + r_expected, + I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(28)), + r_replacement, + I64, + )); + // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop + ctx.emit(Inst::AtomicCAS { ty: ty_access }); + // And finally, copy the preordained AtomicCAS output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. + } + + Opcode::AtomicLoad => { + let r_data = get_output_reg(ctx, outputs[0]); + let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + ctx.emit(Inst::AtomicLoad { + ty: ty_access, + r_data, + r_addr, + }); + } + + Opcode::AtomicStore => { + let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + ctx.emit(Inst::AtomicStore { + ty: ty_access, + r_data, + r_addr, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence {}); + } + + Opcode::StackLoad | Opcode::StackStore => { + panic!("Direct stack memory access not supported; should not be used by Wasm"); + } + + Opcode::HeapAddr => { + panic!("heap_addr should have been removed by legalization!"); + } + + Opcode::TableAddr => { + panic!("table_addr should have been removed by legalization!"); + } + + Opcode::ConstAddr => unimplemented!(), + + Opcode::Nop => { + // Nothing. 
+ } + + Opcode::Select => { + let flag_input = inputs[0]; + let cond = if let Some(icmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint) + { + let condcode = ctx.data(icmp_insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed); + cond + } else if let Some(fcmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint) + { + let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn); + cond + } else { + let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 { + (ALUOp::SubS64, NarrowValueMode::ZeroExtend64) + } else { + (ALUOp::SubS32, NarrowValueMode::ZeroExtend32) + }; + + let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode); + // cmp rcond, #0 + ctx.emit(Inst::AluRRR { + alu_op: cmp_op, + rd: writable_zero_reg(), + rn: rcond, + rm: zero_reg(), + }); + Cond::Ne + }; + + // csel.cond rd, rn, rm + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + let is_float = ty_has_float_or_vec_representation(ty); + if is_float && bits == 32 { + ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }); + } else if is_float && bits == 64 { + ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }); + } else if is_float && bits == 128 { + ctx.emit(Inst::VecCSel { cond, rd, rn, rm }); + } else { + ctx.emit(Inst::CSel { cond, rd, rn, rm }); + } + } + + Opcode::Selectif | Opcode::SelectifSpectreGuard => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. 
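+ // Illustrative summary: the single-def ifcmp input is re-lowered just
+ // below to set the flags, and the conditional select then picks
+ // inputs[1] when the condition holds and inputs[2] otherwise.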
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + + // csel.COND rd, rn, rm + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + let is_float = ty_has_float_or_vec_representation(ty); + if is_float && bits == 32 { + ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }); + } else if is_float && bits == 64 { + ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }); + } else { + ctx.emit(Inst::CSel { cond, rd, rn, rm }); + } + } + + Opcode::Bitselect | Opcode::Vselect => { + let ty = ty.unwrap(); + if !ty.is_vector() { + debug_assert_ne!(Opcode::Vselect, op); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + let rd = get_output_reg(ctx, outputs[0]); + let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + // AND rTmp, rn, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::And64, + rd: tmp, + rn, + rm: rcond, + }); + // BIC rd, rm, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd, + rn: rm, + rm: rcond, + }); + // ORR rd, rd, rTmp + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + } else { + let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(rd, rcond, ty)); + + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Bsl, + rd, + rn, + rm, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Trueif => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + normalize_bool_result(ctx, insn, rd); + } + + Opcode::Trueff => { + let condcode = ctx.data(insn).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + normalize_bool_result(ctx, insn, rd); + } + + Opcode::IsNull | Opcode::IsInvalid => { + // Null references are represented by the constant value 0; invalid references are + // represented by the constant value -1. See `define_reftypes()` in + // `meta/src/isa/x86/encodings.rs` to confirm. 
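+ // Worked check (illustrative): for is_invalid, "cmn rn, #1" computes
+ // rn + 1 and sets Z exactly when rn == -1; for is_null, "cmp rn, #0"
+ // sets Z exactly when rn == 0. The cset.eq below therefore yields true
+ // precisely for the sentinel value in question.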
+ let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + let (alu_op, const_value) = match op { + Opcode::IsNull => { + // cmp rn, #0 + (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0) + } + Opcode::IsInvalid => { + // cmn rn, #1 + (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1) + } + _ => unreachable!(), + }; + let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap()); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value)); + ctx.emit(Inst::CSet { rd, cond: Cond::Eq }); + normalize_bool_result(ctx, insn, rd); + } + + Opcode::Copy => { + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Breduce | Opcode::Ireduce => { + // Smaller integers/booleans are stored with high-order bits + // undefined, so we can simply do a copy. + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Bextend | Opcode::Bmask => { + // Bextend and Bmask both simply sign-extend. This works for: + // - Bextend, because booleans are stored as 0 / -1, so we + // sign-extend the -1 to a -1 in the wider width. + // - Bmask, because the resulting integer mask value must be + // all-ones (-1) if the argument is true. + // + // For a sign-extension from a 1-bit value (Case 1 below), we need + // to do things a bit specially, because the ISA does not have a + // 1-to-N-bit sign extension instruction. For 8-bit or wider + // sources (Case 2 below), we do a sign extension normally. + + let from_ty = ctx.input_ty(insn, 0); + let to_ty = ctx.output_ty(insn, 0); + let from_bits = ty_bits(from_ty); + let to_bits = ty_bits(to_ty); + + assert!( + from_bits <= 64 && to_bits <= 64, + "Vector Bextend not supported yet" + ); + assert!(from_bits <= to_bits); + + if from_bits == to_bits { + // Nothing. + } else if from_bits == 1 { + assert!(to_bits >= 8); + // Case 1: 1-bit to N-bit extension: AND the LSB of source into + // dest, generating a value of 0 or 1, then negate to get + // 0x000... or 0xfff... + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + // AND Rdest, Rsource, #1 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd, + rn, + imml: ImmLogic::maybe_from_u64(1, I64).unwrap(), + }); + // SUB Rdest, XZR, Rdest (i.e., NEG Rdest) + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm: rd.to_reg(), + }); + } else { + // Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A + // `true` (all ones, or `-1`) will be extended to -1 with the + // larger width. + assert!(from_bits >= 8); + let narrow_mode = if to_bits == 64 { + NarrowValueMode::SignExtend64 + } else { + assert!(to_bits <= 32); + NarrowValueMode::SignExtend32 + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(rd, rn, to_ty)); + } + } + + Opcode::Bint => { + // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND + // out the LSB to give a 0 / 1-valued integer result. 
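+ // For example (illustrative): a true value of all-ones ANDed with 1
+ // yields 1, and a false value of 0 yields 0, which is exactly the
+ // integer result bint requires.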
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + let output_bits = ty_bits(ctx.output_ty(insn, 0)); + + let (imm_ty, alu_op) = if output_bits > 32 { + (I64, ALUOp::And64) + } else { + (I32, ALUOp::And32) + }; + ctx.emit(Inst::AluRRImmLogic { + alu_op, + rd, + rn, + imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(), + }); + } + + Opcode::Bitcast => { + let rd = get_output_reg(ctx, outputs[0]); + let ity = ctx.input_ty(insn, 0); + let oty = ctx.output_ty(insn, 0); + let ity_vec_reg = ty_has_float_or_vec_representation(ity); + let oty_vec_reg = ty_has_float_or_vec_representation(oty); + match (ity_vec_reg, oty_vec_reg) { + (true, true) => { + let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rm = put_input_in_reg(ctx, inputs[0], narrow_mode); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, false) => { + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, true) => { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::MovToFpu { + rd, + rn, + size: ScalarSize::Size64, + }); + } + (true, false) => { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::MovFromVec { + rd, + rn, + idx: 0, + size: VectorSize::Size64x2, + }); + } + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for (i, input) in inputs.iter().enumerate() { + // N.B.: according to the AArch64 ABI, the top bits of a register + // (above the bits for the value's type) are undefined, so we + // need not extend the return values. + let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Ifcmp | Opcode::Ffcmp => { + // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff + // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from + // the same block, or a dominating block. In other words, it cannot pass through a BB + // param (phi). The flags pass of the verifier will ensure this. 
+ panic!("Should never reach ifcmp as isel root!"); + } + + Opcode::Icmp => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let rd = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + + if !ty.is_vector() { + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); + ctx.emit(Inst::CSet { cond, rd }); + normalize_bool_result(ctx, insn, rd); + } else { + let rm = put_input_in_reg(ctx, inputs[1], narrow_mode); + lower_vector_compare(ctx, rd, rn, rm, ty, cond)?; + } + } + + Opcode::Fcmp => { + let condcode = ctx.data(insn).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + let ty = ctx.input_ty(insn, 0); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + + if !ty.is_vector() { + match ty_bits(ty) { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Bad float size"), + } + ctx.emit(Inst::CSet { cond, rd }); + normalize_bool_result(ctx, insn, rd); + } else { + lower_vector_compare(ctx, rd, rn, rm, ty, cond)?; + } + } + + Opcode::JumpTableEntry | Opcode::JumpTableBase => { + panic!("Should not appear: we handle BrTable directly"); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Brk); + } + + Opcode::Trap | Opcode::ResumableTrap => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + ctx.emit_safepoint(Inst::Udf { trap_code }); + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + + let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() { + let condcode = ctx.data(insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + // The flags must not have been clobbered by any other + // instruction between the iadd_ifcout and this instruction, as + // verified by the CLIF validator; so we can simply use the + // flags here. + cond + } else if op == Opcode::Trapif { + let condcode = ctx.data(insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + cond + } else { + let condcode = ctx.data(insn).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + + // Verification ensures that the input is always a + // single-def ffcmp. 
+ let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + cond + }; + + ctx.emit_safepoint(Inst::TrapIf { + trap_code, + kind: CondBrKind::Cond(cond), + }); + } + + Opcode::Safepoint => { + panic!("safepoint instructions not used by new backend's safepoints!"); + } + + Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => { + panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!"); + } + + Opcode::FuncAddr => { + let rd = get_output_reg(ctx, outputs[0]); + let (extname, _) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + rd, + name: Box::new(extname), + offset: 0, + }); + } + + Opcode::GlobalValue => { + panic!("global_value should have been removed by legalization!"); + } + + Opcode::SymbolValue => { + let rd = get_output_reg(ctx, outputs[0]); + let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + rd, + name: Box::new(extname), + offset, + }); + } + + Opcode::Call | Opcode::CallIndirect => { + let caller_conv = ctx.abi().call_conv(); + let (mut abi, inputs) = match op { + Opcode::Call => { + let (extname, dist) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + ( + AArch64ABICaller::from_func(sig, &extname, dist, caller_conv)?, + &inputs[..], + ) + } + Opcode::CallIndirect => { + let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() - 1 == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + ( + AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv)?, + &inputs[1..], + ) + } + _ => unreachable!(), + }; + + abi.emit_stack_pre_adjust(ctx); + assert!(inputs.len() == abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input, NarrowValueMode::None); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + } + + Opcode::GetPinnedReg => { + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov(rd, xreg(PINNED_REG))); + } + + Opcode::SetPinnedReg => { + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm)); + } + + Opcode::Spill + | Opcode::Fill + | Opcode::FillNop + | Opcode::Regmove + | Opcode::CopySpecial + | Opcode::CopyToSsa + | Opcode::CopyNop + | Opcode::AdjustSpDown + | Opcode::AdjustSpUpImm + | Opcode::AdjustSpDownImm + | Opcode::IfcmpSp + | Opcode::Regspill + | Opcode::Regfill => { + panic!("Unused opcode should not be encountered."); + } + + Opcode::Jump + | Opcode::Fallthrough + | Opcode::Brz + | Opcode::Brnz + | Opcode::BrIcmp + | Opcode::Brif + | Opcode::Brff + | Opcode::IndirectJumpTableBr + | Opcode::BrTable => { + panic!("Branch opcode reached non-branch lowering logic!"); + } + + Opcode::Vconst => { + let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes"); + let rd = get_output_reg(ctx, outputs[0]); + lower_constant_f128(ctx, rd, value); + } + + Opcode::RawBitcast => { + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = 
get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rm, ty)); + } + + Opcode::Extractlane => { + if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { + let idx = *imm; + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let size = VectorSize::from_ty(ctx.input_ty(insn, 0)); + let ty = ty.unwrap(); + + if ty_has_int_representation(ty) { + ctx.emit(Inst::MovFromVec { rd, rn, idx, size }); + // Plain moves are faster on some processors. + } else if idx == 0 { + ctx.emit(Inst::gen_move(rd, rn, ty)); + } else { + ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size }); + } + } else { + unreachable!(); + } + } + + Opcode::Insertlane => { + let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + let input_ty = ctx.input_ty(insn, 1); + let rd = get_output_reg(ctx, outputs[0]); + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let size = VectorSize::from_ty(ty); + + ctx.emit(Inst::gen_move(rd, rm, ty)); + + if ty_has_int_representation(input_ty) { + ctx.emit(Inst::MovToVec { rd, rn, idx, size }); + } else { + ctx.emit(Inst::VecMovElement { + rd, + rn, + dest_idx: idx, + src_idx: 0, + size, + }); + } + } + + Opcode::Splat => { + let rd = get_output_reg(ctx, outputs[0]); + let size = VectorSize::from_ty(ty.unwrap()); + + if let Some((_, insn)) = maybe_input_insn_multi( + ctx, + inputs[0], + &[ + Opcode::Bconst, + Opcode::F32const, + Opcode::F64const, + Opcode::Iconst, + ], + ) { + lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); + } else if let Some(insn) = + maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce) + { + lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); + } else if let Some(insn) = + maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce) + { + lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size); + } else { + let input_ty = ctx.input_ty(insn, 0); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let inst = if ty_has_int_representation(input_ty) { + Inst::VecDup { rd, rn, size } + } else { + Inst::VecDupFromFpu { rd, rn, size } + }; + + ctx.emit(inst); + } + } + + Opcode::ScalarToVector => { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + let input_ty = ctx.input_ty(insn, 0); + if (input_ty == I32 && ty.unwrap() == I32X4) + || (input_ty == I64 && ty.unwrap() == I64X2) + { + ctx.emit(Inst::MovToFpu { + rd, + rn, + size: ScalarSize::from_ty(input_ty), + }); + } else { + return Err(CodegenError::Unsupported(format!( + "ScalarToVector: unsupported types {:?} -> {:?}", + input_ty, ty + ))); + } + } + + Opcode::VanyTrue | Opcode::VallTrue => { + let rd = get_output_reg(ctx, outputs[0]); + let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap()); + + // This operation is implemented by using umaxp or uminv to + // create a scalar value, which is then compared against zero. 
+ // + // umaxp vn.16b, vm.16, vm.16 / uminv bn, vm.16b + // mov xm, vn.d[0] + // cmp xm, #0 + // cset xm, ne + + let size = VectorSize::from_ty(ctx.input_ty(insn, 0)); + + if op == Opcode::VanyTrue { + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Umaxp, + rd: tmp, + rn: rm, + rm: rm, + size, + }); + } else { + ctx.emit(Inst::VecLanes { + op: VecLanesOp::Uminv, + rd: tmp, + rn: rm, + size, + }); + }; + + ctx.emit(Inst::MovFromVec { + rd, + rn: tmp.to_reg(), + idx: 0, + size: VectorSize::Size64x2, + }); + + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::SubS64, + rd: writable_zero_reg(), + rn: rd.to_reg(), + imm12: Imm12::zero(), + }); + + ctx.emit(Inst::CSet { rd, cond: Cond::Ne }); + normalize_bool_result(ctx, insn, rd); + } + + Opcode::VhighBits => { + let dst_r = get_output_reg(ctx, outputs[0]); + let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + // All three sequences use one integer temporary and two vector temporaries. The + // shift is done early so as to give the register allocator the possibility of using + // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of + // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and + // derivation of these sequences. Alternative sequences are discussed in + // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not + // used here. + // Also .. FIXME: when https://github.com/bytecodealliance/wasmtime/pull/2310 is + // merged, use `lower_splat_constant` instead to generate the constants. + let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64); + let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16); + let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16); + match ty { + I8X16 => { + // sshr tmp_v1.16b, src_v.16b, #7 + // mov tmp_r0, #0x0201 + // movk tmp_r0, #0x0804, lsl 16 + // movk tmp_r0, #0x2010, lsl 32 + // movk tmp_r0, #0x8040, lsl 48 + // dup tmp_v0.2d, tmp_r0 + // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b + // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8 + // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b + // addv tmp_v0h, tmp_v0.8h + // mov dst_r, tmp_v0.h[0] + ctx.emit(Inst::VecShiftImm { + op: VecShiftImmOp::Sshr, + rd: tmp_v1, + rn: src_v, + size: VectorSize::Size8x16, + imm: 7, + }); + lower_constant_u64(ctx, tmp_r0, 0x8040201008040201u64); + ctx.emit(Inst::VecDup { + rd: tmp_v0, + rn: tmp_r0.to_reg(), + size: VectorSize::Size64x2, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::And, + rd: tmp_v1, + rn: tmp_v1.to_reg(), + rm: tmp_v0.to_reg(), + size: VectorSize::Size8x16, + }); + ctx.emit(Inst::VecExtract { + rd: tmp_v0, + rn: tmp_v1.to_reg(), + rm: tmp_v1.to_reg(), + imm4: 8, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Zip1, + rd: tmp_v0, + rn: tmp_v1.to_reg(), + rm: tmp_v0.to_reg(), + size: VectorSize::Size8x16, + }); + ctx.emit(Inst::VecLanes { + op: VecLanesOp::Addv, + rd: tmp_v0, + rn: tmp_v0.to_reg(), + size: VectorSize::Size16x8, + }); + ctx.emit(Inst::MovFromVec { + rd: dst_r, + rn: tmp_v0.to_reg(), + idx: 0, + size: VectorSize::Size16x8, + }); + } + I16X8 => { + // sshr tmp_v1.8h, src_v.8h, #15 + // mov tmp_r0, #0x1 + // movk tmp_r0, #0x2, lsl 16 + // movk tmp_r0, #0x4, lsl 32 + // movk tmp_r0, #0x8, lsl 48 + // dup tmp_v0.2d, tmp_r0 + // shl tmp_r0, tmp_r0, #4 + // mov tmp_v0.d[1], tmp_r0 + // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b + // addv tmp_v0h, tmp_v0.8h + // mov dst_r, tmp_v0.h[0] + ctx.emit(Inst::VecShiftImm { + op: VecShiftImmOp::Sshr, + rd: tmp_v1, + rn: src_v, + size: VectorSize::Size16x8, + imm: 
15, + }); + lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64); + ctx.emit(Inst::VecDup { + rd: tmp_v0, + rn: tmp_r0.to_reg(), + size: VectorSize::Size64x2, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: tmp_r0, + rn: tmp_r0.to_reg(), + immshift: ImmShift { imm: 4 }, + }); + ctx.emit(Inst::MovToVec { + rd: tmp_v0, + rn: tmp_r0.to_reg(), + idx: 1, + size: VectorSize::Size64x2, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::And, + rd: tmp_v0, + rn: tmp_v1.to_reg(), + rm: tmp_v0.to_reg(), + size: VectorSize::Size8x16, + }); + ctx.emit(Inst::VecLanes { + op: VecLanesOp::Addv, + rd: tmp_v0, + rn: tmp_v0.to_reg(), + size: VectorSize::Size16x8, + }); + ctx.emit(Inst::MovFromVec { + rd: dst_r, + rn: tmp_v0.to_reg(), + idx: 0, + size: VectorSize::Size16x8, + }); + } + I32X4 => { + // sshr tmp_v1.4s, src_v.4s, #31 + // mov tmp_r0, #0x1 + // movk tmp_r0, #0x2, lsl 32 + // dup tmp_v0.2d, tmp_r0 + // shl tmp_r0, tmp_r0, #2 + // mov tmp_v0.d[1], tmp_r0 + // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b + // addv tmp_v0s, tmp_v0.4s + // mov dst_r, tmp_v0.s[0] + ctx.emit(Inst::VecShiftImm { + op: VecShiftImmOp::Sshr, + rd: tmp_v1, + rn: src_v, + size: VectorSize::Size32x4, + imm: 31, + }); + lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64); + ctx.emit(Inst::VecDup { + rd: tmp_v0, + rn: tmp_r0.to_reg(), + size: VectorSize::Size64x2, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: tmp_r0, + rn: tmp_r0.to_reg(), + immshift: ImmShift { imm: 2 }, + }); + ctx.emit(Inst::MovToVec { + rd: tmp_v0, + rn: tmp_r0.to_reg(), + idx: 1, + size: VectorSize::Size64x2, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::And, + rd: tmp_v0, + rn: tmp_v1.to_reg(), + rm: tmp_v0.to_reg(), + size: VectorSize::Size8x16, + }); + ctx.emit(Inst::VecLanes { + op: VecLanesOp::Addv, + rd: tmp_v0, + rn: tmp_v0.to_reg(), + size: VectorSize::Size32x4, + }); + ctx.emit(Inst::MovFromVec { + rd: dst_r, + rn: tmp_v0.to_reg(), + idx: 0, + size: VectorSize::Size32x4, + }); + } + _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty), + } + } + + Opcode::Shuffle => { + let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes"); + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + // 2 register table vector lookups require consecutive table registers; + // we satisfy this constraint by hardcoding the usage of v29 and v30. + let temp = writable_vreg(29); + let temp2 = writable_vreg(30); + let input_ty = ctx.input_ty(insn, 0); + assert_eq!(input_ty, ctx.input_ty(insn, 1)); + // Make sure that both inputs are in virtual registers, since it is + // not guaranteed that we can get them safely to the temporaries if + // either is in a real register. 
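+ // Sketch of the emitted sequence (illustrative; the constant
+ // materialization is abbreviated):
+ //
+ //     <materialize mask into rd>
+ //     mov  v29.16b, rn.16b
+ //     mov  v30.16b, rn2.16b
+ //     tbl  rd.16b, { v29.16b, v30.16b }, rd.16b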
+ let rn = ctx.ensure_in_vreg(rn, input_ty); + let rn2 = ctx.ensure_in_vreg(rn2, input_ty); + + lower_constant_f128(ctx, rd, mask); + ctx.emit(Inst::gen_move(temp, rn, input_ty)); + ctx.emit(Inst::gen_move(temp2, rn2, input_ty)); + ctx.emit(Inst::VecTbl2 { + rd, + rn: temp.to_reg(), + rn2: temp2.to_reg(), + rm: rd.to_reg(), + is_extension: false, + }); + } + + Opcode::Swizzle => { + let rd = get_output_reg(ctx, outputs[0]); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + + ctx.emit(Inst::VecTbl { + rd, + rn, + rm, + is_extension: false, + }); + } + + Opcode::Vsplit + | Opcode::Vconcat + | Opcode::Uload8x8Complex + | Opcode::Sload8x8Complex + | Opcode::Uload16x4Complex + | Opcode::Sload16x4Complex + | Opcode::Uload32x2Complex + | Opcode::Sload32x2Complex => { + // TODO + panic!("Vector ops not implemented."); + } + + Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."), + + Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => { + let alu_op = match op { + Opcode::Umin => VecALUOp::Umin, + Opcode::Imin => VecALUOp::Smin, + Opcode::Umax => VecALUOp::Umax, + Opcode::Imax => VecALUOp::Smax, + _ => unreachable!(), + }; + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm, + size: VectorSize::from_ty(ty), + }); + } + + Opcode::WideningPairwiseDotProductS => { + let r_y = get_output_reg(ctx, outputs[0]); + let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + if ty == I32X4 { + let tmp = ctx.alloc_tmp(RegClass::V128, I8X16); + // The args have type I16X8. 
+ // "y = i32x4.dot_i16x8_s(a, b)" + // => smull tmp, a, b + // smull2 y, a, b + // addp y, tmp, y + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Smull, + rd: tmp, + rn: r_a, + rm: r_b, + size: VectorSize::Size16x8, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Smull2, + rd: r_y, + rn: r_a, + rm: r_b, + size: VectorSize::Size16x8, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Addp, + rd: r_y, + rn: tmp.to_reg(), + rm: r_y.to_reg(), + size: VectorSize::Size32x4, + }); + } else { + return Err(CodegenError::Unsupported(format!( + "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}", + ty + ))); + } + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => { + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + if !ty.is_vector() { + let fpu_op = match (op, bits) { + (Opcode::Fadd, 32) => FPUOp2::Add32, + (Opcode::Fadd, 64) => FPUOp2::Add64, + (Opcode::Fsub, 32) => FPUOp2::Sub32, + (Opcode::Fsub, 64) => FPUOp2::Sub64, + (Opcode::Fmul, 32) => FPUOp2::Mul32, + (Opcode::Fmul, 64) => FPUOp2::Mul64, + (Opcode::Fdiv, 32) => FPUOp2::Div32, + (Opcode::Fdiv, 64) => FPUOp2::Div64, + (Opcode::Fmin, 32) => FPUOp2::Min32, + (Opcode::Fmin, 64) => FPUOp2::Min64, + (Opcode::Fmax, 32) => FPUOp2::Max32, + (Opcode::Fmax, 64) => FPUOp2::Max64, + _ => panic!("Unknown op/bits combination"), + }; + ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm }); + } else { + let alu_op = match op { + Opcode::Fadd => VecALUOp::Fadd, + Opcode::Fsub => VecALUOp::Fsub, + Opcode::Fdiv => VecALUOp::Fdiv, + Opcode::Fmax => VecALUOp::Fmax, + Opcode::Fmin => VecALUOp::Fmin, + Opcode::Fmul => VecALUOp::Fmul, + _ => unreachable!(), + }; + + ctx.emit(Inst::VecRRR { + rd, + rn, + rm, + alu_op, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::FminPseudo | Opcode::FmaxPseudo => { + let ty = ctx.input_ty(insn, 0); + if ty == F32X4 || ty == F64X2 { + // pmin(a,b) => bitsel(b, a, cmpgt(a, b)) + // pmax(a,b) => bitsel(b, a, cmpgt(b, a)) + let r_dst = get_output_reg(ctx, outputs[0]); + let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + // Since we're going to write the output register `r_dst` anyway, we might as + // well first use it to hold the comparison result. This has the slightly unusual + // effect that we modify the output register in the first instruction (`fcmgt`) + // but read both the inputs again in the second instruction (`bsl`), which means + // that the output register can't be either of the input registers. Regalloc + // should handle this correctly, nevertheless. 
+ ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Fcmgt, + rd: r_dst, + rn: if op == Opcode::FminPseudo { r_a } else { r_b }, + rm: if op == Opcode::FminPseudo { r_b } else { r_a }, + size: if ty == F32X4 { + VectorSize::Size32x4 + } else { + VectorSize::Size64x2 + }, + }); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Bsl, + rd: r_dst, + rn: r_b, + rm: r_a, + size: VectorSize::Size8x16, + }); + } else { + panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type"); + } + } + + Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => { + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + if !ty.is_vector() { + let fpu_op = match (op, bits) { + (Opcode::Sqrt, 32) => FPUOp1::Sqrt32, + (Opcode::Sqrt, 64) => FPUOp1::Sqrt64, + (Opcode::Fneg, 32) => FPUOp1::Neg32, + (Opcode::Fneg, 64) => FPUOp1::Neg64, + (Opcode::Fabs, 32) => FPUOp1::Abs32, + (Opcode::Fabs, 64) => FPUOp1::Abs64, + (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"), + (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64, + (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32, + (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"), + _ => panic!("Unknown op/bits combination"), + }; + ctx.emit(Inst::FpuRR { fpu_op, rd, rn }); + } else { + let op = match op { + Opcode::Fabs => VecMisc2::Fabs, + Opcode::Fneg => VecMisc2::Fneg, + Opcode::Sqrt => VecMisc2::Fsqrt, + _ => unimplemented!(), + }; + + ctx.emit(Inst::VecMisc { + op, + rd, + rn, + size: VectorSize::from_ty(ty), + }); + } + } + + Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => { + let ty = ctx.output_ty(insn, 0); + if !ty.is_vector() { + let bits = ty_bits(ty); + let op = match (op, bits) { + (Opcode::Ceil, 32) => FpuRoundMode::Plus32, + (Opcode::Ceil, 64) => FpuRoundMode::Plus64, + (Opcode::Floor, 32) => FpuRoundMode::Minus32, + (Opcode::Floor, 64) => FpuRoundMode::Minus64, + (Opcode::Trunc, 32) => FpuRoundMode::Zero32, + (Opcode::Trunc, 64) => FpuRoundMode::Zero64, + (Opcode::Nearest, 32) => FpuRoundMode::Nearest32, + (Opcode::Nearest, 64) => FpuRoundMode::Nearest64, + _ => panic!("Unknown op/bits combination (scalar)"), + }; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRound { op, rd, rn }); + } else { + let (op, size) = match (op, ty) { + (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4), + (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2), + (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4), + (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2), + (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4), + (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2), + (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4), + (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2), + _ => panic!("Unknown op/ty combination (vector){:?}", ty), + }; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::VecMisc { op, rd, rn, size }); + } + } + + Opcode::Fma => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match bits { + 32 => FPUOp3::MAdd32, + 64 => FPUOp3::MAdd64, + _ => panic!("Unknown op size"), + }; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let 
ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ ctx.emit(Inst::FpuRRRR {
+ fpu_op,
+ rn,
+ rm,
+ ra,
+ rd,
+ });
+ }
+
+ Opcode::Fcopysign => {
+ // Copy the sign bit from inputs[1] to inputs[0]. This is a scalar fcopysign,
+ // implemented with scalar NEON operations for 64-bit values and 2S vector
+ // operations for 32-bit values, using the following sequence:
+ //
+ // mov vd, vn
+ // ushr vtmp, vm, #63 / #31
+ // sli vd, vtmp, #63 / #31
+
+ let ty = ctx.output_ty(insn, 0);
+ let bits = ty_bits(ty) as u8;
+ assert!(bits == 32 || bits == 64);
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+ let tmp = ctx.alloc_tmp(RegClass::V128, F64);
+
+ // Copy LHS to rd.
+ ctx.emit(Inst::FpuMove64 { rd, rn });
+
+ // Copy the sign bit to the lowest bit in tmp.
+ let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+ ctx.emit(Inst::FpuRRI {
+ fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
+ rd: tmp,
+ rn: rm,
+ });
+
+ // Insert the bit from tmp into the sign bit of rd.
+ let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+ ctx.emit(Inst::FpuRRI {
+ fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
+ rd,
+ rn: tmp.to_reg(),
+ });
+ }
+
+ Opcode::FcvtToUint | Opcode::FcvtToSint => {
+ let in_bits = ty_bits(ctx.input_ty(insn, 0));
+ let out_bits = ty_bits(ctx.output_ty(insn, 0));
+ let signed = op == Opcode::FcvtToSint;
+ let op = match (signed, in_bits, out_bits) {
+ (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
+ (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
+ (false, 32, 64) => FpuToIntOp::F32ToU64,
+ (true, 32, 64) => FpuToIntOp::F32ToI64,
+ (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
+ (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
+ (false, 64, 64) => FpuToIntOp::F64ToU64,
+ (true, 64, 64) => FpuToIntOp::F64ToI64,
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+ let rd = get_output_reg(ctx, outputs[0]);
+
+ // First, check the input: per wasm semantics, the NaN check must be
+ // performed before the in-bounds check.
+
+ // Check that the input is not a NaN.
+ if in_bits == 32 {
+ ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
+ } else {
+ ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
+ }
+ let trap_code = TrapCode::BadConversionToInteger;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
+ });
+
+ let tmp = ctx.alloc_tmp(RegClass::V128, I128);
+
+ // Check that the input is in range, with "truncate towards zero" semantics. This means
+ // we allow values that are slightly out of range:
+ // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
+ // can be represented), and strictly less than INT_MAX+1 (when this can be
+ // represented).
+ // - for unsigned conversions, we allow values strictly greater than -1, and strictly
+ // less than UINT_MAX+1 (when this can be represented).
+
+ if in_bits == 32 {
+ // From float32.
+ let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
+ (true, 8) => (
+ i8::min_value() as f32 - 1.,
+ FloatCC::GreaterThan,
+ i8::max_value() as f32 + 1.,
+ ),
+ (true, 16) => (
+ i16::min_value() as f32 - 1.,
+ FloatCC::GreaterThan,
+ i16::max_value() as f32 + 1.,
+ ),
+ (true, 32) => (
+ i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
+ FloatCC::GreaterThanOrEqual,
+ i32::max_value() as f32 + 1.,
+ ),
+ (true, 64) => (
+ i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
+ FloatCC::GreaterThanOrEqual,
+ i64::max_value() as f32 + 1.,
+ ),
+ (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
+ (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
+ (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
+ (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ // >= low_bound
+ lower_constant_f32(ctx, tmp, low_bound);
+ ctx.emit(Inst::FpuCmp32 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
+ });
+
+ // <= high_bound
+ lower_constant_f32(ctx, tmp, high_bound);
+ ctx.emit(Inst::FpuCmp32 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
+ });
+ } else {
+ // From float64.
+ let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
+ (true, 8) => (
+ i8::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i8::max_value() as f64 + 1.,
+ ),
+ (true, 16) => (
+ i16::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i16::max_value() as f64 + 1.,
+ ),
+ (true, 32) => (
+ i32::min_value() as f64 - 1.,
+ FloatCC::GreaterThan,
+ i32::max_value() as f64 + 1.,
+ ),
+ (true, 64) => (
+ i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an f64.
+ FloatCC::GreaterThanOrEqual,
+ i64::max_value() as f64 + 1.,
+ ),
+ (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
+ (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
+ (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
+ (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
+ _ => panic!("Unknown input/output-bits combination"),
+ };
+
+ // >= low_bound
+ lower_constant_f64(ctx, tmp, low_bound);
+ ctx.emit(Inst::FpuCmp64 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
+ });
+
+ // <= high_bound
+ lower_constant_f64(ctx, tmp, high_bound);
+ ctx.emit(Inst::FpuCmp64 {
+ rn,
+ rm: tmp.to_reg(),
+ });
+ let trap_code = TrapCode::IntegerOverflow;
+ ctx.emit(Inst::TrapIf {
+ trap_code,
+ kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
+ });
+ };
+
+ // Do the conversion.
+ ctx.emit(Inst::FpuToInt { op, rd, rn }); + } + + Opcode::FcvtFromUint | Opcode::FcvtFromSint => { + let ty = ty.unwrap(); + let signed = op == Opcode::FcvtFromSint; + let rd = get_output_reg(ctx, outputs[0]); + + if ty.is_vector() { + let op = if signed { + VecMisc2::Scvtf + } else { + VecMisc2::Ucvtf + }; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + + ctx.emit(Inst::VecMisc { + op, + rd, + rn, + size: VectorSize::from_ty(ty), + }); + } else { + let in_bits = ty_bits(ctx.input_ty(insn, 0)); + let out_bits = ty_bits(ty); + let op = match (signed, in_bits, out_bits) { + (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32, + (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32, + (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64, + (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64, + (false, 64, 32) => IntToFpuOp::U64ToF32, + (true, 64, 32) => IntToFpuOp::I64ToF32, + (false, 64, 64) => IntToFpuOp::U64ToF64, + (true, 64, 64) => IntToFpuOp::I64ToF64, + _ => panic!("Unknown input/output-bits combination"), + }; + let narrow_mode = match (signed, in_bits) { + (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32, + (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32, + (false, 64) => NarrowValueMode::ZeroExtend64, + (true, 64) => NarrowValueMode::SignExtend64, + _ => panic!("Unknown input size"), + }; + let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); + ctx.emit(Inst::IntToFpu { op, rd, rn }); + } + } + + Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { + let ty = ty.unwrap(); + let out_signed = op == Opcode::FcvtToSintSat; + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = get_output_reg(ctx, outputs[0]); + + if ty.is_vector() { + let op = if out_signed { + VecMisc2::Fcvtzs + } else { + VecMisc2::Fcvtzu + }; + + ctx.emit(Inst::VecMisc { + op, + rd, + rn, + size: VectorSize::from_ty(ty), + }); + } else { + let in_ty = ctx.input_ty(insn, 0); + let in_bits = ty_bits(in_ty); + let out_bits = ty_bits(ty); + // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX + // FMIN Vtmp2, Vin, Vtmp1 + // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN + // FMAX Vtmp2, Vtmp2, Vtmp1 + // (if signed) FIMM Vtmp1, 0 + // FCMP Vin, Vin + // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0 + // convert Rout, Vtmp2 + + assert!(in_bits == 32 || in_bits == 64); + assert!(out_bits == 32 || out_bits == 64); + + let min: f64 = match (out_bits, out_signed) { + (32, true) => std::i32::MIN as f64, + (32, false) => 0.0, + (64, true) => std::i64::MIN as f64, + (64, false) => 0.0, + _ => unreachable!(), + }; + + let max = match (out_bits, out_signed) { + (32, true) => std::i32::MAX as f64, + (32, false) => std::u32::MAX as f64, + (64, true) => std::i64::MAX as f64, + (64, false) => std::u64::MAX as f64, + _ => unreachable!(), + }; + + let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty); + let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty); + + if in_bits == 32 { + lower_constant_f32(ctx, rtmp1, max as f32); + } else { + lower_constant_f64(ctx, rtmp1, max); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64), + rd: rtmp2, + rn: rn, + rm: rtmp1.to_reg(), + }); + if in_bits == 32 { + lower_constant_f32(ctx, rtmp1, min as f32); + } else { + lower_constant_f64(ctx, rtmp1, min); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64), + rd: rtmp2, + rn: rtmp2.to_reg(), + rm: 
rtmp1.to_reg(), + }); + if out_signed { + if in_bits == 32 { + lower_constant_f32(ctx, rtmp1, 0.0); + } else { + lower_constant_f64(ctx, rtmp1, 0.0); + } + } + if in_bits == 32 { + ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel32 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } else { + ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel64 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } + + let cvt = match (in_bits, out_bits, out_signed) { + (32, 32, false) => FpuToIntOp::F32ToU32, + (32, 32, true) => FpuToIntOp::F32ToI32, + (32, 64, false) => FpuToIntOp::F32ToU64, + (32, 64, true) => FpuToIntOp::F32ToI64, + (64, 32, false) => FpuToIntOp::F64ToU32, + (64, 32, true) => FpuToIntOp::F64ToI32, + (64, 64, false) => FpuToIntOp::F64ToU64, + (64, 64, true) => FpuToIntOp::F64ToI64, + _ => unreachable!(), + }; + ctx.emit(Inst::FpuToInt { + op: cvt, + rd, + rn: rtmp2.to_reg(), + }); + } + } + + Opcode::IaddIfcout => { + // This is a two-output instruction that is needed for the + // legalizer's explicit heap-check sequence, among possible other + // uses. Its second output is a flags output only ever meant to + // check for overflow using the + // `backend.unsigned_add_overflow_condition()` condition. + // + // Note that the CLIF validation will ensure that no flag-setting + // operation comes between this IaddIfcout and its use (e.g., a + // Trapif). Thus, we can rely on implicit communication through the + // processor flags rather than explicitly generating flags into a + // register. We simply use the variant of the add instruction that + // sets flags (`adds`) here. + + // Ensure that the second output isn't directly called for: it + // should only be used by a flags-consuming op, which will directly + // understand this instruction and merge the comparison. + assert!(!ctx.is_reg_needed(insn, ctx.get_output(insn, 1).to_reg())); + + // Now handle the iadd as above, except use an AddS opcode that sets + // flags. 
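+ //
+ // As a rough sketch (the exact code depends on the flag-consuming
+ // instruction that follows, e.g. a trapif using
+ // `unsigned_add_overflow_condition()`, i.e. HS / carry set), the pair
+ // eventually becomes something like:
+ //
+ // adds xD, xN, #imm // or: adds xD, xN, xM
+ // b.hs <trap> // taken iff the unsigned add carried out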
+ let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm + | Opcode::IcmpImm + | Opcode::IfcmpImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + + #[cfg(feature = "x86")] + Opcode::X86Udivmodx + | Opcode::X86Sdivmodx + | Opcode::X86Umulx + | Opcode::X86Smulx + | Opcode::X86Cvtt2si + | Opcode::X86Fmin + | Opcode::X86Fmax + | Opcode::X86Push + | Opcode::X86Pop + | Opcode::X86Bsr + | Opcode::X86Bsf + | Opcode::X86Pblendw + | Opcode::X86Pshufd + | Opcode::X86Pshufb + | Opcode::X86Pextr + | Opcode::X86Pinsr + | Opcode::X86Insertps + | Opcode::X86Movsd + | Opcode::X86Movlhps + | Opcode::X86Palignr + | Opcode::X86Psll + | Opcode::X86Psrl + | Opcode::X86Psra + | Opcode::X86Ptest + | Opcode::X86Pmaxs + | Opcode::X86Pmaxu + | Opcode::X86Pmins + | Opcode::X86Pminu + | Opcode::X86Pmullq + | Opcode::X86Pmuludq + | Opcode::X86Punpckh + | Opcode::X86Punpckl + | Opcode::X86Vcvtudq2ps + | Opcode::X86ElfTlsGetAddr + | Opcode::X86MachoTlsGetAddr => { + panic!("x86-specific opcode in supposedly arch-neutral IR!"); + } + + Opcode::DummySargT => unreachable!(), + + Opcode::Iabs => { + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + ctx.emit(Inst::VecMisc { + op: VecMisc2::Abs, + rd, + rn, + size: VectorSize::from_ty(ty), + }); + } + Opcode::AvgRound => { + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + ctx.emit(Inst::VecRRR { + alu_op: VecALUOp::Urhadd, + rd, + rn, + rm, + size: VectorSize::from_ty(ty), + }); + } + + Opcode::Snarrow | Opcode::Unarrow => { + let op = if op == Opcode::Snarrow { + VecMiscNarrowOp::Sqxtn + } else { + VecMiscNarrowOp::Sqxtun + }; + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + + ctx.emit(Inst::VecMiscNarrow { + op, + rd, + rn, + size: VectorSize::from_ty(ty), + high_half: false, + }); + ctx.emit(Inst::VecMiscNarrow { + op, + rd, + rn: rn2, + size: VectorSize::from_ty(ty), + high_half: true, + }); + } + + Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => { + let lane_type = ty.unwrap().lane_type(); + let rd = get_output_reg(ctx, outputs[0]); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let (t, high_half) = match (lane_type, op) { + (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false), + (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true), + (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false), + 
(I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true), + (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false), + (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true), + (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false), + (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true), + _ => { + return Err(CodegenError::Unsupported(format!( + "Unsupported SIMD vector lane type: {:?}", + lane_type + ))); + } + }; + + ctx.emit(Inst::VecExtend { + t, + rd, + rn, + high_half, + }); + } + + Opcode::TlsValue => unimplemented!("tls_value"), + } + + Ok(()) +} + +pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>( + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, +) -> CodegenResult<()> { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + let taken = BranchTarget::Label(targets[0]); + let not_taken = match op1 { + Opcode::Jump => BranchTarget::Label(targets[1]), + Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()), + _ => unreachable!(), // assert above. + }; + + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(icmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint) + { + let condcode = ctx.data(icmp_insn).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else if let Some(fcmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint) + { + let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + let rt = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend64, + ); + let kind = match op0 { + Opcode::Brz => CondBrKind::Zero(rt), + Opcode::Brnz => CondBrKind::NotZero(rt), + _ => unreachable!(), + }; + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + } + Opcode::BrIcmp => { + let condcode = ctx.data(branches[0]).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + + let is_signed = condcode_is_signed(condcode); + let ty = ctx.input_ty(branches[0], 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => 
NarrowValueMode::ZeroExtend64, + }; + let rn = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + narrow_mode, + ); + let rm = put_input_in_rse_imm12( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + narrow_mode, + ); + + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + + Opcode::Brif => { + let condcode = ctx.data(branches[0]).cond_code().unwrap(); + let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + + let is_signed = condcode_is_signed(condcode); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) { + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } else { + // If the ifcmp result is actually placed in a + // register, we need to move it back into the flags. + let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + } + + Opcode::Brff => { + let condcode = ctx.data(branches[0]).fp_cond_code().unwrap(); + let cond = lower_fp_condcode(condcode); + let kind = CondBrKind::Cond(cond); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) { + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } else { + // If the ffcmp result is actually placed in a + // register, we need to move it back into the flags. + let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + } + + _ => unimplemented!(), + } + } else { + // Must be an unconditional branch or an indirect branch. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + assert!(branches.len() == 1); + // In the Fallthrough case, the machine-independent driver + // fills in `targets[0]` with our fallthrough block, so this + // is valid for both Jump and Fallthrough. + ctx.emit(Inst::Jump { + dest: BranchTarget::Label(targets[0]), + }); + } + + Opcode::BrTable => { + // Expand `br_table index, default, JT` to: + // + // emit_island // this forces an island at this point + // // if the jumptable would push us past + // // the deadline + // subs idx, #jt_size + // b.hs default + // adr vTmp1, PC+16 + // ldr vTmp2, [vTmp1, idx, lsl #2] + // add vTmp2, vTmp2, vTmp1 + // br vTmp2 + // [jumptable offsets relative to JT base] + let jt_size = targets.len() - 1; + assert!(jt_size <= std::u32::MAX as usize); + + ctx.emit(Inst::EmitIsland { + needed_space: 4 * (6 + jt_size) as CodeOffset, + }); + + let ridx = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend32, + ); + + let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32); + + // Bounds-check, leaving condition codes for JTSequence's + // branch to default target below. 
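+ //
+ // The `subs` result is discarded into the zero register; only NZCV is
+ // kept, and the JTSequence's `b.hs default` then fires whenever
+ // idx >= jt_size (unsigned), sending out-of-range indices to the
+ // default target.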
+ if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + imm12, + }); + } else { + lower_constant_u64(ctx, rtmp1, jt_size as u64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + rm: rtmp1.to_reg(), + }); + } + + // Emit the compound instruction that does: + // + // b.hs default + // adr rA, jt + // ldrsw rB, [rA, rIndex, UXTW 2] + // add rA, rA, rB + // br rA + // [jt entries] + // + // This must be *one* instruction in the vcode because + // we cannot allow regalloc to insert any spills/fills + // in the middle of the sequence; otherwise, the ADR's + // PC-rel offset to the jumptable would be incorrect. + // (The alternative is to introduce a relocation pass + // for inlined jumptables, which is much worse, IMHO.) + + let jt_targets: Vec<BranchTarget> = targets + .iter() + .skip(1) + .map(|bix| BranchTarget::Label(*bix)) + .collect(); + let default_target = BranchTarget::Label(targets[0]); + let targets_for_term: Vec<MachLabel> = targets.to_vec(); + ctx.emit(Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + info: Box::new(JTSequenceInfo { + targets: jt_targets, + default_target, + targets_for_term, + }), + }); + } + + _ => panic!("Unknown branch type!"), + } + } + + Ok(()) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs new file mode 100644 index 0000000000..c3c56632d3 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/aarch64/mod.rs @@ -0,0 +1,274 @@ +//! ARM 64-bit Instruction Set Architecture. + +use crate::ir::condcodes::IntCC; +use crate::ir::Function; +use crate::isa::Builder as IsaBuilder; +use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; +use crate::result::CodegenResult; +use crate::settings; + +use alloc::boxed::Box; + +use regalloc::{PrettyPrint, RealRegUniverse}; +use target_lexicon::{Aarch64Architecture, Architecture, Triple}; + +// New backend: +mod abi; +pub(crate) mod inst; +mod lower; +mod lower_inst; + +use inst::create_reg_universe; + +use self::inst::EmitInfo; + +/// An AArch64 backend. +pub struct AArch64Backend { + triple: Triple, + flags: settings::Flags, + reg_universe: RealRegUniverse, +} + +impl AArch64Backend { + /// Create a new AArch64 backend with the given (shared) flags. + pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend { + let reg_universe = create_reg_universe(&flags); + AArch64Backend { + triple, + flags, + reg_universe, + } + } + + /// This performs lowering to VCode, register-allocates the code, computes block layout and + /// finalizes branches. The result is ready for binary emission. 
+ fn compile_vcode( + &self, + func: &Function, + flags: settings::Flags, + ) -> CodegenResult<VCode<inst::Inst>> { + let emit_info = EmitInfo::new(flags.clone()); + let abi = Box::new(abi::AArch64ABICallee::new(func, flags)?); + compile::compile::<AArch64Backend>(func, self, abi, emit_info) + } +} + +impl MachBackend for AArch64Backend { + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult> { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags.clone())?; + + let buffer = vcode.emit(); + let frame_size = vcode.frame_size(); + let unwind_info = vcode.unwind_info()?; + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe(flags)))) + } else { + None + }; + + let buffer = buffer.finish(); + + Ok(MachCompileResult { + buffer, + frame_size, + disasm, + unwind_info, + }) + } + + fn name(&self) -> &'static str { + "aarch64" + } + + fn triple(&self) -> Triple { + self.triple.clone() + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn reg_universe(&self) -> &RealRegUniverse { + &self.reg_universe + } + + fn unsigned_add_overflow_condition(&self) -> IntCC { + // Unsigned `>=`; this corresponds to the carry flag set on aarch64, which happens on + // overflow of an add. + IntCC::UnsignedGreaterThanOrEqual + } + + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // unsigned `<`; this corresponds to the carry flag cleared on aarch64, which happens on + // underflow of a subtract (aarch64 follows a carry-cleared-on-borrow convention, the + // opposite of x86). + IntCC::UnsignedLessThan + } + + #[cfg(feature = "unwind")] + fn emit_unwind_info( + &self, + result: &MachCompileResult, + kind: crate::machinst::UnwindInfoKind, + ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + use crate::isa::unwind::UnwindInfo; + use crate::machinst::UnwindInfoKind; + Ok(match (result.unwind_info.as_ref(), kind) { + (Some(info), UnwindInfoKind::SystemV) => { + inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV) + } + (Some(_info), UnwindInfoKind::Windows) => { + // TODO: support Windows unwind info on AArch64 + None + } + _ => None, + }) + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + Some(inst::unwind::systemv::create_cie()) + } +} + +/// Create a new `isa::Builder`. 
+pub fn isa_builder(triple: Triple) -> IsaBuilder { + assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64)); + IsaBuilder { + triple, + setup: settings::builder(), + constructor: |triple, shared_flags, _| { + let backend = AArch64Backend::new_with_flags(triple, shared_flags); + Box::new(TargetIsaAdapter::new(backend)) + }, + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + use crate::settings; + use crate::settings::Configurable; + use core::str::FromStr; + use target_lexicon::Triple; + + #[test] + fn test_compile_function() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().return_(&[v1]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); + let buffer = backend.compile_function(&mut func, false).unwrap().buffer; + let code = &buffer.data[..]; + + // stp x29, x30, [sp, #-16]! + // mov x29, sp + // mov x1, #0x1234 + // add w0, w0, w1 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + let golden = vec![ + 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0x81, 0x46, 0x82, 0xd2, 0x00, 0x00, + 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, + ]; + + assert_eq!(code, &golden[..]); + } + + #[test] + fn test_branch_lowering() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + let bb1 = func.dfg.make_block(); + let bb2 = func.dfg.make_block(); + let bb3 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().brnz(v1, bb1, &[]); + pos.ins().jump(bb2, &[]); + pos.insert_block(bb1); + pos.ins().brnz(v1, bb2, &[]); + pos.ins().jump(bb3, &[]); + pos.insert_block(bb2); + let v2 = pos.ins().iadd(v1, v0); + pos.ins().brnz(v2, bb2, &[]); + pos.ins().jump(bb1, &[]); + pos.insert_block(bb3); + let v3 = pos.ins().isub(v1, v0); + pos.ins().return_(&[v3]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); + let result = backend + .compile_function(&mut func, /* want_disasm = */ false) + .unwrap(); + let code = &result.buffer.data[..]; + + // stp x29, x30, [sp, #-16]! 
+ // mov x29, sp + // mov x1, #0x1234 // #4660 + // add w0, w0, w1 + // mov w1, w0 + // cbnz x1, 0x28 + // mov x1, #0x1234 // #4660 + // add w1, w0, w1 + // mov w1, w1 + // cbnz x1, 0x18 + // mov w1, w0 + // cbnz x1, 0x18 + // mov x1, #0x1234 // #4660 + // sub w0, w0, w1 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + let golden = vec![ + 253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161, + 0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3, + 0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123, + 193, 168, 192, 3, 95, 214, + ]; + + assert_eq!(code, &golden[..]); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs new file mode 100644 index 0000000000..edf1792e52 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/abi.rs @@ -0,0 +1,471 @@ +//! Implementation of the 32-bit ARM ABI. + +use crate::ir; +use crate::ir::types::*; +use crate::isa; +use crate::isa::arm32::inst::*; +use crate::machinst::*; +use crate::settings; +use crate::{CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use regalloc::{RealReg, Reg, RegClass, Set, Writable}; +use smallvec::SmallVec; + +/// Support for the ARM ABI from the callee side (within a function body). +pub(crate) type Arm32ABICallee = ABICalleeImpl<Arm32MachineDeps>; + +/// Support for the ARM ABI from the caller side (at a callsite). +pub(crate) type Arm32ABICaller = ABICallerImpl<Arm32MachineDeps>; + +/// This is the limit for the size of argument and return-value areas on the +/// stack. We place a reasonable limit here to avoid integer overflow issues +/// with 32-bit arithmetic: for now, 128 MB. +static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024; + +/// ARM-specific ABI behavior. This struct just serves as an implementation +/// point for the trait; it is never actually instantiated. +pub(crate) struct Arm32MachineDeps; + +impl Into<AMode> for StackAMode { + fn into(self) -> AMode { + match self { + StackAMode::FPOffset(off, ty) => AMode::FPOffset(off, ty), + StackAMode::NominalSPOffset(off, ty) => AMode::NominalSPOffset(off, ty), + StackAMode::SPOffset(off, ty) => AMode::SPOffset(off, ty), + } + } +} + +impl ABIMachineSpec for Arm32MachineDeps { + type I = Inst; + + fn word_bits() -> u32 { + 32 + } + + /// Return required stack alignment in bytes. + fn stack_align(_call_conv: isa::CallConv) -> u32 { + 8 + } + + fn compute_arg_locs( + _call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> { + let mut next_rreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + let mut stack_args = vec![]; + + let max_reg_val = 4; // r0-r3 + + for i in 0..params.len() { + let param = params[i]; + + // Validate "purpose". + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::StackLimit + | &ir::ArgumentPurpose::SignatureId => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + assert!(param.value_type.bits() <= 32); + + if next_rreg < max_reg_val { + let reg = rreg(next_rreg); + + ret.push(ABIArg::Reg( + reg.to_real_reg(), + param.value_type, + param.extension, + param.purpose, + )); + next_rreg += 1; + } else { + // Arguments are stored on stack in reversed order. 
+ // https://static.docs.arm.com/ihi0042/g/aapcs32.pdf + + // Stack offset is not known yet. Store param info for later. + stack_args.push((param.value_type, param.extension, param.purpose)); + next_stack += 4; + } + } + + let extra_arg = if add_ret_area_ptr { + debug_assert!(args_or_rets == ArgsOrRets::Args); + if next_rreg < max_reg_val { + ret.push(ABIArg::Reg( + rreg(next_rreg).to_real_reg(), + I32, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + } else { + stack_args.push(( + I32, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + next_stack += 4; + } + Some(ret.len() - 1) + } else { + None + }; + + // Now we can assign proper stack offsets to params. + let max_stack = next_stack; + for (ty, ext, purpose) in stack_args.into_iter().rev() { + next_stack -= 4; + ret.push(ABIArg::Stack( + (max_stack - next_stack) as i64, + ty, + ext, + purpose, + )); + } + assert_eq!(next_stack, 0); + + next_stack = (next_stack + 7) & !7; + + // To avoid overflow issues, limit the arg/return size to something + // reasonable -- here, 128 MB. + if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((ret, next_stack as i64, extra_arg)) + } + + fn fp_to_arg_offset(_call_conv: isa::CallConv, _flags: &settings::Flags) -> i64 { + 8 // frame pointer and link register + } + + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Inst { + Inst::gen_load(into_reg, mem.into(), ty) + } + + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_store(from_reg, mem.into(), ty) + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst { + Inst::gen_move(to_reg, from_reg, ty) + } + + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + is_signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Inst { + assert!(to_bits == 32); + assert!(from_bits < 32); + Inst::Extend { + rd: to_reg, + rm: from_reg, + signed: is_signed, + from_bits, + } + } + + fn gen_ret() -> Inst { + Inst::Ret + } + + fn gen_epilogue_placeholder() -> Inst { + Inst::EpiloguePlaceholder + } + + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Inst; 4]> { + let mut insts = SmallVec::new(); + + if let Some(imm12) = UImm12::maybe_from_i64(imm as i64) { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add, + rd: into_reg, + rn: from_reg, + imm12, + }); + } else { + let scratch2 = writable_tmp2_reg(); + insts.extend(Inst::load_constant(scratch2, imm)); + insts.push(Inst::AluRRRShift { + alu_op: ALUOp::Add, + rd: into_reg, + rn: from_reg, + rm: scratch2.to_reg(), + shift: None, + }); + } + insts + } + + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Inst; 2]> { + let mut insts = SmallVec::new(); + insts.push(Inst::Cmp { + rn: sp_reg(), + rm: limit_reg, + }); + insts.push(Inst::TrapIf { + trap_info: ir::TrapCode::StackOverflow, + // Here `Lo` == "less than" when interpreting the two + // operands as unsigned integers. 
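+ // That is, the trap fires when SP is strictly below the stack limit.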
+ cond: Cond::Lo, + }); + insts + } + + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst { + let mem = mem.into(); + Inst::LoadAddr { rd: into_reg, mem } + } + + fn get_stacklimit_reg() -> Reg { + ip_reg() + } + + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64); + Inst::gen_load(into_reg, mem, ty) + } + + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst { + let mem = AMode::RegOffset(base, offset as i64); + Inst::gen_store(from_reg, mem, ty) + } + + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Inst; 2]> { + let mut ret = SmallVec::new(); + + if amount == 0 { + return ret; + } + let (amount, is_sub) = if amount > 0 { + (amount, false) + } else { + (-amount, true) + }; + + let alu_op = if is_sub { ALUOp::Sub } else { ALUOp::Add }; + + if let Some(imm12) = UImm12::maybe_from_i64(amount as i64) { + ret.push(Inst::AluRRImm12 { + alu_op, + rd: writable_sp_reg(), + rn: sp_reg(), + imm12, + }); + } else { + let tmp = writable_ip_reg(); + ret.extend(Inst::load_constant(tmp, amount as u32)); + ret.push(Inst::AluRRRShift { + alu_op, + rd: writable_sp_reg(), + rn: sp_reg(), + rm: tmp.to_reg(), + shift: None, + }); + } + ret + } + + fn gen_nominal_sp_adj(offset: i32) -> Inst { + let offset = i64::from(offset); + Inst::VirtualSPOffsetAdj { offset } + } + + fn gen_prologue_frame_setup() -> SmallVec<[Inst; 2]> { + let mut ret = SmallVec::new(); + let reg_list = vec![fp_reg(), lr_reg()]; + ret.push(Inst::Push { reg_list }); + ret.push(Inst::Mov { + rd: writable_fp_reg(), + rm: sp_reg(), + }); + ret + } + + fn gen_epilogue_frame_restore() -> SmallVec<[Inst; 2]> { + let mut ret = SmallVec::new(); + ret.push(Inst::Mov { + rd: writable_sp_reg(), + rm: fp_reg(), + }); + let reg_list = vec![writable_fp_reg(), writable_lr_reg()]; + ret.push(Inst::Pop { reg_list }); + ret + } + + /// Returns stack bytes used as well as instructions. Does not adjust + /// nominal SP offset; caller will do that. + fn gen_clobber_save( + _call_conv: isa::CallConv, + _flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> (u64, SmallVec<[Inst; 16]>) { + let mut insts = SmallVec::new(); + if fixed_frame_storage_size > 0 { + insts.extend(Self::gen_sp_reg_adjust(-(fixed_frame_storage_size as i32)).into_iter()); + } + let clobbered_vec = get_callee_saves(clobbers); + let mut clobbered_vec: Vec<_> = clobbered_vec + .into_iter() + .map(|r| r.to_reg().to_reg()) + .collect(); + if clobbered_vec.len() % 2 == 1 { + // For alignment purposes. 
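+ // (stack_align() above requires 8-byte alignment, so an odd number of
+ // 4-byte saves is padded with ip, which this backend already treats as
+ // a scratch register; saving a junk copy of it is harmless.)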
+ clobbered_vec.push(ip_reg());
+ }
+ let stack_used = clobbered_vec.len() * 4;
+ if !clobbered_vec.is_empty() {
+ insts.push(Inst::Push {
+ reg_list: clobbered_vec,
+ });
+ }
+
+ (stack_used as u64, insts)
+ }
+
+ fn gen_clobber_restore(
+ _call_conv: isa::CallConv,
+ _flags: &settings::Flags,
+ clobbers: &Set<Writable<RealReg>>,
+ _fixed_frame_storage_size: u32,
+ _outgoing_args_size: u32,
+ ) -> SmallVec<[Inst; 16]> {
+ let mut insts = SmallVec::new();
+ let clobbered_vec = get_callee_saves(clobbers);
+ let mut clobbered_vec: Vec<_> = clobbered_vec
+ .into_iter()
+ .map(|r| Writable::from_reg(r.to_reg().to_reg()))
+ .collect();
+ if clobbered_vec.len() % 2 == 1 {
+ clobbered_vec.push(writable_ip_reg());
+ }
+ if !clobbered_vec.is_empty() {
+ insts.push(Inst::Pop {
+ reg_list: clobbered_vec,
+ });
+ }
+ insts
+ }
+
+ fn gen_call(
+ dest: &CallDest,
+ uses: Vec<Reg>,
+ defs: Vec<Writable<Reg>>,
+ opcode: ir::Opcode,
+ tmp: Writable<Reg>,
+ _callee_conv: isa::CallConv,
+ _caller_conv: isa::CallConv,
+ ) -> SmallVec<[(InstIsSafepoint, Inst); 2]> {
+ let mut insts = SmallVec::new();
+ match &dest {
+ &CallDest::ExtName(ref name, RelocDistance::Near) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::Call {
+ info: Box::new(CallInfo {
+ dest: name.clone(),
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ )),
+ &CallDest::ExtName(ref name, RelocDistance::Far) => {
+ insts.push((
+ InstIsSafepoint::No,
+ Inst::LoadExtName {
+ rt: tmp,
+ name: Box::new(name.clone()),
+ offset: 0,
+ },
+ ));
+ insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: tmp.to_reg(),
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ ));
+ }
+ &CallDest::Reg(reg) => insts.push((
+ InstIsSafepoint::Yes,
+ Inst::CallInd {
+ info: Box::new(CallIndInfo {
+ rm: *reg,
+ uses,
+ defs,
+ opcode,
+ }),
+ },
+ )),
+ }
+
+ insts
+ }
+
+ fn get_number_of_spillslots_for_value(rc: RegClass, _ty: Type) -> u32 {
+ match rc {
+ RegClass::I32 => 1,
+ _ => panic!("Unexpected register class!"),
+ }
+ }
+
+ fn get_virtual_sp_offset_from_state(s: &EmitState) -> i64 {
+ s.virtual_sp_offset
+ }
+
+ fn get_nominal_sp_to_fp(s: &EmitState) -> i64 {
+ s.nominal_sp_to_fp
+ }
+
+ fn get_regs_clobbered_by_call(_: isa::CallConv) -> Vec<Writable<Reg>> {
+ let mut caller_saved = Vec::new();
+ for i in 0..15 {
+ let r = writable_rreg(i);
+ if is_reg_clobbered_by_call(r.to_reg().to_real_reg()) {
+ caller_saved.push(r);
+ }
+ }
+ caller_saved
+ }
+}
+
+fn is_callee_save(r: RealReg) -> bool {
+ let enc = r.get_hw_encoding();
+ 4 <= enc && enc <= 10
+}
+
+fn get_callee_saves(regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
+ let mut ret = Vec::new();
+ for &reg in regs.iter() {
+ if is_callee_save(reg.to_reg()) {
+ ret.push(reg);
+ }
+ }
+
+ // Sort registers for deterministic code output.
+ ret.sort_by_key(|r| r.to_reg().get_index());
+ ret
+}
+
+fn is_reg_clobbered_by_call(r: RealReg) -> bool {
+ let enc = r.get_hw_encoding();
+ enc <= 3
+}
diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs
new file mode 100644
index 0000000000..2c1b8e97d6
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/args.rs
@@ -0,0 +1,335 @@
+//! 32-bit ARM ISA definitions: instruction arguments.
+
+use crate::isa::arm32::inst::*;
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg};
+
+use std::string::String;
+
+/// A shift operator for a register or immediate.
+#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum ShiftOp { + LSL = 0b00, + LSR = 0b01, + ASR = 0b10, + ROR = 0b11, +} + +impl ShiftOp { + /// Get the encoding of this shift op. + pub fn bits(self) -> u8 { + self as u8 + } +} + +/// A shift operator amount. +#[derive(Clone, Copy, Debug)] +pub struct ShiftOpShiftImm(u8); + +impl ShiftOpShiftImm { + /// Maximum shift for shifted-register operands. + pub const MAX_SHIFT: u32 = 31; + + /// Create a new shiftop shift amount, if possible. + pub fn maybe_from_shift(shift: u32) -> Option<ShiftOpShiftImm> { + if shift <= Self::MAX_SHIFT { + Some(ShiftOpShiftImm(shift as u8)) + } else { + None + } + } + + /// Return the shift amount. + pub fn value(self) -> u8 { + self.0 + } +} + +/// A shift operator with an amount, guaranteed to be within range. +#[derive(Clone, Debug)] +pub struct ShiftOpAndAmt { + op: ShiftOp, + shift: ShiftOpShiftImm, +} + +impl ShiftOpAndAmt { + pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt { + ShiftOpAndAmt { op, shift } + } + + /// Get the shift op. + pub fn op(&self) -> ShiftOp { + self.op + } + + /// Get the shift amount. + pub fn amt(&self) -> ShiftOpShiftImm { + self.shift + } +} + +// An unsigned 8-bit immediate. +#[derive(Clone, Copy, Debug)] +pub struct UImm8 { + /// The value. + value: u8, +} + +impl UImm8 { + pub fn maybe_from_i64(value: i64) -> Option<UImm8> { + if 0 <= value && value < (1 << 8) { + Some(UImm8 { value: value as u8 }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + u32::from(self.value) + } +} + +/// An unsigned 12-bit immediate. +#[derive(Clone, Copy, Debug)] +pub struct UImm12 { + /// The value. + value: u16, +} + +impl UImm12 { + pub fn maybe_from_i64(value: i64) -> Option<UImm12> { + if 0 <= value && value < (1 << 12) { + Some(UImm12 { + value: value as u16, + }) + } else { + None + } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + u32::from(self.value) + } +} + +/// An addressing mode specified for a load/store operation. +#[derive(Clone, Debug)] +pub enum AMode { + // Real addressing modes + /// Register plus register offset, which can be shifted left by imm2. + RegReg(Reg, Reg, u8), + + /// Unsigned 12-bit immediate offset from reg. + RegOffset12(Reg, UImm12), + + /// Immediate offset from program counter aligned to 4. + /// Cannot be used by store instructions. + PCRel(i32), + + // Virtual addressing modes that are lowered at emission time: + /// Immediate offset from reg. + RegOffset(Reg, i64), + + /// Signed immediate offset from stack pointer. + SPOffset(i64, Type), + + /// Offset from the frame pointer. + FPOffset(i64, Type), + + /// Signed immediate offset from "nominal stack pointer". + NominalSPOffset(i64, Type), +} + +impl AMode { + /// Memory reference using the sum of two registers as an address. + pub fn reg_plus_reg(reg1: Reg, reg2: Reg, shift_amt: u8) -> AMode { + assert!(shift_amt <= 3); + AMode::RegReg(reg1, reg2, shift_amt) + } + + /// Memory reference using the sum of a register and an immediate offset + /// as an address. + pub fn reg_plus_imm(reg: Reg, offset: i64) -> AMode { + AMode::RegOffset(reg, offset) + } +} + +/// Condition for conditional branches. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum Cond { + Eq = 0, + Ne = 1, + Hs = 2, + Lo = 3, + Mi = 4, + Pl = 5, + Vs = 6, + Vc = 7, + Hi = 8, + Ls = 9, + Ge = 10, + Lt = 11, + Gt = 12, + Le = 13, + Al = 14, +} + +impl Cond { + /// Return the inverted condition. 
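+ /// `Al` (always) has no inverse; inverting it panics.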
+ pub fn invert(self) -> Cond { + match self { + Cond::Eq => Cond::Ne, + Cond::Ne => Cond::Eq, + + Cond::Hs => Cond::Lo, + Cond::Lo => Cond::Hs, + + Cond::Mi => Cond::Pl, + Cond::Pl => Cond::Mi, + + Cond::Vs => Cond::Vc, + Cond::Vc => Cond::Vs, + + Cond::Hi => Cond::Ls, + Cond::Ls => Cond::Hi, + + Cond::Ge => Cond::Lt, + Cond::Lt => Cond::Ge, + + Cond::Gt => Cond::Le, + Cond::Le => Cond::Gt, + + Cond::Al => panic!("Cannot inverse {:?} condition", self), + } + } + + /// Return the machine encoding of this condition. + pub fn bits(self) -> u16 { + self as u16 + } +} + +/// A branch target. Either unresolved (basic-block index) or resolved (offset +/// from end of current instruction). +#[derive(Clone, Copy, Debug)] +pub enum BranchTarget { + /// An unresolved reference to a Label. + Label(MachLabel), + /// A fixed PC offset. + ResolvedOffset(i32), +} + +impl BranchTarget { + /// Return the target's label, if it is a label-based target. + pub fn as_label(self) -> Option<MachLabel> { + match self { + BranchTarget::Label(l) => Some(l), + _ => None, + } + } + + // Ready for embedding in instruction. + fn as_offset(self, inst_16_bit: bool) -> i32 { + match self { + BranchTarget::ResolvedOffset(off) => { + if inst_16_bit { + // pc is equal to end of the current inst + 2. + (off - 2) >> 1 + } else { + // pc points to end of the current inst. + off >> 1 + } + } + _ => 0, + } + } + + // For 32-bit unconditional jump. + pub fn as_off24(self) -> u32 { + let off = self.as_offset(false); + assert!(off < (1 << 24)); + assert!(off >= -(1 << 24)); + (off as u32) & ((1 << 24) - 1) + } + + // For 32-bit conditional jump. + pub fn as_off20(self) -> u32 { + let off = self.as_offset(false); + assert!(off < (1 << 20)); + assert!(off >= -(1 << 20)); + (off as u32) & ((1 << 20) - 1) + } +} + +impl PrettyPrint for ShiftOpAndAmt { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let op = match self.op() { + ShiftOp::LSL => "lsl", + ShiftOp::LSR => "lsr", + ShiftOp::ASR => "asr", + ShiftOp::ROR => "ror", + }; + format!("{} #{}", op, self.amt().value()) + } +} + +impl PrettyPrint for UImm8 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for UImm12 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl PrettyPrint for AMode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &AMode::RegReg(rn, rm, imm2) => { + let shift = if imm2 != 0 { + format!(", lsl #{}", imm2) + } else { + "".to_string() + }; + format!( + "[{}, {}{}]", + rn.show_rru(mb_rru), + rm.show_rru(mb_rru), + shift + ) + } + &AMode::RegOffset12(rn, off) => { + format!("[{}, {}]", rn.show_rru(mb_rru), off.show_rru(mb_rru)) + } + &AMode::PCRel(off) => format!("[pc, #{}]", off), + &AMode::RegOffset(..) + | &AMode::SPOffset(..) + | &AMode::FPOffset(..) + | &AMode::NominalSPOffset(..) 
=> panic!("unexpected mem mode"), + } + } +} + +impl PrettyPrint for Cond { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = format!("{:?}", self); + s.make_ascii_lowercase(); + s + } +} + +impl PrettyPrint for BranchTarget { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &BranchTarget::Label(label) => format!("label{:?}", label.get()), + &BranchTarget::ResolvedOffset(off) => format!("{}", off), + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs new file mode 100644 index 0000000000..5e4a412e96 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit.rs @@ -0,0 +1,829 @@ +//! 32-bit ARM ISA: binary code emission. + +use crate::binemit::{Reloc, StackMap}; +use crate::ir::SourceLoc; +use crate::isa::arm32::inst::*; + +use core::convert::TryFrom; +use log::debug; + +/// Memory addressing mode finalization: convert "special" modes (e.g., +/// nominal stack offset) into real addressing modes, possibly by +/// emitting some helper instructions that come immediately before the use +/// of this amode. +pub fn mem_finalize(mem: &AMode, state: &EmitState) -> (SmallVec<[Inst; 4]>, AMode) { + match mem { + &AMode::RegOffset(_, off) + | &AMode::SPOffset(off, _) + | &AMode::FPOffset(off, _) + | &AMode::NominalSPOffset(off, _) => { + let basereg = match mem { + &AMode::RegOffset(reg, _) => reg, + &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => sp_reg(), + &AMode::FPOffset(..) => fp_reg(), + _ => unreachable!(), + }; + let adj = match mem { + &AMode::NominalSPOffset(..) => { + debug!( + "mem_finalize: nominal SP offset {} + adj {} -> {}", + off, + state.virtual_sp_offset, + off + state.virtual_sp_offset + ); + state.virtual_sp_offset + } + _ => 0, + }; + let off = off + adj; + + assert!(-(1 << 31) <= off && off <= (1 << 32)); + + if let Some(off) = UImm12::maybe_from_i64(off) { + let mem = AMode::RegOffset12(basereg, off); + (smallvec![], mem) + } else { + let tmp = writable_ip_reg(); + let const_insts = Inst::load_constant(tmp, off as u32); + let mem = AMode::reg_plus_reg(basereg, tmp.to_reg(), 0); + (const_insts, mem) + } + } + // Just assert immediate is valid here. 
+ _ => (smallvec![], mem.clone()), + } +} + +//============================================================================= +// Instructions and subcomponents: emission + +fn machreg_to_gpr(m: Reg) -> u16 { + assert_eq!(m.get_class(), RegClass::I32); + u16::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn machreg_to_gpr_lo(m: Reg) -> u16 { + let gpr_lo = machreg_to_gpr(m); + assert!(gpr_lo < 8); + gpr_lo +} + +fn machreg_is_lo(m: Reg) -> bool { + machreg_to_gpr(m) < 8 +} + +fn enc_16_rr(bits_15_6: u16, rd: Reg, rm: Reg) -> u16 { + (bits_15_6 << 6) | machreg_to_gpr_lo(rd) | (machreg_to_gpr_lo(rm) << 3) +} + +fn enc_16_rr_any(bits_15_8: u16, rd: Reg, rm: Reg) -> u16 { + let rd = machreg_to_gpr(rd); + (bits_15_8 << 8) | (rd & 0x7) | ((rd >> 3) << 7) | (machreg_to_gpr(rm) << 3) +} + +fn enc_16_mov(rd: Writable<Reg>, rm: Reg) -> u16 { + enc_16_rr_any(0b01000110, rd.to_reg(), rm) +} + +fn enc_16_it(cond: Cond, insts: &Vec<CondInst>) -> u16 { + let cond = cond.bits(); + let mut mask: u16 = 0; + for inst in insts.iter().skip(1) { + if inst.then { + mask |= cond & 0x1; + } else { + mask |= (cond & 0x1) ^ 0x1; + } + mask <<= 1; + } + mask |= 0x1; + mask <<= 4 - insts.len(); + 0b1011_1111_0000_0000 | (cond << 4) | mask +} + +fn enc_32_regs( + mut inst: u32, + reg_0: Option<Reg>, + reg_8: Option<Reg>, + reg_12: Option<Reg>, + reg_16: Option<Reg>, +) -> u32 { + if let Some(reg_0) = reg_0 { + inst |= u32::from(machreg_to_gpr(reg_0)); + } + if let Some(reg_8) = reg_8 { + inst |= u32::from(machreg_to_gpr(reg_8)) << 8; + } + if let Some(reg_12) = reg_12 { + inst |= u32::from(machreg_to_gpr(reg_12)) << 12; + } + if let Some(reg_16) = reg_16 { + inst |= u32::from(machreg_to_gpr(reg_16)) << 16; + } + inst +} + +fn enc_32_reg_shift(inst: u32, shift: &Option<ShiftOpAndAmt>) -> u32 { + match shift { + Some(shift) => { + let op = u32::from(shift.op().bits()); + let amt = u32::from(shift.amt().value()); + let imm2 = amt & 0x3; + let imm3 = (amt >> 2) & 0x7; + + inst | (op << 4) | (imm2 << 6) | (imm3 << 12) + } + None => inst, + } +} + +fn enc_32_r_imm16(bits_31_20: u32, rd: Reg, imm16: u16) -> u32 { + let imm16 = u32::from(imm16); + let imm8 = imm16 & 0xff; + let imm3 = (imm16 >> 8) & 0x7; + let i = (imm16 >> 11) & 0x1; + let imm4 = (imm16 >> 12) & 0xf; + + let inst = ((bits_31_20 << 20) & !(1 << 26)) | imm8 | (imm3 << 12) | (imm4 << 16) | (i << 26); + enc_32_regs(inst, None, Some(rd), None, None) +} + +fn enc_32_rrr(bits_31_20: u32, bits_15_12: u32, bits_7_4: u32, rd: Reg, rm: Reg, rn: Reg) -> u32 { + let inst = (bits_31_20 << 20) | (bits_15_12 << 12) | (bits_7_4 << 4); + enc_32_regs(inst, Some(rm), Some(rd), None, Some(rn)) +} + +fn enc_32_imm12(inst: u32, imm12: UImm12) -> u32 { + let imm12 = imm12.bits(); + let imm8 = imm12 & 0xff; + let imm3 = (imm12 >> 8) & 0x7; + let i = (imm12 >> 11) & 0x1; + inst | imm8 | (imm3 << 12) | (i << 26) +} + +fn enc_32_mem_r(bits_24_20: u32, rt: Reg, rn: Reg, rm: Reg, imm2: u8) -> u32 { + let imm2 = u32::from(imm2); + let inst = (imm2 << 4) | (bits_24_20 << 20) | (0b11111 << 27); + enc_32_regs(inst, Some(rm), None, Some(rt), Some(rn)) +} + +fn enc_32_mem_off12(bits_24_20: u32, rt: Reg, rn: Reg, off12: UImm12) -> u32 { + let off12 = off12.bits(); + let inst = off12 | (bits_24_20 << 20) | (0b11111 << 27); + enc_32_regs(inst, None, None, Some(rt), Some(rn)) +} + +fn enc_32_jump(target: BranchTarget) -> u32 { + let off24 = target.as_off24(); + let imm11 = off24 & 0x7ff; + let imm10 = (off24 >> 11) & 0x3ff; + let i2 = (off24 >> 21) & 0x1; + let i1 = (off24 >> 22) & 0x1; 
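+ // The Thumb-2 32-bit branch encoding stores J1 = NOT(I1 EOR S) and
+ // J2 = NOT(I2 EOR S); the trailing `^ 1` below performs the negation.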
+ let s = (off24 >> 23) & 0x1; + let j1 = (i1 ^ s) ^ 1; + let j2 = (i2 ^ s) ^ 1; + + 0b11110_0_0000000000_10_0_1_0_00000000000 + | imm11 + | (j2 << 11) + | (j1 << 13) + | (imm10 << 16) + | (s << 26) +} + +fn enc_32_cond_branch(cond: Cond, target: BranchTarget) -> u32 { + let cond = u32::from(cond.bits()); + let off20 = target.as_off20(); + let imm11 = off20 & 0x7ff; + let imm6 = (off20 >> 11) & 0x3f; + let j1 = (off20 >> 17) & 0x1; + let j2 = (off20 >> 18) & 0x1; + let s = (off20 >> 19) & 0x1; + + 0b11110_0_0000_000000_10_0_0_0_00000000000 + | imm11 + | (j2 << 11) + | (j1 << 13) + | (imm6 << 16) + | (cond << 22) + | (s << 26) +} + +fn u32_swap_halfwords(x: u32) -> u32 { + (x >> 16) | (x << 16) +} + +fn emit_32(inst: u32, sink: &mut MachBuffer<Inst>) { + let inst_hi = (inst >> 16) as u16; + let inst_lo = (inst & 0xffff) as u16; + sink.put2(inst_hi); + sink.put2(inst_lo); +} + +/// State carried between emissions of a sequence of instructions. +#[derive(Default, Clone, Debug)] +pub struct EmitState { + /// Addend to convert nominal-SP offsets to real-SP offsets at the current + /// program point. + pub(crate) virtual_sp_offset: i64, + /// Offset of FP from nominal-SP. + pub(crate) nominal_sp_to_fp: i64, + /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`. + stack_map: Option<StackMap>, + /// Source location of next machine code instruction to be emitted. + cur_srcloc: SourceLoc, +} + +impl MachInstEmitState<Inst> for EmitState { + fn new(abi: &dyn ABICallee<I = Inst>) -> Self { + EmitState { + virtual_sp_offset: 0, + nominal_sp_to_fp: abi.frame_size() as i64, + stack_map: None, + cur_srcloc: SourceLoc::default(), + } + } + + fn pre_safepoint(&mut self, stack_map: StackMap) { + self.stack_map = Some(stack_map); + } + + fn pre_sourceloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } +} + +impl EmitState { + fn take_stack_map(&mut self) -> Option<StackMap> { + self.stack_map.take() + } + + fn clear_post_insn(&mut self) { + self.stack_map = None; + } + + fn cur_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } +} + +pub struct EmitInfo { + flags: settings::Flags, +} + +impl EmitInfo { + pub(crate) fn new(flags: settings::Flags) -> Self { + EmitInfo { flags } + } +} + +impl MachInstEmitInfo for EmitInfo { + fn flags(&self) -> &settings::Flags { + &self.flags + } +} + +impl MachInstEmit for Inst { + type Info = EmitInfo; + type State = EmitState; + type UnwindInfo = super::unwind::Arm32UnwindInfo; + + fn emit(&self, sink: &mut MachBuffer<Inst>, emit_info: &Self::Info, state: &mut EmitState) { + let start_off = sink.cur_offset(); + + match self { + &Inst::Nop0 | &Inst::EpiloguePlaceholder => {} + &Inst::Nop2 => { + sink.put2(0b1011_1111_0000_0000); + } + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let (bits_31_20, bits_15_12, bits_7_4) = match alu_op { + ALUOp::Lsl => (0b111110100000, 0b1111, 0b0000), + ALUOp::Lsr => (0b111110100010, 0b1111, 0b0000), + ALUOp::Asr => (0b111110100100, 0b1111, 0b0000), + ALUOp::Ror => (0b111110100110, 0b1111, 0b0000), + ALUOp::Qadd => (0b111110101000, 0b1111, 0b1000), + ALUOp::Qsub => (0b111110101000, 0b1111, 0b1010), + ALUOp::Mul => (0b111110110000, 0b1111, 0b0000), + ALUOp::Udiv => (0b111110111011, 0b1111, 0b1111), + ALUOp::Sdiv => (0b111110111001, 0b1111, 0b1111), + _ => panic!("Invalid ALUOp {:?} in RRR form!", alu_op), + }; + emit_32( + enc_32_rrr(bits_31_20, bits_15_12, bits_7_4, rd.to_reg(), rm, rn), + sink, + ); + } + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shift, + } => { + let bits_31_24 = 
0b111_0101; + let bits_24_20 = match alu_op { + ALUOp::And => 0b00000, + ALUOp::Bic => 0b00010, + ALUOp::Orr => 0b00100, + ALUOp::Orn => 0b00110, + ALUOp::Eor => 0b01000, + ALUOp::Add => 0b10000, + ALUOp::Adds => 0b10001, + ALUOp::Adc => 0b10100, + ALUOp::Adcs => 0b10101, + ALUOp::Sbc => 0b10110, + ALUOp::Sbcs => 0b10111, + ALUOp::Sub => 0b11010, + ALUOp::Subs => 0b11011, + ALUOp::Rsb => 0b11100, + _ => panic!("Invalid ALUOp {:?} in RRRShift form!", alu_op), + }; + let bits_31_20 = (bits_31_24 << 5) | bits_24_20; + let inst = enc_32_rrr(bits_31_20, 0, 0, rd.to_reg(), rm, rn); + let inst = enc_32_reg_shift(inst, shift); + emit_32(inst, sink); + } + &Inst::AluRRShift { + alu_op, + rd, + rm, + ref shift, + } => { + let bits_24_21 = match alu_op { + ALUOp1::Mvn => 0b0011, + ALUOp1::Mov => 0b0010, + }; + let inst = 0b1110101_0000_0_1111_0_000_0000_00_00_0000 | (bits_24_21 << 21); + let inst = enc_32_regs(inst, Some(rm), Some(rd.to_reg()), None, None); + let inst = enc_32_reg_shift(inst, shift); + emit_32(inst, sink); + } + &Inst::AluRRRR { + alu_op, + rd_hi, + rd_lo, + rn, + rm, + } => { + let (bits_22_20, bits_7_4) = match alu_op { + ALUOp::Smull => (0b000, 0b0000), + ALUOp::Umull => (0b010, 0b0000), + _ => panic!("Invalid ALUOp {:?} in RRRR form!", alu_op), + }; + let inst = (0b111110111 << 23) | (bits_22_20 << 20) | (bits_7_4 << 4); + let inst = enc_32_regs( + inst, + Some(rm), + Some(rd_hi.to_reg()), + Some(rd_lo.to_reg()), + Some(rn), + ); + emit_32(inst, sink); + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + imm12, + } => { + let bits_24_20 = match alu_op { + ALUOp::Add => 0b00000, + ALUOp::Sub => 0b01010, + _ => panic!("Invalid ALUOp {:?} in RRImm12 form!", alu_op), + }; + let inst = (0b11110_0_1 << 25) | (bits_24_20 << 20); + let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, Some(rn)); + let inst = enc_32_imm12(inst, imm12); + emit_32(inst, sink); + } + &Inst::AluRRImm8 { + alu_op, + rd, + rn, + imm8, + } => { + let bits_24_20 = match alu_op { + ALUOp::And => 0b00000, + ALUOp::Bic => 0b00010, + ALUOp::Orr => 0b00100, + ALUOp::Orn => 0b00110, + ALUOp::Eor => 0b01000, + ALUOp::Add => 0b10000, + ALUOp::Adds => 0b10001, + ALUOp::Adc => 0b10100, + ALUOp::Adcs => 0b10101, + ALUOp::Sbc => 0b10110, + ALUOp::Sbcs => 0b10111, + ALUOp::Sub => 0b11010, + ALUOp::Subs => 0b11011, + ALUOp::Rsb => 0b11100, + _ => panic!("Invalid ALUOp {:?} in RRImm8 form!", alu_op), + }; + let imm8 = imm8.bits(); + let inst = 0b11110_0_0_00000_0000_0_000_0000_00000000 | imm8 | (bits_24_20 << 20); + let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, Some(rn)); + emit_32(inst, sink); + } + &Inst::AluRImm8 { alu_op, rd, imm8 } => { + let bits_24_20 = match alu_op { + ALUOp1::Mvn => 0b00110, + ALUOp1::Mov => 0b00100, + }; + let imm8 = imm8.bits(); + let inst = 0b11110_0_0_00000_1111_0_000_0000_00000000 | imm8 | (bits_24_20 << 20); + let inst = enc_32_regs(inst, None, Some(rd.to_reg()), None, None); + emit_32(inst, sink); + } + &Inst::BitOpRR { bit_op, rd, rm } => { + let (bits_22_20, bits_7_4) = match bit_op { + BitOp::Rbit => (0b001, 0b1010), + BitOp::Rev => (0b001, 0b1000), + BitOp::Clz => (0b011, 0b1000), + }; + let inst = + 0b111110101_000_0000_1111_0000_0000_0000 | (bits_22_20 << 20) | (bits_7_4 << 4); + let inst = enc_32_regs(inst, Some(rm), Some(rd.to_reg()), None, Some(rm)); + emit_32(inst, sink); + } + &Inst::Mov { rd, rm } => { + sink.put2(enc_16_mov(rd, rm)); + } + &Inst::MovImm16 { rd, imm16 } => { + emit_32(enc_32_r_imm16(0b11110_0_100100, rd.to_reg(), imm16), sink); + } + &Inst::Movt { 
rd, imm16 } => { + emit_32(enc_32_r_imm16(0b11110_0_101100, rd.to_reg(), imm16), sink); + } + &Inst::Cmp { rn, rm } => { + // Check which 16-bit encoding is allowed. + if machreg_is_lo(rn) && machreg_is_lo(rm) { + sink.put2(enc_16_rr(0b0100001010, rn, rm)); + } else { + sink.put2(enc_16_rr_any(0b01000101, rn, rm)); + } + } + &Inst::CmpImm8 { rn, imm8 } => { + let inst = 0b11110_0_011011_0000_0_000_1111_00000000 | u32::from(imm8); + let inst = enc_32_regs(inst, None, None, None, Some(rn)); + emit_32(inst, sink); + } + &Inst::Store { rt, ref mem, bits } => { + let (mem_insts, mem) = mem_finalize(mem, state); + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + // Register the offset at which the store instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + match mem { + AMode::RegReg(rn, rm, imm2) => { + let bits_24_20 = match bits { + 32 => 0b00100, + 16 => 0b00010, + 8 => 0b00000, + _ => panic!("Unsupported store case {:?}", self), + }; + emit_32(enc_32_mem_r(bits_24_20, rt, rn, rm, imm2), sink); + } + AMode::RegOffset12(rn, off12) => { + let bits_24_20 = match bits { + 32 => 0b01100, + 16 => 0b01010, + 8 => 0b01000, + _ => panic!("Unsupported store case {:?}", self), + }; + emit_32(enc_32_mem_off12(bits_24_20, rt, rn, off12), sink); + } + AMode::PCRel(_) => panic!("Unsupported store case {:?}", self), + _ => unreachable!(), + } + } + &Inst::Load { + rt, + ref mem, + bits, + sign_extend, + } => { + let (mem_insts, mem) = mem_finalize(mem, state); + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() { + // Register the offset at which the load instruction starts. 
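+                    // The buffer keeps this (offset, srcloc) pair as trap metadata, so a
+                    // heap-out-of-bounds fault raised by this access can be reported against
+                    // the original source location.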
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + match mem { + AMode::RegReg(rn, rm, imm2) => { + let bits_24_20 = match (bits, sign_extend) { + (32, _) => 0b00101, + (16, true) => 0b10011, + (16, false) => 0b00011, + (8, true) => 0b10001, + (8, false) => 0b00001, + _ => panic!("Unsupported load case {:?}", self), + }; + emit_32(enc_32_mem_r(bits_24_20, rt.to_reg(), rn, rm, imm2), sink); + } + AMode::RegOffset12(rn, off12) => { + let bits_24_20 = match (bits, sign_extend) { + (32, _) => 0b01101, + (16, true) => 0b11011, + (16, false) => 0b01011, + (8, true) => 0b11001, + (8, false) => 0b01001, + _ => panic!("Unsupported load case {:?}", self), + }; + emit_32(enc_32_mem_off12(bits_24_20, rt.to_reg(), rn, off12), sink); + } + AMode::PCRel(off12) => { + let mut bits_24_20 = match (bits, sign_extend) { + (32, _) => 0b00101, + (16, true) => 0b10011, + (16, false) => 0b00011, + (8, true) => 0b10001, + (8, false) => 0b00001, + _ => panic!("Unsupported load case {:?}", self), + }; + let (u, off12) = if off12 > 0 { (1, off12) } else { (0, -off12) }; + let off12 = UImm12::maybe_from_i64(i64::from(off12)).unwrap(); + bits_24_20 |= u << 3; + + emit_32( + enc_32_mem_off12(bits_24_20, rt.to_reg(), pc_reg(), off12), + sink, + ); + } + _ => unreachable!(), + } + } + &Inst::LoadAddr { rd, ref mem } => { + let (mem_insts, mem) = mem_finalize(mem, state); + for inst in mem_insts.into_iter() { + inst.emit(sink, emit_info, state); + } + let inst = match mem { + AMode::RegReg(reg1, reg2, shift) => { + let shift = u32::from(shift); + let shift_amt = ShiftOpShiftImm::maybe_from_shift(shift).unwrap(); + let shift = ShiftOpAndAmt::new(ShiftOp::LSL, shift_amt); + Inst::AluRRRShift { + alu_op: ALUOp::Add, + rd, + rn: reg1, + rm: reg2, + shift: Some(shift), + } + } + AMode::RegOffset12(reg, imm12) => Inst::AluRRImm12 { + alu_op: ALUOp::Add, + rd, + rn: reg, + imm12, + }, + AMode::PCRel(off12) => { + let (off12, alu_op) = if off12 > 0 { + (off12, ALUOp::Add) + } else { + (-off12, ALUOp::Sub) + }; + let imm12 = UImm12::maybe_from_i64(i64::from(off12)).unwrap(); + Inst::AluRRImm12 { + alu_op, + rd, + rn: pc_reg(), + imm12, + } + } + _ => unreachable!(), + }; + inst.emit(sink, emit_info, state); + } + &Inst::Extend { + rd, + rm, + from_bits, + signed, + } if from_bits >= 8 => { + let rd = rd.to_reg(); + if machreg_is_lo(rd) && machreg_is_lo(rm) { + let bits_15_9 = match (from_bits, signed) { + (16, true) => 0b1011001000, + (16, false) => 0b1011001010, + (8, true) => 0b1011001001, + (8, false) => 0b1011001011, + _ => panic!("Unsupported Extend case: {:?}", self), + }; + sink.put2(enc_16_rr(bits_15_9, rd, rm)); + } else { + let bits_22_20 = match (from_bits, signed) { + (16, true) => 0b000, + (16, false) => 0b001, + (8, true) => 0b100, + (8, false) => 0b101, + _ => panic!("Unsupported Extend case: {:?}", self), + }; + let inst = 0b111110100_000_11111111_0000_1000_0000 | (bits_22_20 << 20); + let inst = enc_32_regs(inst, Some(rm), Some(rd), None, None); + emit_32(inst, sink); + } + } + &Inst::Extend { + rd, + rm, + from_bits, + signed, + } if from_bits == 1 => { + let inst = Inst::AluRRImm8 { + alu_op: ALUOp::And, + rd, + rn: rm, + imm8: UImm8::maybe_from_i64(1).unwrap(), + }; + inst.emit(sink, emit_info, state); + + if signed { + let inst = Inst::AluRRImm8 { + alu_op: ALUOp::Rsb, + rd, + rn: rd.to_reg(), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }; + inst.emit(sink, emit_info, state); + } + } + &Inst::Extend { .. 
} => { + panic!("Unsupported extend variant"); + } + &Inst::It { cond, ref insts } => { + assert!(1 <= insts.len() && insts.len() <= 4); + assert!(insts[0].then); + + sink.put2(enc_16_it(cond, insts)); + for inst in insts.iter() { + inst.inst.emit(sink, emit_info, state); + } + } + &Inst::Push { ref reg_list } => match reg_list.len() { + 0 => panic!("Unsupported Push case: {:?}", self), + 1 => { + let reg = u32::from(machreg_to_gpr(reg_list[0])); + let inst: u32 = 0b1111100001001101_0000_110100000100 | (reg << 12); + emit_32(inst, sink); + } + _ => { + let mut inst: u32 = 0b1110100100101101 << 16; + for reg in reg_list { + inst |= 1 << machreg_to_gpr(*reg); + } + if inst & ((1 << 13) | (1 << 15)) != 0 { + panic!("Unsupported Push case: {:?}", self); + } + emit_32(inst, sink); + } + }, + &Inst::Pop { ref reg_list } => match reg_list.len() { + 0 => panic!("Unsupported Pop case: {:?}", self), + 1 => { + let reg = u32::from(machreg_to_gpr(reg_list[0].to_reg())); + let inst: u32 = 0b1111100001011101_0000_101100000100 | (reg << 12); + emit_32(inst, sink); + } + _ => { + let mut inst: u32 = 0b1110100010111101 << 16; + for reg in reg_list { + inst |= 1 << machreg_to_gpr(reg.to_reg()); + } + if (inst & (1 << 14) != 0) && (inst & (1 << 15) != 0) { + panic!("Unsupported Pop case: {:?}", self); + } + emit_32(inst, sink); + } + }, + &Inst::Call { ref info } => { + let srcloc = state.cur_srcloc(); + sink.add_reloc(srcloc, Reloc::Arm32Call, &info.dest, 0); + emit_32(0b11110_0_0000000000_11_0_1_0_00000000000, sink); + if info.opcode.is_call() { + sink.add_call_site(srcloc, info.opcode); + } + } + &Inst::CallInd { ref info } => { + let srcloc = state.cur_srcloc(); + sink.put2(0b01000111_1_0000_000 | (machreg_to_gpr(info.rm) << 3)); + if info.opcode.is_call() { + sink.add_call_site(srcloc, info.opcode); + } + } + &Inst::LoadExtName { + rt, + ref name, + offset, + } => { + // maybe nop2 (0|2) bytes (pc is now 4-aligned) + // ldr rt, [pc, #4] 4 bytes + // b continue 4 bytes + // addr 4 bytes + // continue: + // + if start_off & 0x3 != 0 { + Inst::Nop2.emit(sink, emit_info, state); + } + assert_eq!(sink.cur_offset() & 0x3, 0); + + let mem = AMode::PCRel(4); + let inst = Inst::Load { + rt, + mem, + bits: 32, + sign_extend: false, + }; + inst.emit(sink, emit_info, state); + + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(4), + }; + inst.emit(sink, emit_info, state); + + let srcloc = state.cur_srcloc(); + sink.add_reloc(srcloc, Reloc::Abs4, name, offset.into()); + sink.put4(0); + } + &Inst::Ret => { + sink.put2(0b010001110_1110_000); // bx lr + } + &Inst::Jump { dest } => { + let off = sink.cur_offset(); + // Indicate that the jump uses a label, if so, so that a fixup can occur later. + if let Some(l) = dest.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch24); + sink.add_uncond_branch(off, off + 4, l); + } + emit_32(enc_32_jump(dest), sink); + } + &Inst::CondBr { + taken, + not_taken, + cond, + } => { + // Conditional part first. + let cond_off = sink.cur_offset(); + if let Some(l) = taken.as_label() { + let label_use = LabelUse::Branch20; + sink.use_label_at_offset(cond_off, l, label_use); + let inverted = enc_32_cond_branch(cond.invert(), taken); + let inverted = u32_swap_halfwords(inverted).to_le_bytes(); + sink.add_cond_branch(cond_off, cond_off + 4, l, &inverted[..]); + } + emit_32(enc_32_cond_branch(cond, taken), sink); + + // Unconditional part. 
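+                // As with the conditional half, register the label use so the 24-bit offset is
+                // fixed up once the target is known; recording it via `add_uncond_branch` also
+                // allows the buffer to elide the jump if `not_taken` ends up being the
+                // fall-through block.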
+ let uncond_off = sink.cur_offset(); + if let Some(l) = not_taken.as_label() { + sink.use_label_at_offset(uncond_off, l, LabelUse::Branch24); + sink.add_uncond_branch(uncond_off, uncond_off + 4, l); + } + emit_32(enc_32_jump(not_taken), sink); + } + &Inst::IndirectBr { rm, .. } => { + let inst = 0b010001110_0000_000 | (machreg_to_gpr(rm) << 3); + sink.put2(inst); + } + &Inst::Udf { trap_info } => { + let srcloc = state.cur_srcloc(); + let code = trap_info; + sink.add_trap(srcloc, code); + sink.put2(0b11011110_00000000); + } + &Inst::Bkpt => { + sink.put2(0b10111110_00000000); + } + &Inst::TrapIf { cond, trap_info } => { + let cond = cond.invert(); + let dest = BranchTarget::ResolvedOffset(2); + emit_32(enc_32_cond_branch(cond, dest), sink); + + let trap = Inst::Udf { trap_info }; + trap.emit(sink, emit_info, state); + } + &Inst::VirtualSPOffsetAdj { offset } => { + debug!( + "virtual sp offset adjusted by {} -> {}", + offset, + state.virtual_sp_offset + offset, + ); + state.virtual_sp_offset += offset; + } + } + + let end_off = sink.cur_offset(); + debug_assert!((end_off - start_off) <= Inst::worst_case_size()); + } + + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String { + self.print_with_state(mb_rru, state) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs new file mode 100644 index 0000000000..73269be999 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/emit_tests.rs @@ -0,0 +1,1959 @@ +use crate::isa::arm32::inst::*; +use crate::isa::test_utils; +use crate::settings; + +use alloc::vec::Vec; + +#[test] +fn test_arm32_emit() { + let flags = settings::Flags::new(settings::builder()); + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // litle endian order + insns.push((Inst::Nop0, "", "nop-zero-len")); + insns.push((Inst::Nop2, "00BF", "nop")); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "01FA02F0", + "lsl r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "09FA0AF8", + "lsl r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "21FA02F0", + "lsr r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "29FA0AF8", + "lsr r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "41FA02F0", + "asr r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "49FA0AF8", + "asr r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Ror, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "61FA02F0", + "ror r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Ror, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "69FA0AF8", + "ror r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Qadd, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "81FA82F0", + "qadd r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Qadd, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "89FA8AF8", + "qadd r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Qsub, + 
rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "81FAA2F0", + "qsub r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Qsub, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "89FAAAF8", + "qsub r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Mul, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "01FB02F0", + "mul r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Mul, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "09FB0AF8", + "mul r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Udiv, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "B1FBF2F0", + "udiv r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Udiv, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "B9FBFAF8", + "udiv r8, r9, r10", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sdiv, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + }, + "91FBF2F0", + "sdiv r0, r1, r2", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sdiv, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + }, + "99FBFAF8", + "sdiv r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "01EAC250", + "and r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "09EA0A08", + "and r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Bic, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "21EAC250", + "bic r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Bic, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "29EA0A08", + "bic r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "41EAC250", + "orr r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "49EA0A08", + "orr r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orn, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "61EAC250", + "orn r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orn, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "69EA0A08", + "orn r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "81EAC250", + "eor r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "89EA0A08", + "eor r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: 
Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "01EBC250", + "add r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "09EB0A08", + "add r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adds, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "11EBC250", + "adds r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adds, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "19EB0A08", + "adds r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adc, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "41EBC250", + "adc r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adc, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "49EB0A08", + "adc r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adcs, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "51EBC250", + "adcs r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Adcs, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "59EB0A08", + "adcs r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sbc, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "61EBC250", + "sbc r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sbc, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "69EB0A08", + "sbc r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sbcs, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "71EBC250", + "sbcs r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sbcs, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "79EB0A08", + "sbcs r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "A1EBC250", + "sub r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "A9EB0A08", + "sub r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Subs, + rd: writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "B1EBC250", + "subs r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Subs, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "B9EB0A08", + "subs r8, r9, r10", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Rsb, + rd: 
writable_rreg(0), + rn: rreg(1), + rm: rreg(2), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + )), + }, + "C1EBC250", + "rsb r0, r1, r2, lsl #23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Rsb, + rd: writable_rreg(8), + rn: rreg(9), + rm: rreg(10), + shift: None, + }, + "C9EB0A08", + "rsb r8, r9, r10", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mvn, + rd: writable_rreg(0), + rm: rreg(1), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(11).unwrap(), + )), + }, + "6FEAC120", + "mvn r0, r1, lsl #11", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mvn, + rd: writable_rreg(8), + rm: rreg(9), + shift: None, + }, + "6FEA0908", + "mvn r8, r9", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mov, + rd: writable_rreg(0), + rm: rreg(1), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(11).unwrap(), + )), + }, + "4FEAC120", + "mov r0, r1, lsl #11", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mov, + rd: writable_rreg(2), + rm: rreg(8), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::LSR, + ShiftOpShiftImm::maybe_from_shift(27).unwrap(), + )), + }, + "4FEAD862", + "mov r2, r8, lsr #27", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mov, + rd: writable_rreg(9), + rm: rreg(3), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::ASR, + ShiftOpShiftImm::maybe_from_shift(3).unwrap(), + )), + }, + "4FEAE309", + "mov r9, r3, asr #3", + )); + insns.push(( + Inst::AluRRShift { + alu_op: ALUOp1::Mov, + rd: writable_rreg(10), + rm: rreg(11), + shift: Some(ShiftOpAndAmt::new( + ShiftOp::ROR, + ShiftOpShiftImm::maybe_from_shift(7).unwrap(), + )), + }, + "4FEAFB1A", + "mov r10, fp, ror #7", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::Smull, + rd_lo: writable_rreg(0), + rd_hi: writable_rreg(1), + rn: rreg(2), + rm: rreg(3), + }, + "82FB0301", + "smull r0, r1, r2, r3", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::Smull, + rd_lo: writable_rreg(8), + rd_hi: writable_rreg(9), + rn: rreg(10), + rm: rreg(11), + }, + "8AFB0B89", + "smull r8, r9, r10, fp", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::Umull, + rd_lo: writable_rreg(0), + rd_hi: writable_rreg(1), + rn: rreg(2), + rm: rreg(3), + }, + "A2FB0301", + "umull r0, r1, r2, r3", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::Umull, + rd_lo: writable_rreg(8), + rd_hi: writable_rreg(9), + rn: rreg(10), + rm: rreg(11), + }, + "AAFB0B89", + "umull r8, r9, r10, fp", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add, + rd: writable_rreg(0), + rn: rreg(1), + imm12: UImm12::maybe_from_i64(4095).unwrap(), + }, + "01F6FF70", + "add r0, r1, #4095", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add, + rd: writable_rreg(8), + rn: rreg(9), + imm12: UImm12::maybe_from_i64(0).unwrap(), + }, + "09F20008", + "add r8, r9, #0", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub, + rd: writable_rreg(0), + rn: rreg(1), + imm12: UImm12::maybe_from_i64(1999).unwrap(), + }, + "A1F2CF70", + "sub r0, r1, #1999", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub, + rd: writable_rreg(8), + rn: rreg(9), + imm12: UImm12::maybe_from_i64(101).unwrap(), + }, + "A9F26508", + "sub r8, r9, #101", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::And, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "01F0FF00", + "and r0, r1, #255", + )); + insns.push(( + 
Inst::AluRRImm8 { + alu_op: ALUOp::And, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "09F00108", + "and r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Bic, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "21F0FF00", + "bic r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Bic, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "29F00108", + "bic r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Orr, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "41F0FF00", + "orr r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Orr, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "49F00108", + "orr r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Orn, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "61F0FF00", + "orn r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Orn, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "69F00108", + "orn r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Eor, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "81F0FF00", + "eor r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Eor, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "89F00108", + "eor r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Add, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "01F1FF00", + "add r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Add, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "09F10108", + "add r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adds, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "11F1FF00", + "adds r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adds, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "19F10108", + "adds r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adc, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "41F1FF00", + "adc r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adc, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "49F10108", + "adc r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adcs, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "51F1FF00", + "adcs r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Adcs, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "59F10108", + "adcs r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Sbc, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "61F1FF00", + "sbc r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Sbc, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "69F10108", + "sbc r8, r9, #1", + )); + insns.push(( + 
Inst::AluRRImm8 { + alu_op: ALUOp::Sbcs, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "71F1FF00", + "sbcs r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Sbcs, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "79F10108", + "sbcs r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Sub, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "A1F1FF00", + "sub r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Sub, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "A9F10108", + "sub r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Subs, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "B1F1FF00", + "subs r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Subs, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "B9F10108", + "subs r8, r9, #1", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Rsb, + rd: writable_rreg(0), + rn: rreg(1), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "C1F1FF00", + "rsb r0, r1, #255", + )); + insns.push(( + Inst::AluRRImm8 { + alu_op: ALUOp::Rsb, + rd: writable_rreg(8), + rn: rreg(9), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "C9F10108", + "rsb r8, r9, #1", + )); + insns.push(( + Inst::AluRImm8 { + alu_op: ALUOp1::Mvn, + rd: writable_rreg(0), + imm8: UImm8::maybe_from_i64(255).unwrap(), + }, + "6FF0FF00", + "mvn r0, #255", + )); + insns.push(( + Inst::AluRImm8 { + alu_op: ALUOp1::Mvn, + rd: writable_rreg(8), + imm8: UImm8::maybe_from_i64(1).unwrap(), + }, + "6FF00108", + "mvn r8, #1", + )); + insns.push(( + Inst::AluRImm8 { + alu_op: ALUOp1::Mov, + rd: writable_rreg(0), + imm8: UImm8::maybe_from_i64(0).unwrap(), + }, + "4FF00000", + "mov r0, #0", + )); + insns.push(( + Inst::AluRImm8 { + alu_op: ALUOp1::Mov, + rd: writable_rreg(8), + imm8: UImm8::maybe_from_i64(176).unwrap(), + }, + "4FF0B008", + "mov r8, #176", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Rbit, + rd: writable_rreg(0), + rm: rreg(1), + }, + "91FAA1F0", + "rbit r0, r1", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Rbit, + rd: writable_rreg(8), + rm: rreg(9), + }, + "99FAA9F8", + "rbit r8, r9", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Rev, + rd: writable_rreg(0), + rm: rreg(1), + }, + "91FA81F0", + "rev r0, r1", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Rev, + rd: writable_rreg(8), + rm: rreg(9), + }, + "99FA89F8", + "rev r8, r9", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Clz, + rd: writable_rreg(0), + rm: rreg(1), + }, + "B1FA81F0", + "clz r0, r1", + )); + insns.push(( + Inst::BitOpRR { + bit_op: BitOp::Clz, + rd: writable_rreg(8), + rm: rreg(9), + }, + "B9FA89F8", + "clz r8, r9", + )); + insns.push(( + Inst::Mov { + rd: writable_rreg(0), + rm: rreg(1), + }, + "0846", + "mov r0, r1", + )); + insns.push(( + Inst::Mov { + rd: writable_rreg(2), + rm: rreg(8), + }, + "4246", + "mov r2, r8", + )); + insns.push(( + Inst::Mov { + rd: writable_rreg(9), + rm: rreg(3), + }, + "9946", + "mov r9, r3", + )); + insns.push(( + Inst::Mov { + rd: writable_rreg(10), + rm: rreg(11), + }, + "DA46", + "mov r10, fp", + )); + insns.push(( + Inst::MovImm16 { + rd: writable_rreg(0), + imm16: 0, + }, + "40F20000", + "mov r0, #0", + )); + insns.push(( + Inst::MovImm16 { + rd: writable_rreg(1), + 
imm16: 15, + }, + "40F20F01", + "mov r1, #15", + )); + insns.push(( + Inst::MovImm16 { + rd: writable_rreg(2), + imm16: 255, + }, + "40F2FF02", + "mov r2, #255", + )); + insns.push(( + Inst::MovImm16 { + rd: writable_rreg(8), + imm16: 4095, + }, + "40F6FF78", + "mov r8, #4095", + )); + insns.push(( + Inst::MovImm16 { + rd: writable_rreg(9), + imm16: 65535, + }, + "4FF6FF79", + "mov r9, #65535", + )); + insns.push(( + Inst::Movt { + rd: writable_rreg(0), + imm16: 0, + }, + "C0F20000", + "movt r0, #0", + )); + insns.push(( + Inst::Movt { + rd: writable_rreg(1), + imm16: 15, + }, + "C0F20F01", + "movt r1, #15", + )); + insns.push(( + Inst::Movt { + rd: writable_rreg(2), + imm16: 255, + }, + "C0F2FF02", + "movt r2, #255", + )); + insns.push(( + Inst::Movt { + rd: writable_rreg(8), + imm16: 4095, + }, + "C0F6FF78", + "movt r8, #4095", + )); + insns.push(( + Inst::Movt { + rd: writable_rreg(9), + imm16: 65535, + }, + "CFF6FF79", + "movt r9, #65535", + )); + insns.push(( + Inst::Cmp { + rn: rreg(0), + rm: rreg(1), + }, + "8842", + "cmp r0, r1", + )); + insns.push(( + Inst::Cmp { + rn: rreg(2), + rm: rreg(8), + }, + "4245", + "cmp r2, r8", + )); + insns.push(( + Inst::Cmp { + rn: rreg(9), + rm: rreg(3), + }, + "9945", + "cmp r9, r3", + )); + insns.push(( + Inst::Cmp { + rn: rreg(10), + rm: rreg(11), + }, + "DA45", + "cmp r10, fp", + )); + insns.push(( + Inst::CmpImm8 { + rn: rreg(0), + imm8: 255, + }, + "B0F1FF0F", + "cmp r0, #255", + )); + insns.push(( + Inst::CmpImm8 { + rn: rreg(1), + imm8: 0, + }, + "B1F1000F", + "cmp r1, #0", + )); + insns.push(( + Inst::CmpImm8 { + rn: rreg(8), + imm8: 1, + }, + "B8F1010F", + "cmp r8, #1", + )); + + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 32, + }, + "41F80200", + "str r0, [r1, r2]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 3), + bits: 32, + }, + "49F83A80", + "str r8, [r9, r10, lsl #3]", + )); + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::RegOffset(rreg(1), 4095), + bits: 32, + }, + "C1F8FF0F", + "str r0, [r1, #4095]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::RegOffset(rreg(9), 0), + bits: 32, + }, + "C9F80080", + "str r8, [r9, #0]", + )); + insns.push(( + Inst::Store { + rt: rreg(7), + mem: AMode::RegOffset(rreg(11), 65535), + bits: 32, + }, + "4FF6FF7C4BF80C70", + "mov ip, #65535 ; str r7, [fp, ip]", + )); + insns.push(( + Inst::Store { + rt: rreg(10), + mem: AMode::RegOffset(rreg(4), 16777215), + bits: 32, + }, + "4FF6FF7CC0F2FF0C44F80CA0", + "mov ip, #65535 ; movt ip, #255 ; str r10, [r4, ip]", + )); + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 16, + }, + "21F80200", + "strh r0, [r1, r2]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 2), + bits: 16, + }, + "29F82A80", + "strh r8, [r9, r10, lsl #2]", + )); + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::RegOffset(rreg(1), 3210), + bits: 16, + }, + "A1F88A0C", + "strh r0, [r1, #3210]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::RegOffset(rreg(9), 1), + bits: 16, + }, + "A9F80180", + "strh r8, [r9, #1]", + )); + insns.push(( + Inst::Store { + rt: rreg(7), + mem: AMode::RegOffset(rreg(11), 65535), + bits: 16, + }, + "4FF6FF7C2BF80C70", + "mov ip, #65535 ; strh r7, [fp, ip]", + )); + insns.push(( + Inst::Store { + rt: rreg(10), + mem: AMode::RegOffset(rreg(4), 16777215), + bits: 16, + }, + 
"4FF6FF7CC0F2FF0C24F80CA0", + "mov ip, #65535 ; movt ip, #255 ; strh r10, [r4, ip]", + )); + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 8, + }, + "01F80200", + "strb r0, [r1, r2]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 1), + bits: 8, + }, + "09F81A80", + "strb r8, [r9, r10, lsl #1]", + )); + insns.push(( + Inst::Store { + rt: rreg(0), + mem: AMode::RegOffset(rreg(1), 4), + bits: 8, + }, + "81F80400", + "strb r0, [r1, #4]", + )); + insns.push(( + Inst::Store { + rt: rreg(8), + mem: AMode::RegOffset(rreg(9), 777), + bits: 8, + }, + "89F80983", + "strb r8, [r9, #777]", + )); + insns.push(( + Inst::Store { + rt: rreg(7), + mem: AMode::RegOffset(rreg(11), 65535), + bits: 8, + }, + "4FF6FF7C0BF80C70", + "mov ip, #65535 ; strb r7, [fp, ip]", + )); + insns.push(( + Inst::Store { + rt: rreg(10), + mem: AMode::RegOffset(rreg(4), 16777215), + bits: 8, + }, + "4FF6FF7CC0F2FF0C04F80CA0", + "mov ip, #65535 ; movt ip, #255 ; strb r10, [r4, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 32, + sign_extend: false, + }, + "51F80200", + "ldr r0, [r1, r2]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 1), + bits: 32, + sign_extend: false, + }, + "59F81A80", + "ldr r8, [r9, r10, lsl #1]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::RegOffset(rreg(1), 55), + bits: 32, + sign_extend: false, + }, + "D1F83700", + "ldr r0, [r1, #55]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::RegOffset(rreg(9), 1234), + bits: 32, + sign_extend: false, + }, + "D9F8D284", + "ldr r8, [r9, #1234]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(7), + mem: AMode::RegOffset(rreg(11), 9876), + bits: 32, + sign_extend: false, + }, + "42F2946C5BF80C70", + "mov ip, #9876 ; ldr r7, [fp, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(10), + mem: AMode::RegOffset(rreg(4), 252645135), + bits: 32, + sign_extend: false, + }, + "40F60F7CC0F60F7C54F80CA0", + "mov ip, #3855 ; movt ip, #3855 ; ldr r10, [r4, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::PCRel(-56), + bits: 32, + sign_extend: false, + }, + "5FF83800", + "ldr r0, [pc, #-56]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::PCRel(1024), + bits: 32, + sign_extend: false, + }, + "DFF80084", + "ldr r8, [pc, #1024]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 16, + sign_extend: true, + }, + "31F90200", + "ldrsh r0, [r1, r2]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 2), + bits: 16, + sign_extend: false, + }, + "39F82A80", + "ldrh r8, [r9, r10, lsl #2]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::RegOffset(rreg(1), 55), + bits: 16, + sign_extend: false, + }, + "B1F83700", + "ldrh r0, [r1, #55]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::RegOffset(rreg(9), 1234), + bits: 16, + sign_extend: true, + }, + "B9F9D284", + "ldrsh r8, [r9, #1234]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(7), + mem: AMode::RegOffset(rreg(11), 9876), + bits: 16, + sign_extend: true, + }, + "42F2946C3BF90C70", + "mov ip, #9876 ; ldrsh r7, [fp, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(10), + mem: 
AMode::RegOffset(rreg(4), 252645135), + bits: 16, + sign_extend: false, + }, + "40F60F7CC0F60F7C34F80CA0", + "mov ip, #3855 ; movt ip, #3855 ; ldrh r10, [r4, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::PCRel(56), + bits: 16, + sign_extend: false, + }, + "BFF83800", + "ldrh r0, [pc, #56]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::PCRel(-1000), + bits: 16, + sign_extend: true, + }, + "3FF9E883", + "ldrsh r8, [pc, #-1000]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::reg_plus_reg(rreg(1), rreg(2), 0), + bits: 8, + sign_extend: true, + }, + "11F90200", + "ldrsb r0, [r1, r2]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::reg_plus_reg(rreg(9), rreg(10), 3), + bits: 8, + sign_extend: false, + }, + "19F83A80", + "ldrb r8, [r9, r10, lsl #3]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::RegOffset(rreg(1), 55), + bits: 8, + sign_extend: false, + }, + "91F83700", + "ldrb r0, [r1, #55]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::RegOffset(rreg(9), 1234), + bits: 8, + sign_extend: true, + }, + "99F9D284", + "ldrsb r8, [r9, #1234]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(7), + mem: AMode::RegOffset(rreg(11), 9876), + bits: 8, + sign_extend: true, + }, + "42F2946C1BF90C70", + "mov ip, #9876 ; ldrsb r7, [fp, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(10), + mem: AMode::RegOffset(rreg(4), 252645135), + bits: 8, + sign_extend: false, + }, + "40F60F7CC0F60F7C14F80CA0", + "mov ip, #3855 ; movt ip, #3855 ; ldrb r10, [r4, ip]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(0), + mem: AMode::PCRel(72), + bits: 8, + sign_extend: false, + }, + "9FF84800", + "ldrb r0, [pc, #72]", + )); + insns.push(( + Inst::Load { + rt: writable_rreg(8), + mem: AMode::PCRel(-1234), + bits: 8, + sign_extend: true, + }, + "1FF9D284", + "ldrsb r8, [pc, #-1234]", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(0), + rm: rreg(1), + from_bits: 16, + signed: false, + }, + "88B2", + "uxth r0, r1", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(8), + rm: rreg(9), + from_bits: 16, + signed: false, + }, + "1FFA89F8", + "uxth r8, r9", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(0), + rm: rreg(1), + from_bits: 8, + signed: false, + }, + "C8B2", + "uxtb r0, r1", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(8), + rm: rreg(9), + from_bits: 8, + signed: false, + }, + "5FFA89F8", + "uxtb r8, r9", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(0), + rm: rreg(1), + from_bits: 16, + signed: true, + }, + "08B2", + "sxth r0, r1", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(8), + rm: rreg(9), + from_bits: 16, + signed: true, + }, + "0FFA89F8", + "sxth r8, r9", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(0), + rm: rreg(1), + from_bits: 8, + signed: true, + }, + "48B2", + "sxtb r0, r1", + )); + insns.push(( + Inst::Extend { + rd: writable_rreg(8), + rm: rreg(9), + from_bits: 8, + signed: true, + }, + "4FFA89F8", + "sxtb r8, r9", + )); + insns.push(( + Inst::It { + cond: Cond::Eq, + insts: vec![CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true)], + }, + "08BF0046", + "it eq ; mov r0, r0", + )); + insns.push(( + Inst::It { + cond: Cond::Ne, + insts: vec![ + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false), + ], + }, + "14BF00460046", + "ite ne ; mov r0, r0 ; mov r0, 
r0", + )); + insns.push(( + Inst::It { + cond: Cond::Lt, + insts: vec![ + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true), + ], + }, + "B6BF004600460046", + "itet lt ; mov r0, r0 ; mov r0, r0 ; mov r0, r0", + )); + insns.push(( + Inst::It { + cond: Cond::Hs, + insts: vec![ + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), true), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false), + CondInst::new(Inst::mov(writable_rreg(0), rreg(0)), false), + ], + }, + "27BF0046004600460046", + "ittee hs ; mov r0, r0 ; mov r0, r0 ; mov r0, r0 ; mov r0, r0", + )); + insns.push(( + Inst::Push { + reg_list: vec![rreg(0)], + }, + "4DF8040D", + "push {r0}", + )); + insns.push(( + Inst::Push { + reg_list: vec![rreg(8)], + }, + "4DF8048D", + "push {r8}", + )); + insns.push(( + Inst::Push { + reg_list: vec![rreg(0), rreg(1), rreg(2), rreg(6), rreg(8)], + }, + "2DE94701", + "push {r0, r1, r2, r6, r8}", + )); + insns.push(( + Inst::Push { + reg_list: vec![rreg(8), rreg(9), rreg(10)], + }, + "2DE90007", + "push {r8, r9, r10}", + )); + insns.push(( + Inst::Pop { + reg_list: vec![writable_rreg(0)], + }, + "5DF8040B", + "pop {r0}", + )); + insns.push(( + Inst::Pop { + reg_list: vec![writable_rreg(8)], + }, + "5DF8048B", + "pop {r8}", + )); + insns.push(( + Inst::Pop { + reg_list: vec![ + writable_rreg(0), + writable_rreg(1), + writable_rreg(2), + writable_rreg(6), + writable_rreg(8), + ], + }, + "BDE84701", + "pop {r0, r1, r2, r6, r8}", + )); + insns.push(( + Inst::Pop { + reg_list: vec![writable_rreg(8), writable_rreg(9), writable_rreg(10)], + }, + "BDE80007", + "pop {r8, r9, r10}", + )); + insns.push(( + Inst::Call { + info: Box::new(CallInfo { + dest: ExternalName::testcase("test0"), + uses: Vec::new(), + defs: Vec::new(), + loc: SourceLoc::default(), + opcode: Opcode::Call, + }), + }, + "00F000D0", + "bl 0", + )); + insns.push(( + Inst::CallInd { + info: Box::new(CallIndInfo { + rm: rreg(0), + uses: Vec::new(), + defs: Vec::new(), + loc: SourceLoc::default(), + opcode: Opcode::CallIndirect, + }), + }, + "8047", + "blx r0", + )); + insns.push(( + Inst::CallInd { + info: Box::new(CallIndInfo { + rm: rreg(8), + uses: Vec::new(), + defs: Vec::new(), + loc: SourceLoc::default(), + opcode: Opcode::CallIndirect, + }), + }, + "C047", + "blx r8", + )); + insns.push((Inst::Ret, "7047", "bx lr")); + insns.push(( + Inst::Jump { + dest: BranchTarget::ResolvedOffset(32), + }, + "00F010B8", + "b 32", + )); + insns.push(( + Inst::Jump { + dest: BranchTarget::ResolvedOffset(0xfffff4), + }, + "FFF3FA97", + "b 16777204", + )); + insns.push(( + Inst::CondBr { + taken: BranchTarget::ResolvedOffset(20), + not_taken: BranchTarget::ResolvedOffset(68), + cond: Cond::Eq, + }, + "00F00A8000F022B8", + "beq 20 ; b 68", + )); + insns.push(( + Inst::CondBr { + taken: BranchTarget::ResolvedOffset(6), + not_taken: BranchTarget::ResolvedOffset(100), + cond: Cond::Gt, + }, + "00F3038000F032B8", + "bgt 6 ; b 100", + )); + insns.push(( + Inst::IndirectBr { + rm: rreg(0), + targets: vec![], + }, + "0047", + "bx r0", + )); + insns.push(( + Inst::IndirectBr { + rm: rreg(8), + targets: vec![], + }, + "4047", + "bx r8", + )); + insns.push(( + Inst::TrapIf { + cond: Cond::Eq, + trap_info: TrapCode::Interrupt, + }, + "40F0018000DE", + "bne 2 ; udf #0", + )); + insns.push(( + Inst::TrapIf { + cond: Cond::Hs, + trap_info: TrapCode::Interrupt, + }, + 
"C0F0018000DE", + "blo 2 ; udf #0", + )); + insns.push(( + Inst::Udf { + trap_info: TrapCode::Interrupt, + }, + "00DE", + "udf #0", + )); + insns.push((Inst::Bkpt, "00BE", "bkpt #0")); + + // ======================================================== + // Run the tests + let rru = regs::create_reg_universe(); + for (insn, expected_encoding, expected_printing) in insns { + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + let mut sink = test_utils::TestCodeSink::new(); + let mut buffer = MachBuffer::new(); + insn.emit(&mut buffer, &flags, &mut Default::default()); + let buffer = buffer.finish(); + buffer.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs new file mode 100644 index 0000000000..fff01b7d82 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/mod.rs @@ -0,0 +1,1358 @@ +//! This module defines 32-bit ARM specific machine instruction types. + +#![allow(dead_code)] + +use crate::binemit::CodeOffset; +use crate::ir::types::{B1, B16, B32, B8, I16, I32, I8, IFLAGS}; +use crate::ir::{ExternalName, Opcode, TrapCode, Type}; +use crate::machinst::*; +use crate::{settings, CodegenError, CodegenResult}; + +use regalloc::{PrettyPrint, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; +use regalloc::{RegUsageCollector, RegUsageMapper}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use smallvec::{smallvec, SmallVec}; +use std::string::{String, ToString}; + +mod args; +pub use self::args::*; +mod emit; +pub use self::emit::*; +mod regs; +pub use self::regs::*; +pub mod unwind; + +#[cfg(test)] +mod emit_tests; + +//============================================================================= +// Instructions (top level): definition + +/// An ALU operation. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp { + Add, + Adds, + Adc, + Adcs, + Qadd, + Sub, + Subs, + Sbc, + Sbcs, + Rsb, + Qsub, + Mul, + Smull, + Umull, + Udiv, + Sdiv, + And, + Orr, + Orn, + Eor, + Bic, + Lsl, + Lsr, + Asr, + Ror, +} + +/// An ALU operation with one argument. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp1 { + Mvn, + Mov, +} + +/// An operation on the bits of a register. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BitOp { + Rbit, + Rev, + Clz, +} + +/// Additional information for (direct) Call instructions, left out of line to lower the size of +/// the Inst enum. +#[derive(Clone, Debug)] +pub struct CallInfo { + pub dest: ExternalName, + pub uses: Vec<Reg>, + pub defs: Vec<Writable<Reg>>, + pub opcode: Opcode, +} + +/// Additional information for CallInd instructions, left out of line to lower the size of the Inst +/// enum. +#[derive(Clone, Debug)] +pub struct CallIndInfo { + pub rm: Reg, + pub uses: Vec<Reg>, + pub defs: Vec<Writable<Reg>>, + pub opcode: Opcode, +} + +/// Instruction formats. +#[derive(Clone, Debug)] +pub enum Inst { + /// A no-op of zero size. + Nop0, + + /// A no-op that is two bytes large. + Nop2, + + /// An ALU operation with two register sources and one register destination. 
+ AluRRR { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + }, + + /// An ALU operation with two register sources, one of which can be optionally shifted + /// and one register destination. + AluRRRShift { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + rm: Reg, + shift: Option<ShiftOpAndAmt>, + }, + + /// An ALU operation with one register source, which can be optionally shifted + /// and one register destination. + AluRRShift { + alu_op: ALUOp1, + rd: Writable<Reg>, + rm: Reg, + shift: Option<ShiftOpAndAmt>, + }, + + /// An ALU operation with two register sources and two register destinations. + AluRRRR { + alu_op: ALUOp, + rd_hi: Writable<Reg>, + rd_lo: Writable<Reg>, + rn: Reg, + rm: Reg, + }, + + /// An ALU operation with a register source and a 12-bit immediate source, + /// and a register destination. + AluRRImm12 { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + imm12: UImm12, + }, + + /// An ALU operation with a register source and a 8-bit immediate source, + /// and a register destination. + /// + /// In fact these instructions take a `modified immediate constant` operand, + /// which is encoded as a 12-bit immediate. The only case used here + /// is when high 4 bits of that 12-immediate are zeros. + /// In this case operand is simple 8-bit immediate. + /// For all possible operands see + /// https://static.docs.arm.com/ddi0406/c/DDI0406C_C_arm_architecture_reference_manual.pdf#G10.4954509 + AluRRImm8 { + alu_op: ALUOp, + rd: Writable<Reg>, + rn: Reg, + imm8: UImm8, + }, + + /// An ALU operation with a 8-bit immediate and a register destination. + /// See `AluRRImm8` description above. + AluRImm8 { + alu_op: ALUOp1, + rd: Writable<Reg>, + imm8: UImm8, + }, + + /// A bit operation with a register source and a register destination. + BitOpRR { + bit_op: BitOp, + rd: Writable<Reg>, + rm: Reg, + }, + + /// A mov instruction with a GPR source and a GPR destination. + Mov { + rd: Writable<Reg>, + rm: Reg, + }, + + /// A move instruction with a 16-bit immediate source and a register destination. + MovImm16 { + rd: Writable<Reg>, + imm16: u16, + }, + + /// A move top instruction, which writes 16-bit immediate to the top + /// halfword of the destination register. + Movt { + rd: Writable<Reg>, + imm16: u16, + }, + + /// A compare instruction with two register arguments. + Cmp { + rn: Reg, + rm: Reg, + }, + + /// A compare instruction with a register operand and a 8-bit immediate operand. + CmpImm8 { + rn: Reg, + imm8: u8, + }, + + /// A store instruction, which stores to memory 8, 16 or 32-bit operand. + Store { + rt: Reg, + mem: AMode, + bits: u8, + }, + + /// A load instruction, which loads from memory 8, 16 or 32-bit operand, + /// which can be sign- or zero-extended. + Load { + rt: Writable<Reg>, + mem: AMode, + bits: u8, + sign_extend: bool, + }, + + /// Load address referenced by `mem` into `rd`. + LoadAddr { + rd: Writable<Reg>, + mem: AMode, + }, + + /// A sign- or zero-extend operation. + Extend { + rd: Writable<Reg>, + rm: Reg, + from_bits: u8, + signed: bool, + }, + + // An If-Then instruction, which makes up to four instructions conditinal. + It { + cond: Cond, + insts: Vec<CondInst>, + }, + + /// A push instuction, which stores registers to the stack and updates sp. + Push { + reg_list: Vec<Reg>, + }, + + /// A pop instuction, which load registers from the stack and updates sp. + Pop { + reg_list: Vec<Writable<Reg>>, + }, + + /// A machine call instruction. + Call { + info: Box<CallInfo>, + }, + + /// A machine indirect-call instruction. 
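+    /// The callee address is taken from `info.rm`; this is emitted as a 16-bit `blx`.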
+ CallInd { + info: Box<CallIndInfo>, + }, + + /// Load an inline symbol reference. + LoadExtName { + rt: Writable<Reg>, + name: Box<ExternalName>, + offset: i32, + }, + + /// A return instruction, which is encoded as `bx lr`. + Ret, + + /// An unconditional branch. + Jump { + dest: BranchTarget, + }, + + /// A conditional branch. + CondBr { + taken: BranchTarget, + not_taken: BranchTarget, + cond: Cond, + }, + + /// An indirect branch through a register, augmented with set of all + /// possible successors. + IndirectBr { + rm: Reg, + targets: Vec<MachLabel>, + }, + + /// A conditional trap: execute a `udf` if the condition is true. This is + /// one VCode instruction because it uses embedded control flow; it is + /// logically a single-in, single-out region, but needs to appear as one + /// unit to the register allocator. + TrapIf { + cond: Cond, + trap_info: TrapCode, + }, + + /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at + /// runtime. + Udf { + trap_info: TrapCode, + }, + + /// A "breakpoint" instruction, used for e.g. traps and debug breakpoints. + Bkpt, + + /// Marker, no-op in generated code: SP "virtual offset" is adjusted. + VirtualSPOffsetAdj { + offset: i64, + }, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder, +} + +/// An instruction inside an it block. +#[derive(Clone, Debug)] +pub struct CondInst { + inst: Inst, + // In which case execute the instruction: + // true => when it condition is met + // false => otherwise. + then: bool, +} + +impl CondInst { + pub fn new(inst: Inst, then: bool) -> Self { + match inst { + Inst::It { .. } + | Inst::Ret { .. } + | Inst::Jump { .. } + | Inst::CondBr { .. } + | Inst::TrapIf { .. } + | Inst::EpiloguePlaceholder { .. } + | Inst::LoadExtName { .. } => panic!("Instruction {:?} cannot occur in it block", inst), + _ => Self { inst, then }, + } + } +} + +impl Inst { + /// Create a move instruction. + pub fn mov(to_reg: Writable<Reg>, from_reg: Reg) -> Inst { + Inst::Mov { + rd: to_reg, + rm: from_reg, + } + } + + /// Create an instruction that loads a constant. + pub fn load_constant(rd: Writable<Reg>, value: u32) -> SmallVec<[Inst; 4]> { + let mut insts = smallvec![]; + let imm_lo = (value & 0xffff) as u16; + let imm_hi = (value >> 16) as u16; + + if imm_lo != 0 || imm_hi == 0 { + // imm_lo == 0 && imm_hi == 0 => we have to overwrite reg value with 0 + insts.push(Inst::MovImm16 { rd, imm16: imm_lo }); + } + if imm_hi != 0 { + insts.push(Inst::Movt { rd, imm16: imm_hi }); + } + + insts + } + + /// Generic constructor for a load (zero-extending where appropriate). + pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type) -> Inst { + assert!(ty.bits() <= 32); + // Load 8 bits for B1. + let bits = std::cmp::max(ty.bits(), 8) as u8; + + Inst::Load { + rt: into_reg, + mem, + bits, + sign_extend: false, + } + } + + /// Generic constructor for a store. + pub fn gen_store(from_reg: Reg, mem: AMode, ty: Type) -> Inst { + assert!(ty.bits() <= 32); + // Store 8 bits for B1. + let bits = std::cmp::max(ty.bits(), 8) as u8; + + Inst::Store { + rt: from_reg, + mem, + bits, + } + } +} + +//============================================================================= +// Instructions: get_regs + +fn memarg_regs(memarg: &AMode, collector: &mut RegUsageCollector) { + match memarg { + &AMode::RegReg(rn, rm, ..) => { + collector.add_use(rn); + collector.add_use(rm); + } + &AMode::RegOffset12(rn, ..) 
| &AMode::RegOffset(rn, _) => { + collector.add_use(rn); + } + &AMode::SPOffset(..) | &AMode::NominalSPOffset(..) => { + collector.add_use(sp_reg()); + } + &AMode::FPOffset(..) => { + collector.add_use(fp_reg()); + } + &AMode::PCRel(_) => {} + } +} + +fn arm32_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + match inst { + &Inst::Nop0 + | &Inst::Nop2 + | &Inst::Ret + | &Inst::VirtualSPOffsetAdj { .. } + | &Inst::EpiloguePlaceholder + | &Inst::Jump { .. } + | &Inst::CondBr { .. } + | &Inst::Bkpt + | &Inst::Udf { .. } + | &Inst::TrapIf { .. } => {} + &Inst::AluRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRShift { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRShift { rd, rm, .. } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::AluRRRR { + rd_hi, + rd_lo, + rn, + rm, + .. + } => { + collector.add_def(rd_hi); + collector.add_def(rd_lo); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRImm12 { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImm8 { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRImm8 { rd, .. } => { + collector.add_def(rd); + } + &Inst::BitOpRR { rd, rm, .. } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::Mov { rd, rm, .. } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::MovImm16 { rd, .. } => { + collector.add_def(rd); + } + &Inst::Movt { rd, .. } => { + collector.add_def(rd); + } + &Inst::Cmp { rn, rm } => { + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::CmpImm8 { rn, .. } => { + collector.add_use(rn); + } + &Inst::Store { rt, ref mem, .. } => { + collector.add_use(rt); + memarg_regs(mem, collector); + } + &Inst::Load { rt, ref mem, .. } => { + collector.add_def(rt); + memarg_regs(mem, collector); + } + &Inst::LoadAddr { rd, mem: _ } => { + collector.add_def(rd); + } + &Inst::Extend { rd, rm, .. } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::It { ref insts, .. } => { + for inst in insts.iter() { + arm32_get_regs(&inst.inst, collector); + } + } + &Inst::Push { ref reg_list } => { + for reg in reg_list { + collector.add_use(*reg); + } + } + &Inst::Pop { ref reg_list } => { + for reg in reg_list { + collector.add_def(*reg); + } + } + &Inst::Call { ref info, .. } => { + collector.add_uses(&*info.uses); + collector.add_defs(&*info.defs); + } + &Inst::CallInd { ref info, .. } => { + collector.add_uses(&*info.uses); + collector.add_defs(&*info.defs); + collector.add_use(info.rm); + } + &Inst::LoadExtName { rt, .. } => { + collector.add_def(rt); + } + &Inst::IndirectBr { rm, .. 
} => { + collector.add_use(rm); + } + } +} + +//============================================================================= +// Instructions: map_regs + +fn arm32_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) { + fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) { + if r.is_virtual() { + let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg(); + *r = new; + } + } + + fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if r.to_reg().is_virtual() { + let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg(); + *r = Writable::from_reg(new); + } + } + + fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if r.to_reg().is_virtual() { + let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg(); + *r = Writable::from_reg(new); + } + } + + fn map_mem<RUM: RegUsageMapper>(m: &RUM, mem: &mut AMode) { + match mem { + &mut AMode::RegReg(ref mut rn, ref mut rm, ..) => { + map_use(m, rn); + map_use(m, rm); + } + &mut AMode::RegOffset12(ref mut rn, ..) | &mut AMode::RegOffset(ref mut rn, ..) => { + map_use(m, rn) + } + &mut AMode::SPOffset(..) + | &mut AMode::FPOffset(..) + | &mut AMode::NominalSPOffset(..) + | &mut AMode::PCRel(_) => {} + }; + } + + match inst { + &mut Inst::Nop0 + | &mut Inst::Nop2 + | &mut Inst::Ret + | &mut Inst::VirtualSPOffsetAdj { .. } + | &mut Inst::EpiloguePlaceholder + | &mut Inst::Jump { .. } + | &mut Inst::CondBr { .. } + | &mut Inst::Bkpt + | &mut Inst::Udf { .. } + | &mut Inst::TrapIf { .. } => {} + &mut Inst::AluRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::AluRRRShift { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::AluRRShift { + ref mut rd, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::AluRRRR { + ref mut rd_hi, + ref mut rd_lo, + ref mut rn, + ref mut rm, + .. + } => { + map_def(mapper, rd_hi); + map_def(mapper, rd_lo); + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::AluRRImm12 { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::AluRRImm8 { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } + &mut Inst::AluRImm8 { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::BitOpRR { + ref mut rd, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::Mov { + ref mut rd, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::MovImm16 { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::Movt { ref mut rd, .. } => { + map_def(mapper, rd); + } + &mut Inst::Cmp { + ref mut rn, + ref mut rm, + } => { + map_use(mapper, rn); + map_use(mapper, rm); + } + &mut Inst::CmpImm8 { ref mut rn, .. } => { + map_use(mapper, rn); + } + &mut Inst::Store { + ref mut rt, + ref mut mem, + .. + } => { + map_use(mapper, rt); + map_mem(mapper, mem); + } + &mut Inst::Load { + ref mut rt, + ref mut mem, + .. + } => { + map_def(mapper, rt); + map_mem(mapper, mem); + } + &mut Inst::LoadAddr { + ref mut rd, + ref mut mem, + } => { + map_def(mapper, rd); + map_mem(mapper, mem); + } + &mut Inst::Extend { + ref mut rd, + ref mut rm, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rm); + } + &mut Inst::It { ref mut insts, .. 
} => { + for inst in insts.iter_mut() { + arm32_map_regs(&mut inst.inst, mapper); + } + } + &mut Inst::Push { ref mut reg_list } => { + for reg in reg_list { + map_use(mapper, reg); + } + } + &mut Inst::Pop { ref mut reg_list } => { + for reg in reg_list { + map_def(mapper, reg); + } + } + &mut Inst::Call { ref mut info } => { + for r in info.uses.iter_mut() { + map_use(mapper, r); + } + for r in info.defs.iter_mut() { + map_def(mapper, r); + } + } + &mut Inst::CallInd { ref mut info, .. } => { + for r in info.uses.iter_mut() { + map_use(mapper, r); + } + for r in info.defs.iter_mut() { + map_def(mapper, r); + } + map_use(mapper, &mut info.rm); + } + &mut Inst::LoadExtName { ref mut rt, .. } => { + map_def(mapper, rt); + } + &mut Inst::IndirectBr { ref mut rm, .. } => { + map_use(mapper, rm); + } + } +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + type LabelUse = LabelUse; + + fn get_regs(&self, collector: &mut RegUsageCollector) { + arm32_get_regs(self, collector) + } + + fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + arm32_map_regs(self, mapper); + } + + fn is_move(&self) -> Option<(Writable<Reg>, Reg)> { + match self { + &Inst::Mov { rd, rm } => Some((rd, rm)), + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Inst::EpiloguePlaceholder = self { + true + } else { + false + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()), + &Inst::CondBr { + taken, not_taken, .. + } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()), + &Inst::IndirectBr { ref targets, .. 
} => MachTerminator::Indirect(&targets[..]), + _ => MachTerminator::None, + } + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, _ty: Type) -> Inst { + assert_eq!(from_reg.get_class(), RegClass::I32); + assert_eq!(to_reg.to_reg().get_class(), from_reg.get_class()); + + Inst::mov(to_reg, from_reg) + } + + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + _alloc_tmp: F, + ) -> SmallVec<[Inst; 4]> { + match ty { + B1 | I8 | B8 | I16 | B16 | I32 | B32 => { + let v: i64 = value as i64; + + if v >= (1 << 32) || v < -(1 << 32) { + panic!("Cannot load constant value {}", value) + } + Inst::load_constant(to_reg, value as u32) + } + _ => unimplemented!(), + } + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop0 + } + + fn gen_nop(preferred_size: usize) -> Inst { + assert!(preferred_size >= 2); + Inst::Nop2 + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> { + None + } + + fn rc_for_type(ty: Type) -> CodegenResult<RegClass> { + match ty { + I8 | I16 | I32 | B1 | B8 | B16 | B32 => Ok(RegClass::I32), + IFLAGS => Ok(RegClass::I32), + _ => Err(CodegenError::Unsupported(format!( + "Unexpected SSA-value type: {}", + ty + ))), + } + } + + fn gen_jump(target: MachLabel) -> Inst { + Inst::Jump { + dest: BranchTarget::Label(target), + } + } + + fn reg_universe(_flags: &settings::Flags) -> RealRegUniverse { + create_reg_universe() + } + + fn worst_case_size() -> CodeOffset { + // It inst with four 32-bit instructions + 2 + 4 * 4 + } + + fn ref_type_regclass(_: &settings::Flags) -> RegClass { + RegClass::I32 + } +} + +//============================================================================= +// Pretty-printing of instructions. + +fn mem_finalize_for_show( + mem: &AMode, + mb_rru: Option<&RealRegUniverse>, + state: &EmitState, +) -> (String, AMode) { + let (mem_insts, mem) = mem_finalize(mem, state); + let mut mem_str = mem_insts + .into_iter() + .map(|inst| inst.show_rru(mb_rru)) + .collect::<Vec<_>>() + .join(" ; "); + if !mem_str.is_empty() { + mem_str += " ; "; + } + + (mem_str, mem) +} + +impl PrettyPrint for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.pretty_print(mb_rru, &mut EmitState::default()) + } +} + +impl Inst { + fn print_with_state(&self, mb_rru: Option<&RealRegUniverse>, state: &mut EmitState) -> String { + fn op_name(alu_op: ALUOp) -> &'static str { + match alu_op { + ALUOp::Add => "add", + ALUOp::Adds => "adds", + ALUOp::Adc => "adc", + ALUOp::Adcs => "adcs", + ALUOp::Qadd => "qadd", + ALUOp::Sub => "sub", + ALUOp::Subs => "subs", + ALUOp::Sbc => "sbc", + ALUOp::Sbcs => "sbcs", + ALUOp::Rsb => "rsb", + ALUOp::Qsub => "qsub", + ALUOp::Mul => "mul", + ALUOp::Smull => "smull", + ALUOp::Umull => "umull", + ALUOp::Udiv => "udiv", + ALUOp::Sdiv => "sdiv", + ALUOp::And => "and", + ALUOp::Orr => "orr", + ALUOp::Orn => "orn", + ALUOp::Eor => "eor", + ALUOp::Bic => "bic", + ALUOp::Lsl => "lsl", + ALUOp::Lsr => "lsr", + ALUOp::Asr => "asr", + ALUOp::Ror => "ror", + } + } + + fn reg_shift_str( + shift: &Option<ShiftOpAndAmt>, + mb_rru: Option<&RealRegUniverse>, + ) -> String { + if let Some(ref shift) = shift { + format!(", {}", shift.show_rru(mb_rru)) + } else { + "".to_string() + } + } + + match self { + &Inst::Nop0 => "nop-zero-len".to_string(), + &Inst::Nop2 => "nop".to_string(), + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let op = op_name(alu_op); + let rd = rd.show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + 
format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shift, + } => { + let op = op_name(alu_op); + let rd = rd.show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + let shift = reg_shift_str(shift, mb_rru); + format!("{} {}, {}, {}{}", op, rd, rn, rm, shift) + } + &Inst::AluRRShift { + alu_op, + rd, + rm, + ref shift, + } => { + let op = match alu_op { + ALUOp1::Mvn => "mvn", + ALUOp1::Mov => "mov", + }; + let rd = rd.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + let shift = reg_shift_str(shift, mb_rru); + format!("{} {}, {}{}", op, rd, rm, shift) + } + &Inst::AluRRRR { + alu_op, + rd_hi, + rd_lo, + rn, + rm, + } => { + let op = op_name(alu_op); + let rd_hi = rd_hi.show_rru(mb_rru); + let rd_lo = rd_lo.show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd_lo, rd_hi, rn, rm) + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + imm12, + } => { + let op = op_name(alu_op); + let rd = rd.show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let imm = imm12.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm) + } + &Inst::AluRRImm8 { + alu_op, + rd, + rn, + imm8, + } => { + let op = op_name(alu_op); + let rd = rd.show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let imm = imm8.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm) + } + &Inst::AluRImm8 { alu_op, rd, imm8 } => { + let op = match alu_op { + ALUOp1::Mvn => "mvn", + ALUOp1::Mov => "mov", + }; + let rd = rd.show_rru(mb_rru); + let imm = imm8.show_rru(mb_rru); + format!("{} {}, {}", op, rd, imm) + } + &Inst::BitOpRR { bit_op, rd, rm } => { + let op = match bit_op { + BitOp::Rbit => "rbit", + BitOp::Rev => "rev", + BitOp::Clz => "clz", + }; + let rd = rd.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("{} {}, {}", op, rd, rm) + } + &Inst::Mov { rd, rm } => { + let rd = rd.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("mov {}, {}", rd, rm) + } + &Inst::MovImm16 { rd, imm16 } => { + let rd = rd.show_rru(mb_rru); + format!("mov {}, #{}", rd, imm16) + } + &Inst::Movt { rd, imm16 } => { + let rd = rd.show_rru(mb_rru); + format!("movt {}, #{}", rd, imm16) + } + &Inst::Cmp { rn, rm } => { + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("cmp {}, {}", rn, rm) + } + &Inst::CmpImm8 { rn, imm8 } => { + let rn = rn.show_rru(mb_rru); + format!("cmp {}, #{}", rn, imm8) + } + &Inst::Store { + rt, ref mem, bits, .. + } => { + let op = match bits { + 32 => "str", + 16 => "strh", + 8 => "strb", + _ => panic!("Invalid bit amount {}", bits), + }; + let rt = rt.show_rru(mb_rru); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rt, mem) + } + &Inst::Load { + rt, + ref mem, + bits, + sign_extend, + .. 
+ } => { + let op = match (bits, sign_extend) { + (32, _) => "ldr", + (16, true) => "ldrsh", + (16, false) => "ldrh", + (8, true) => "ldrsb", + (8, false) => "ldrb", + (_, _) => panic!("Invalid bit amount {}", bits), + }; + let rt = rt.show_rru(mb_rru); + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru, state); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rt, mem) + } + &Inst::LoadAddr { rd, ref mem } => { + let mut ret = String::new(); + let (mem_insts, mem) = mem_finalize(mem, state); + for inst in mem_insts.into_iter() { + ret.push_str(&inst.show_rru(mb_rru)); + } + let inst = match mem { + AMode::RegReg(rn, rm, shift) => { + let shift = u32::from(shift); + let shift_amt = ShiftOpShiftImm::maybe_from_shift(shift).unwrap(); + let shift = ShiftOpAndAmt::new(ShiftOp::LSL, shift_amt); + Inst::AluRRRShift { + alu_op: ALUOp::Add, + rd, + rn, + rm, + shift: Some(shift), + } + } + AMode::RegOffset12(reg, imm12) => Inst::AluRRImm12 { + alu_op: ALUOp::Add, + rd, + rn: reg, + imm12, + }, + _ => unreachable!(), + }; + ret.push_str(&inst.show_rru(mb_rru)); + ret + } + &Inst::Extend { + rd, + rm, + from_bits, + signed, + } => { + let op = match (from_bits, signed) { + (16, true) => "sxth", + (16, false) => "uxth", + (8, true) => "sxtb", + (8, false) => "uxtb", + _ => panic!("Unsupported extend case: {:?}", self), + }; + let rd = rd.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("{} {}, {}", op, rd, rm) + } + &Inst::It { cond, ref insts } => { + let te: String = insts + .iter() + .skip(1) + .map(|i| if i.then { "t" } else { "e" }) + .collect(); + let cond = cond.show_rru(mb_rru); + let mut ret = format!("it{} {}", te, cond); + for inst in insts.into_iter() { + ret.push_str(" ; "); + ret.push_str(&inst.inst.show_rru(mb_rru)); + } + ret + } + &Inst::Push { ref reg_list } => { + assert!(!reg_list.is_empty()); + let first_reg = reg_list[0].show_rru(mb_rru); + let regs: String = reg_list + .iter() + .skip(1) + .map(|r| [",", &r.show_rru(mb_rru)].join(" ")) + .collect(); + format!("push {{{}{}}}", first_reg, regs) + } + &Inst::Pop { ref reg_list } => { + assert!(!reg_list.is_empty()); + let first_reg = reg_list[0].show_rru(mb_rru); + let regs: String = reg_list + .iter() + .skip(1) + .map(|r| [",", &r.show_rru(mb_rru)].join(" ")) + .collect(); + format!("pop {{{}{}}}", first_reg, regs) + } + &Inst::Call { .. } => format!("bl 0"), + &Inst::CallInd { ref info, .. } => { + let rm = info.rm.show_rru(mb_rru); + format!("blx {}", rm) + } + &Inst::LoadExtName { + rt, + ref name, + offset, + } => { + let rt = rt.show_rru(mb_rru); + format!("ldr {}, [pc, #4] ; b 4 ; data {:?} + {}", rt, name, offset) + } + &Inst::Ret => "bx lr".to_string(), + &Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), + &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), + &Inst::Jump { ref dest } => { + let dest = dest.show_rru(mb_rru); + format!("b {}", dest) + } + &Inst::CondBr { + ref taken, + ref not_taken, + ref cond, + } => { + let taken = taken.show_rru(mb_rru); + let not_taken = not_taken.show_rru(mb_rru); + let c = cond.show_rru(mb_rru); + format!("b{} {} ; b {}", c, taken, not_taken) + } + &Inst::IndirectBr { rm, .. } => { + let rm = rm.show_rru(mb_rru); + format!("bx {}", rm) + } + &Inst::Udf { .. } => "udf #0".to_string(), + &Inst::Bkpt => "bkpt #0".to_string(), + &Inst::TrapIf { cond, .. 
} => { + let c = cond.invert().show_rru(mb_rru); + format!("b{} 2 ; udf #0", c) + } + } + } +} + +//============================================================================= +// Label fixups and jump veneers. + +/// Different forms of label references for different instruction formats. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// 20-bit branch offset used by 32-bit conditional jumps. + Branch20, + + /// 24-bit branch offset used by 32-bit uncoditional jump instruction. + Branch24, +} + +impl MachInstLabelUse for LabelUse { + /// Alignment for veneer code. Every instruction must be 4-byte-aligned. + const ALIGN: CodeOffset = 2; + + // Branches range: + // 20-bit sign-extended immediate gives us range [-(2^19), 2^19 - 1]. + // Left-shifted by 1 => [-(2^20), 2^20 - 2]. + // PC is start of this instruction + 4 bytes => [-(2^20) + 4, 2^20 + 2]. + // Likewise for Branch24. + + /// Maximum PC-relative range (positive), inclusive. + fn max_pos_range(self) -> CodeOffset { + match self { + LabelUse::Branch20 => (1 << 20) + 2, + LabelUse::Branch24 => (1 << 24) + 2, + } + } + + /// Maximum PC-relative range (negative). + fn max_neg_range(self) -> CodeOffset { + match self { + LabelUse::Branch20 => (1 << 20) - 4, + LabelUse::Branch24 => (1 << 24) - 4, + } + } + + /// Size of window into code needed to do the patch. + fn patch_size(self) -> CodeOffset { + 4 + } + + /// Perform the patch. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let off = (label_offset as i64) - (use_offset as i64); + debug_assert!(off <= self.max_pos_range() as i64); + debug_assert!(off >= -(self.max_neg_range() as i64)); + let off = off - 4; + match self { + LabelUse::Branch20 => { + let off = off as u32 >> 1; + let imm11 = (off & 0x7ff) as u16; + let imm6 = ((off >> 11) & 0x3f) as u16; + let j1 = ((off >> 17) & 0x1) as u16; + let j2 = ((off >> 18) & 0x1) as u16; + let s = ((off >> 19) & 0x1) as u16; + let insn_fst = u16::from_le_bytes([buffer[0], buffer[1]]); + let insn_fst = (insn_fst & !0x43f) | imm6 | (s << 10); + let insn_snd = u16::from_le_bytes([buffer[2], buffer[3]]); + let insn_snd = (insn_snd & !0x2fff) | imm11 | (j2 << 11) | (j1 << 13); + buffer[0..2].clone_from_slice(&u16::to_le_bytes(insn_fst)); + buffer[2..4].clone_from_slice(&u16::to_le_bytes(insn_snd)); + } + LabelUse::Branch24 => { + let off = off as u32 >> 1; + let imm11 = (off & 0x7ff) as u16; + let imm10 = ((off >> 11) & 0x3ff) as u16; + let s = ((off >> 23) & 0x1) as u16; + let j1 = (((off >> 22) & 0x1) as u16 ^ s) ^ 0x1; + let j2 = (((off >> 21) & 0x1) as u16 ^ s) ^ 0x1; + let insn_fst = u16::from_le_bytes([buffer[0], buffer[1]]); + let insn_fst = (insn_fst & !0x07ff) | imm10 | (s << 10); + let insn_snd = u16::from_le_bytes([buffer[2], buffer[3]]); + let insn_snd = (insn_snd & !0x2fff) | imm11 | (j2 << 11) | (j1 << 13); + buffer[0..2].clone_from_slice(&u16::to_le_bytes(insn_fst)); + buffer[2..4].clone_from_slice(&u16::to_le_bytes(insn_snd)); + } + } + } + + fn supports_veneer(self) -> bool { + false + } + + fn veneer_size(self) -> CodeOffset { + 0 + } + + fn generate_veneer( + self, + _buffer: &mut [u8], + _veneer_offset: CodeOffset, + ) -> (CodeOffset, LabelUse) { + panic!("Veneer not supported yet.") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn patch_branch20() { + let label_use = LabelUse::Branch20; + let mut buffer = 0x8000_f000_u32.to_le_bytes(); // beq + let use_offset: CodeOffset = 0; + let label_offset: CodeOffset = label_use.max_pos_range(); + 
label_use.patch(&mut buffer, use_offset, label_offset); + assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf03f); + assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0xafff); + + let mut buffer = 0x8000_f000_u32.to_le_bytes(); // beq + let use_offset = label_use.max_neg_range(); + let label_offset: CodeOffset = 0; + label_use.patch(&mut buffer, use_offset, label_offset); + assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf400); + assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x8000); + } + + #[test] + fn patch_branch24() { + let label_use = LabelUse::Branch24; + let mut buffer = 0x9000_f000_u32.to_le_bytes(); // b + let use_offset: CodeOffset = 0; + let label_offset: CodeOffset = label_use.max_pos_range(); + label_use.patch(&mut buffer, use_offset, label_offset); + assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf3ff); + assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x97ff); + + let mut buffer = 0x9000_f000_u32.to_le_bytes(); // b + let use_offset = label_use.max_neg_range(); + let label_offset: CodeOffset = 0; + label_use.patch(&mut buffer, use_offset, label_offset); + assert_eq!(u16::from_le_bytes([buffer[0], buffer[1]]), 0xf400); + assert_eq!(u16::from_le_bytes([buffer[2], buffer[3]]), 0x9000); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs new file mode 100644 index 0000000000..55df5c8db3 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/regs.rs @@ -0,0 +1,128 @@ +//! 32-bit ARM ISA definitions: registers. + +use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES}; + +use std::string::ToString; + +/// Get a reference to a GPR. +pub fn rreg(num: u8) -> Reg { + assert!(num < 16); + Reg::new_real(RegClass::I32, num, num) +} + +/// Get a writable reference to a GPR. +pub fn writable_rreg(num: u8) -> Writable<Reg> { + Writable::from_reg(rreg(num)) +} + +/// Get a reference to the program counter (r15). +pub fn pc_reg() -> Reg { + rreg(15) +} + +/// Get a writable reference to the program counter. +pub fn writable_pc_reg() -> Writable<Reg> { + Writable::from_reg(pc_reg()) +} + +/// Get a reference to the link register (r14). +pub fn lr_reg() -> Reg { + rreg(14) +} + +/// Get a writable reference to the link register. +pub fn writable_lr_reg() -> Writable<Reg> { + Writable::from_reg(lr_reg()) +} + +/// Get a reference to the stack pointer (r13). +pub fn sp_reg() -> Reg { + rreg(13) +} + +/// Get a writable reference to the stack pointer. +pub fn writable_sp_reg() -> Writable<Reg> { + Writable::from_reg(sp_reg()) +} + +/// Get a reference to the intra-procedure-call scratch register (r12), +/// which is used as a temporary register. +pub fn ip_reg() -> Reg { + rreg(12) +} + +/// Get a writable reference to the Intra-Procedure-call scratch register. +pub fn writable_ip_reg() -> Writable<Reg> { + Writable::from_reg(ip_reg()) +} + +/// Get a reference to the frame pointer register (r11). +pub fn fp_reg() -> Reg { + rreg(11) +} + +/// Get a writable reference to the frame-pointer register. +pub fn writable_fp_reg() -> Writable<Reg> { + Writable::from_reg(fp_reg()) +} + +/// Get a reference to the second temp register. We need this in some edge cases +/// where we need both the ip and another temporary. +/// +/// We use r10 for this role. +pub fn tmp2_reg() -> Reg { + rreg(10) +} + +/// Get a writable reference to the tmp2 reg. 
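+///
+/// A minimal usage sketch (hypothetical, not taken from this file): code that
+/// already uses `ip_reg()` as one scratch register can take `tmp2_reg()` as a
+/// second, distinct scratch:
+/// ```ignore
+/// let scratch0 = writable_ip_reg();   // r12
+/// let scratch1 = writable_tmp2_reg(); // r10
+/// assert_ne!(scratch0.to_reg(), scratch1.to_reg());
+/// ```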
+pub fn writable_tmp2_reg() -> Writable<Reg> { + Writable::from_reg(tmp2_reg()) +} + +/// Create the register universe. +/// Use only GPR for now. +pub fn create_reg_universe() -> RealRegUniverse { + let mut regs = vec![]; + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + let r_reg_base = 0u8; + let r_reg_count = 10; // to exclude r10, fp, ip, sp, lr and pc. + for i in 0..r_reg_count { + let reg = Reg::new_real( + RegClass::I32, + /* enc = */ i, + /* index = */ r_reg_base + i, + ) + .to_real_reg(); + let name = format!("r{}", i); + regs.push((reg, name)); + } + let r_reg_last = r_reg_base + r_reg_count - 1; + + allocable_by_class[RegClass::I32.rc_to_usize()] = Some(RegClassInfo { + first: r_reg_base as usize, + last: r_reg_last as usize, + suggested_scratch: None, + }); + + // Other regs, not available to the allocator. + let allocable = regs.len(); + regs.push((tmp2_reg().to_real_reg(), "r10".to_string())); + regs.push((fp_reg().to_real_reg(), "fp".to_string())); + regs.push((ip_reg().to_real_reg(), "ip".to_string())); + regs.push((sp_reg().to_real_reg(), "sp".to_string())); + regs.push((lr_reg().to_real_reg(), "lr".to_string())); + regs.push((pc_reg().to_real_reg(), "pc".to_string())); + + // The indices in the register structs must match their + // actual indices in the array. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs new file mode 100644 index 0000000000..b9ffeba0cf --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/inst/unwind.rs @@ -0,0 +1,14 @@ +use super::*; +use crate::isa::unwind::input::UnwindInfo; +use crate::result::CodegenResult; + +pub struct Arm32UnwindInfo; + +impl UnwindInfoGenerator<Inst> for Arm32UnwindInfo { + fn create_unwind_info( + _context: UnwindInfoContext<Inst>, + ) -> CodegenResult<Option<UnwindInfo<Reg>>> { + // TODO + Ok(None) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs new file mode 100644 index 0000000000..7c11ae95ba --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/lower.rs @@ -0,0 +1,240 @@ +//! Lowering rules for 32-bit ARM. + +use crate::ir::condcodes::IntCC; +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::{InstructionData, Opcode, TrapCode}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::CodegenResult; + +use crate::isa::arm32::inst::*; +use crate::isa::arm32::Arm32Backend; + +use super::lower_inst; + +use regalloc::{Reg, RegClass, Writable}; + +//============================================================================ +// Lowering: convert instruction outputs to result types. + +/// Lower an instruction output to a 32-bit constant, if possible. +pub(crate) fn output_to_const<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> { + if out.output > 0 { + None + } else { + let inst_data = ctx.data(out.insn); + if inst_data.opcode() == Opcode::Null { + Some(0) + } else { + match inst_data { + &InstructionData::UnaryImm { opcode: _, imm } => { + // Only has Into for i64; we use u64 elsewhere, so we cast. + let imm: i64 = imm.into(); + Some(imm as u64) + } + &InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)), + &InstructionData::UnaryIeee32 { .. 
} | &InstructionData::UnaryIeee64 { .. } => { + unimplemented!() + } + _ => None, + } + } + } +} + +/// How to handle narrow values loaded into registers. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum NarrowValueMode { + None, + /// Zero-extend to 32 bits if original is < 32 bits. + ZeroExtend, + /// Sign-extend to 32 bits if original is < 32 bits. + SignExtend, +} + +/// Lower an instruction output to a reg. +pub(crate) fn output_to_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, out: InsnOutput) -> Writable<Reg> { + ctx.get_output(out.insn, out.output) +} + +/// Lower an instruction input to a reg. +/// +/// The given register will be extended appropriately, according to `narrow_mode`. +pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> Reg { + let ty = ctx.input_ty(input.insn, input.input); + let from_bits = ty.bits() as u8; + let inputs = ctx.get_input(input.insn, input.input); + let in_reg = if let Some(c) = inputs.constant { + let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(to_reg, c, ty, |reg_class, ty| ctx.alloc_tmp(reg_class, ty)) + .into_iter() + { + ctx.emit(inst); + } + to_reg.to_reg() + } else { + ctx.use_input_reg(inputs); + inputs.reg + }; + + match (narrow_mode, from_bits) { + (NarrowValueMode::None, _) => in_reg, + (NarrowValueMode::ZeroExtend, 1) => { + let tmp = ctx.alloc_tmp(RegClass::I32, I32); + ctx.emit(Inst::AluRRImm8 { + alu_op: ALUOp::And, + rd: tmp, + rn: in_reg, + imm8: UImm8::maybe_from_i64(0x1).unwrap(), + }); + tmp.to_reg() + } + (NarrowValueMode::ZeroExtend, n) if n < 32 => { + let tmp = ctx.alloc_tmp(RegClass::I32, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rm: in_reg, + signed: false, + from_bits: n, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend, n) if n < 32 => { + let tmp = ctx.alloc_tmp(RegClass::I32, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rm: in_reg, + signed: true, + from_bits: n, + }); + tmp.to_reg() + } + (NarrowValueMode::ZeroExtend, 32) | (NarrowValueMode::SignExtend, 32) => in_reg, + _ => panic!( + "Unsupported input width: input ty {} bits {} mode {:?}", + ty, from_bits, narrow_mode + ), + } +} + +pub(crate) fn lower_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, rd: Writable<Reg>, value: u64) { + // We allow sign bits for high word. + assert!((value >> 32) == 0x0 || (value >> 32) == (1 << 32) - 1); + + for inst in Inst::load_constant(rd, (value & ((1 << 32) - 1)) as u32) { + ctx.emit(inst); + } +} + +pub(crate) fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) { + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + + ctx.emit(Inst::Cmp { rn, rm }); +} + +pub(crate) fn lower_condcode(cc: IntCC) -> Cond { + match cc { + IntCC::Equal => Cond::Eq, + IntCC::NotEqual => Cond::Ne, + IntCC::SignedGreaterThanOrEqual => Cond::Ge, + IntCC::SignedGreaterThan => Cond::Gt, + IntCC::SignedLessThanOrEqual => Cond::Le, + IntCC::SignedLessThan => Cond::Lt, + IntCC::UnsignedGreaterThanOrEqual => Cond::Hs, + IntCC::UnsignedGreaterThan => Cond::Hi, + IntCC::UnsignedLessThanOrEqual => Cond::Ls, + IntCC::UnsignedLessThan => Cond::Lo, + IntCC::Overflow => Cond::Vs, + IntCC::NotOverflow => Cond::Vc, + } +} + +/// Determines whether this condcode interprets inputs as signed or unsigned. 
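+///
+/// For example (illustrative sketch): the `Signed*` codes and the overflow
+/// codes are treated as signed, while equality and the `Unsigned*` codes are
+/// not:
+/// ```ignore
+/// assert!(condcode_is_signed(IntCC::SignedLessThan));
+/// assert!(condcode_is_signed(IntCC::Overflow));
+/// assert!(!condcode_is_signed(IntCC::UnsignedLessThan));
+/// assert!(!condcode_is_signed(IntCC::Equal));
+/// ```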
+pub(crate) fn condcode_is_signed(cc: IntCC) -> bool { + match cc { + IntCC::Equal => false, + IntCC::NotEqual => false, + IntCC::SignedGreaterThanOrEqual => true, + IntCC::SignedGreaterThan => true, + IntCC::SignedLessThanOrEqual => true, + IntCC::SignedLessThan => true, + IntCC::UnsignedGreaterThanOrEqual => false, + IntCC::UnsignedGreaterThan => false, + IntCC::UnsignedLessThanOrEqual => false, + IntCC::UnsignedLessThan => false, + IntCC::Overflow => true, + IntCC::NotOverflow => true, + } +} + +//============================================================================= +// Helpers for instruction lowering. + +pub(crate) fn ldst_offset(data: &InstructionData) -> Option<i32> { + match data { + &InstructionData::Load { offset, .. } + | &InstructionData::StackLoad { offset, .. } + | &InstructionData::LoadComplex { offset, .. } + | &InstructionData::Store { offset, .. } + | &InstructionData::StackStore { offset, .. } + | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()), + _ => None, + } +} + +pub(crate) fn inst_condcode(data: &InstructionData) -> Option<IntCC> { + match data { + &InstructionData::IntCond { cond, .. } + | &InstructionData::BranchIcmp { cond, .. } + | &InstructionData::IntCompare { cond, .. } + | &InstructionData::IntCondTrap { cond, .. } + | &InstructionData::BranchInt { cond, .. } + | &InstructionData::IntSelect { cond, .. } + | &InstructionData::IntCompareImm { cond, .. } => Some(cond), + _ => None, + } +} + +pub(crate) fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> { + match data { + &InstructionData::Trap { code, .. } + | &InstructionData::CondTrap { code, .. } + | &InstructionData::IntCondTrap { code, .. } => Some(code), + &InstructionData::FloatCondTrap { code, .. } => { + panic!("Unexpected float cond trap {:?}", code) + } + _ => None, + } +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for Arm32Backend { + type MInst = Inst; + + fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_inst::lower_insn_to_regs(ctx, ir_inst) + } + + fn lower_branch_group<C: LowerCtx<I = Inst>>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + lower_inst::lower_branch(ctx, branches, targets, fallthrough) + } + + fn maybe_pinned_reg(&self) -> Option<Reg> { + None + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs new file mode 100644 index 0000000000..05256b2540 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/lower_inst.rs @@ -0,0 +1,608 @@ +//! Lower a single Cranelift instruction into vcode. + +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::Opcode; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::CodegenResult; + +use crate::isa::arm32::abi::*; +use crate::isa::arm32::inst::*; + +use regalloc::RegClass; +use smallvec::SmallVec; + +use super::lower::*; + +/// Actually codegen an instruction's results into registers. 
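+///
+/// The overall shape is: collect `InsnInput`/`InsnOutput` descriptors for the
+/// IR instruction, match on its opcode, and emit one or more machine `Inst`s
+/// through `ctx.emit(..)`. A minimal sketch of a typical two-operand ALU arm
+/// (illustrative only, mirroring the arms below):
+/// ```ignore
+/// let rd = output_to_reg(ctx, outputs[0]);
+/// let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+/// let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
+/// ctx.emit(Inst::AluRRR { alu_op: ALUOp::Mul, rd, rn, rm });
+/// ```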
+pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, +) -> CodegenResult<()> { + let op = ctx.data(insn).opcode(); + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + let ty = if outputs.len() > 0 { + let ty = ctx.output_ty(insn, 0); + if ty.bits() > 32 || ty.is_float() { + panic!("Cannot lower inst with type {}!", ty); + } + Some(ty) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = output_to_const(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant(ctx, rd, value); + } + Opcode::Iadd + | Opcode::IaddIfcin + | Opcode::IaddIfcout + | Opcode::IaddIfcarry + | Opcode::Isub + | Opcode::IsubIfbin + | Opcode::IsubIfbout + | Opcode::IsubIfborrow + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor + | Opcode::BandNot + | Opcode::BorNot => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + + let alu_op = match op { + Opcode::Iadd => ALUOp::Add, + Opcode::IaddIfcin => ALUOp::Adc, + Opcode::IaddIfcout => ALUOp::Adds, + Opcode::IaddIfcarry => ALUOp::Adcs, + Opcode::Isub => ALUOp::Sub, + Opcode::IsubIfbin => ALUOp::Sbc, + Opcode::IsubIfbout => ALUOp::Subs, + Opcode::IsubIfborrow => ALUOp::Sbcs, + Opcode::Band => ALUOp::And, + Opcode::Bor => ALUOp::Orr, + Opcode::Bxor => ALUOp::Eor, + Opcode::BandNot => ALUOp::Bic, + Opcode::BorNot => ALUOp::Orn, + _ => unreachable!(), + }; + ctx.emit(Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + shift: None, + }); + } + Opcode::SaddSat | Opcode::SsubSat | Opcode::Imul | Opcode::Udiv | Opcode::Sdiv => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + + let alu_op = match op { + Opcode::SaddSat => ALUOp::Qadd, + Opcode::SsubSat => ALUOp::Qsub, + Opcode::Imul => ALUOp::Mul, + Opcode::Udiv => ALUOp::Udiv, + Opcode::Sdiv => ALUOp::Sdiv, + _ => unreachable!(), + }; + ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm }); + } + Opcode::Ineg => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + + ctx.emit(Inst::AluRRImm8 { + alu_op: ALUOp::Rsb, + rd, + rn, + imm8: UImm8::maybe_from_i64(0).unwrap(), + }); + } + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { + let (alu_op, ext) = match op { + Opcode::Ishl => (ALUOp::Lsl, NarrowValueMode::None), + Opcode::Ushr => (ALUOp::Lsr, NarrowValueMode::ZeroExtend), + Opcode::Sshr => (ALUOp::Asr, NarrowValueMode::SignExtend), + _ => unreachable!(), + }; + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], ext); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::ZeroExtend); + ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm }); + } + Opcode::Rotr => { + if ty.unwrap().bits() != 32 { + unimplemented!() + } + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Ror, + rd, + rn, + rm, + }); + } + Opcode::Rotl => { + if ty.unwrap().bits() != 32 { + unimplemented!() + } + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, 
inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let tmp = ctx.alloc_tmp(RegClass::I32, I32); + + // ror rd, rn, 32 - (rm & 31) + ctx.emit(Inst::AluRRImm8 { + alu_op: ALUOp::And, + rd: tmp, + rn: rm, + imm8: UImm8::maybe_from_i64(31).unwrap(), + }); + ctx.emit(Inst::AluRRImm8 { + alu_op: ALUOp::Rsb, + rd: tmp, + rn: tmp.to_reg(), + imm8: UImm8::maybe_from_i64(32).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Ror, + rd, + rn, + rm: tmp.to_reg(), + }); + } + Opcode::Smulhi | Opcode::Umulhi => { + let ty = ty.unwrap(); + let is_signed = op == Opcode::Smulhi; + match ty { + I32 => { + let rd_hi = output_to_reg(ctx, outputs[0]); + let rd_lo = ctx.alloc_tmp(RegClass::I32, ty); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + + let alu_op = if is_signed { + ALUOp::Smull + } else { + ALUOp::Umull + }; + ctx.emit(Inst::AluRRRR { + alu_op, + rd_hi, + rd_lo, + rn, + rm, + }); + } + I16 | I8 => { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend + } else { + NarrowValueMode::ZeroExtend + }; + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Mul, + rd, + rn, + rm, + }); + let shift_amt = if ty == I16 { 16 } else { 8 }; + let imm8 = UImm8::maybe_from_i64(shift_amt).unwrap(); + let alu_op = if is_signed { ALUOp::Asr } else { ALUOp::Lsr }; + + ctx.emit(Inst::AluRRImm8 { + alu_op, + rd, + rn: rd.to_reg(), + imm8, + }); + } + _ => panic!("Unexpected type {} in lower {}!", ty, op), + } + } + Opcode::Bnot => { + let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + + ctx.emit(Inst::AluRRShift { + alu_op: ALUOp1::Mvn, + rd, + rm, + shift: None, + }); + } + Opcode::Clz | Opcode::Ctz => { + let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend); + let ty = ctx.output_ty(insn, 0); + + let in_reg = if op == Opcode::Ctz { + ctx.emit(Inst::BitOpRR { + bit_op: BitOp::Rbit, + rd, + rm, + }); + rd.to_reg() + } else { + rm + }; + ctx.emit(Inst::BitOpRR { + bit_op: BitOp::Clz, + rd, + rm: in_reg, + }); + + if ty.bits() < 32 { + let imm12 = UImm12::maybe_from_i64(32 - ty.bits() as i64).unwrap(); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub, + rd, + rn: rd.to_reg(), + imm12, + }); + } + } + Opcode::Bitrev => { + let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bit_op = BitOp::Rbit; + + match ty.bits() { + 32 => ctx.emit(Inst::BitOpRR { bit_op, rd, rm }), + n if n < 32 => { + let shift = ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(32 - n as u32).unwrap(), + ); + ctx.emit(Inst::AluRRShift { + alu_op: ALUOp1::Mov, + rd, + rm, + shift: Some(shift), + }); + ctx.emit(Inst::BitOpRR { + bit_op, + rd, + rm: rd.to_reg(), + }); + } + _ => panic!("Unexpected output type {}", ty), + } + } + Opcode::Icmp | Opcode::Ifcmp => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend + } else { + NarrowValueMode::ZeroExtend + }; + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg(ctx, 
inputs[1], narrow_mode); + + ctx.emit(Inst::Cmp { rn, rm }); + + if op == Opcode::Icmp { + let mut it_insts = vec![]; + it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 1 }, true)); + it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 0 }, false)); + ctx.emit(Inst::It { + cond, + insts: it_insts, + }); + } + } + Opcode::Trueif => { + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let rd = output_to_reg(ctx, outputs[0]); + + let mut it_insts = vec![]; + it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 1 }, true)); + it_insts.push(CondInst::new(Inst::MovImm16 { rd, imm16: 0 }, false)); + + ctx.emit(Inst::It { + cond, + insts: it_insts, + }); + } + Opcode::Select | Opcode::Selectif => { + let cond = if op == Opcode::Select { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend); + ctx.emit(Inst::CmpImm8 { rn, imm8: 0 }); + Cond::Ne + } else { + // Verification ensures that the input is always a single-def ifcmp. + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + lower_condcode(condcode) + }; + let r1 = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let r2 = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + let out_reg = output_to_reg(ctx, outputs[0]); + + let mut it_insts = vec![]; + it_insts.push(CondInst::new(Inst::mov(out_reg, r1), true)); + it_insts.push(CondInst::new(Inst::mov(out_reg, r2), false)); + + ctx.emit(Inst::It { + cond, + insts: it_insts, + }); + } + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Istore8 => I8, + Opcode::Istore16 => I16, + Opcode::Istore32 => I32, + Opcode::Store => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + if elem_ty.bits() > 32 { + unimplemented!() + } + let bits = elem_ty.bits() as u8; + + assert_eq!(inputs.len(), 2, "only one input for store memory operands"); + let rt = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let base = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + + let mem = AMode::RegOffset(base, i64::from(off)); + + ctx.emit(Inst::Store { rt, mem, bits }); + } + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 => I8, + Opcode::Sload16 | Opcode::Uload16 => I16, + Opcode::Sload32 | Opcode::Uload32 => I32, + Opcode::Load => ctx.output_ty(insn, 0), + _ => unreachable!(), + }; + if elem_ty.bits() > 32 { + unimplemented!() + } + let bits = elem_ty.bits() as u8; + + let sign_extend = match op { + Opcode::Sload8 | Opcode::Sload16 | Opcode::Sload32 => true, + _ => false, + }; + let out_reg = output_to_reg(ctx, outputs[0]); + + assert_eq!(inputs.len(), 2, "only one input for store memory operands"); + let base = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let mem = AMode::RegOffset(base, i64::from(off)); + + ctx.emit(Inst::Load { + rt: out_reg, + mem, + bits, + sign_extend, + }); + } + Opcode::Uextend | Opcode::Sextend => { + let output_ty = 
ty.unwrap(); + let input_ty = ctx.input_ty(insn, 0); + let from_bits = input_ty.bits() as u8; + let to_bits = 32; + let signed = op == Opcode::Sextend; + + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + + if output_ty.bits() > 32 { + panic!("Unexpected output type {}", output_ty); + } + if from_bits < to_bits { + ctx.emit(Inst::Extend { + rd, + rm, + from_bits, + signed, + }); + } + } + Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend); + let rd = output_to_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + Opcode::Copy => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + Opcode::Debugtrap => { + ctx.emit(Inst::Bkpt); + } + Opcode::Trap => { + let trap_info = inst_trapcode(ctx.data(insn)).unwrap(); + ctx.emit(Inst::Udf { trap_info }) + } + Opcode::Trapif => { + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let trap_info = inst_trapcode(ctx.data(insn)).unwrap(); + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + + ctx.emit(Inst::TrapIf { cond, trap_info }); + } + Opcode::FallthroughReturn | Opcode::Return => { + for (i, input) in inputs.iter().enumerate() { + let reg = input_to_reg(ctx, *input, NarrowValueMode::None); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + + ctx.emit(Inst::gen_move(retval_reg, reg, ty)); + } + } + Opcode::Call | Opcode::CallIndirect => { + let caller_conv = ctx.abi().call_conv(); + let (mut abi, inputs) = match op { + Opcode::Call => { + let (extname, dist) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len(), sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + Arm32ABICaller::from_func(sig, &extname, dist, caller_conv)?, + &inputs[..], + ) + } + Opcode::CallIndirect => { + let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len() - 1, sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + Arm32ABICaller::from_ptr(sig, ptr, op, caller_conv)?, + &inputs[1..], + ) + } + _ => unreachable!(), + }; + assert_eq!(inputs.len(), abi.num_args()); + for (i, input) in inputs.iter().enumerate().filter(|(i, _)| *i <= 3) { + let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = output_to_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + } + _ => panic!("lowering {} unimplemented!", op), + } + + Ok(()) +} + +pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>( + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, +) -> CodegenResult<()> { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. 
These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + let taken = BranchTarget::Label(targets[0]); + let not_taken = match op1 { + Opcode::Jump => BranchTarget::Label(targets[1]), + Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()), + _ => unreachable!(), // assert above. + }; + match op0 { + Opcode::Brz | Opcode::Brnz => { + let rn = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend, + ); + let cond = if op0 == Opcode::Brz { + Cond::Eq + } else { + Cond::Ne + }; + + ctx.emit(Inst::CmpImm8 { rn, imm8: 0 }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + cond, + }); + } + _ => unimplemented!(), + } + } else { + // Must be an unconditional branch or an indirect branch. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + assert_eq!(branches.len(), 1); + // In the Fallthrough case, the machine-independent driver + // fills in `targets[0]` with our fallthrough block, so this + // is valid for both Jump and Fallthrough. + ctx.emit(Inst::Jump { + dest: BranchTarget::Label(targets[0]), + }); + } + _ => unimplemented!(), + } + } + + Ok(()) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs b/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs new file mode 100644 index 0000000000..4b9701fd1d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/arm32/mod.rs @@ -0,0 +1,123 @@ +//! 32-bit ARM Instruction Set Architecture. + +use crate::ir::condcodes::IntCC; +use crate::ir::Function; +use crate::isa::Builder as IsaBuilder; +use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; +use crate::result::CodegenResult; +use crate::settings; + +use alloc::boxed::Box; +use regalloc::{PrettyPrint, RealRegUniverse}; +use target_lexicon::{Architecture, ArmArchitecture, Triple}; + +// New backend: +mod abi; +mod inst; +mod lower; +mod lower_inst; + +use inst::{create_reg_universe, EmitInfo}; + +/// An ARM32 backend. +pub struct Arm32Backend { + triple: Triple, + flags: settings::Flags, + reg_universe: RealRegUniverse, +} + +impl Arm32Backend { + /// Create a new ARM32 backend with the given (shared) flags. + pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> Arm32Backend { + let reg_universe = create_reg_universe(); + Arm32Backend { + triple, + flags, + reg_universe, + } + } + + fn compile_vcode( + &self, + func: &Function, + flags: settings::Flags, + ) -> CodegenResult<VCode<inst::Inst>> { + // This performs lowering to VCode, register-allocates the code, computes + // block layout and finalizes branches. The result is ready for binary emission. 
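+ // Lowering is driven through the `LowerBackend` impl in `lower.rs`, which
+ // dispatches each IR instruction to `lower_inst::lower_insn_to_regs` and
+ // each block-ending branch group to `lower_inst::lower_branch`.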
+ let emit_info = EmitInfo::new(flags.clone()); + let abi = Box::new(abi::Arm32ABICallee::new(func, flags)?); + compile::compile::<Arm32Backend>(func, self, abi, emit_info) + } +} + +impl MachBackend for Arm32Backend { + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult> { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags.clone())?; + let buffer = vcode.emit(); + let frame_size = vcode.frame_size(); + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe()))) + } else { + None + }; + + let buffer = buffer.finish(); + + Ok(MachCompileResult { + buffer, + frame_size, + disasm, + unwind_info: None, + }) + } + + fn name(&self) -> &'static str { + "arm32" + } + + fn triple(&self) -> Triple { + self.triple.clone() + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn reg_universe(&self) -> &RealRegUniverse { + &self.reg_universe + } + + fn unsigned_add_overflow_condition(&self) -> IntCC { + // Carry flag set. + IntCC::UnsignedGreaterThanOrEqual + } + + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // Carry flag clear. + IntCC::UnsignedLessThan + } +} + +/// Create a new `isa::Builder`. +pub fn isa_builder(triple: Triple) -> IsaBuilder { + assert!(match triple.architecture { + Architecture::Arm(ArmArchitecture::Arm) + | Architecture::Arm(ArmArchitecture::Armv7) + | Architecture::Arm(ArmArchitecture::Armv6) => true, + _ => false, + }); + IsaBuilder { + triple, + setup: settings::builder(), + constructor: |triple, shared_flags, _| { + let backend = Arm32Backend::new_with_flags(triple, shared_flags); + Box::new(TargetIsaAdapter::new(backend)) + }, + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/call_conv.rs b/third_party/rust/cranelift-codegen/src/isa/call_conv.rs new file mode 100644 index 0000000000..61a94e5a43 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/call_conv.rs @@ -0,0 +1,106 @@ +use crate::settings::{self, LibcallCallConv}; +use core::fmt; +use core::str; +use target_lexicon::{CallingConvention, Triple}; + +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +/// Calling convention identifiers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub enum CallConv { + /// Best performance, not ABI-stable + Fast, + /// Smallest caller code size, not ABI-stable + Cold, + /// System V-style convention used on many platforms + SystemV, + /// Windows "fastcall" convention, also used for x64 and ARM + WindowsFastcall, + /// SpiderMonkey WebAssembly convention on systems using natively SystemV + BaldrdashSystemV, + /// SpiderMonkey WebAssembly convention on Windows + BaldrdashWindows, + /// SpiderMonkey WebAssembly convention for "ABI-2020", with extra TLS + /// register slots in the frame. + Baldrdash2020, + /// Specialized convention for the probestack function + Probestack, +} + +impl CallConv { + /// Return the default calling convention for the given target triple. + pub fn triple_default(triple: &Triple) -> Self { + match triple.default_calling_convention() { + // Default to System V for unknown targets because most everything + // uses System V. + Ok(CallingConvention::SystemV) | Err(()) => Self::SystemV, + Ok(CallingConvention::WindowsFastcall) => Self::WindowsFastcall, + Ok(unimp) => unimplemented!("calling convention: {:?}", unimp), + } + } + + /// Returns the calling convention used for libcalls according to the current flags. 
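+ ///
+ /// Sketch (assuming the default `libcall_call_conv` setting, which defers to
+ /// the ISA): the provided `default_call_conv` is returned unchanged:
+ /// ```ignore
+ /// let flags = settings::Flags::new(settings::builder());
+ /// assert_eq!(CallConv::for_libcall(&flags, CallConv::SystemV), CallConv::SystemV);
+ /// ```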
+ pub fn for_libcall(flags: &settings::Flags, default_call_conv: CallConv) -> Self { + match flags.libcall_call_conv() { + LibcallCallConv::IsaDefault => default_call_conv, + LibcallCallConv::Fast => Self::Fast, + LibcallCallConv::Cold => Self::Cold, + LibcallCallConv::SystemV => Self::SystemV, + LibcallCallConv::WindowsFastcall => Self::WindowsFastcall, + LibcallCallConv::BaldrdashSystemV => Self::BaldrdashSystemV, + LibcallCallConv::BaldrdashWindows => Self::BaldrdashWindows, + LibcallCallConv::Baldrdash2020 => Self::Baldrdash2020, + LibcallCallConv::Probestack => Self::Probestack, + } + } + + /// Is the calling convention extending the Windows Fastcall ABI? + pub fn extends_windows_fastcall(self) -> bool { + match self { + Self::WindowsFastcall | Self::BaldrdashWindows => true, + _ => false, + } + } + + /// Is the calling convention extending the Baldrdash ABI? + pub fn extends_baldrdash(self) -> bool { + match self { + Self::BaldrdashSystemV | Self::BaldrdashWindows | Self::Baldrdash2020 => true, + _ => false, + } + } +} + +impl fmt::Display for CallConv { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match *self { + Self::Fast => "fast", + Self::Cold => "cold", + Self::SystemV => "system_v", + Self::WindowsFastcall => "windows_fastcall", + Self::BaldrdashSystemV => "baldrdash_system_v", + Self::BaldrdashWindows => "baldrdash_windows", + Self::Baldrdash2020 => "baldrdash_2020", + Self::Probestack => "probestack", + }) + } +} + +impl str::FromStr for CallConv { + type Err = (); + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "fast" => Ok(Self::Fast), + "cold" => Ok(Self::Cold), + "system_v" => Ok(Self::SystemV), + "windows_fastcall" => Ok(Self::WindowsFastcall), + "baldrdash_system_v" => Ok(Self::BaldrdashSystemV), + "baldrdash_windows" => Ok(Self::BaldrdashWindows), + "baldrdash_2020" => Ok(Self::Baldrdash2020), + "probestack" => Ok(Self::Probestack), + _ => Err(()), + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/constraints.rs b/third_party/rust/cranelift-codegen/src/isa/constraints.rs new file mode 100644 index 0000000000..c87c3bd9d4 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/constraints.rs @@ -0,0 +1,207 @@ +//! Register constraints for instruction operands. +//! +//! An encoding recipe specifies how an instruction is encoded as binary machine code, but it only +//! works if the operands and results satisfy certain constraints. Constraints on immediate +//! operands are checked by instruction predicates when the recipe is chosen. +//! +//! It is the register allocator's job to make sure that the register constraints on value operands +//! are satisfied. + +use crate::binemit::CodeOffset; +use crate::ir::{Function, Inst, ValueLoc}; +use crate::isa::{RegClass, RegUnit}; +use crate::regalloc::RegDiversions; + +/// Register constraint for a single value operand or instruction result. +#[derive(PartialEq, Debug)] +pub struct OperandConstraint { + /// The kind of constraint. + pub kind: ConstraintKind, + + /// The register class of the operand. + /// + /// This applies to all kinds of constraints, but with slightly different meaning. + pub regclass: RegClass, +} + +impl OperandConstraint { + /// Check if this operand constraint is satisfied by the given value location. + /// For tied constraints, this only checks the register class, not that the + /// counterpart operand has the same value location. 
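+ ///
+ /// For example (sketch; `GPR`, `reg_in_gpr` and `slot` are hypothetical
+ /// stand-ins for a real register class, register unit and stack slot):
+ /// ```ignore
+ /// let c = OperandConstraint { kind: ConstraintKind::Reg, regclass: GPR };
+ /// assert!(c.satisfied(ValueLoc::Reg(reg_in_gpr)));
+ /// assert!(!c.satisfied(ValueLoc::Stack(slot)));
+ /// ```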
+ pub fn satisfied(&self, loc: ValueLoc) -> bool { + match self.kind { + ConstraintKind::Reg | ConstraintKind::Tied(_) => { + if let ValueLoc::Reg(reg) = loc { + self.regclass.contains(reg) + } else { + false + } + } + ConstraintKind::FixedReg(reg) | ConstraintKind::FixedTied(reg) => { + loc == ValueLoc::Reg(reg) && self.regclass.contains(reg) + } + ConstraintKind::Stack => { + if let ValueLoc::Stack(_) = loc { + true + } else { + false + } + } + } + } +} + +/// The different kinds of operand constraints. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum ConstraintKind { + /// This operand or result must be a register from the given register class. + Reg, + + /// This operand or result must be a fixed register. + /// + /// The constraint's `regclass` field is the top-level register class containing the fixed + /// register. + FixedReg(RegUnit), + + /// This result value must use the same register as an input value operand. + /// + /// The associated number is the index of the input value operand this result is tied to. The + /// constraint's `regclass` field is the same as the tied operand's register class. + /// + /// When an (in, out) operand pair is tied, this constraint kind appears in both the `ins` and + /// the `outs` arrays. The constraint for the in operand is `Tied(out)`, and the constraint for + /// the out operand is `Tied(in)`. + Tied(u8), + + /// This operand must be a fixed register, and it has a tied counterpart. + /// + /// This works just like `FixedReg`, but additionally indicates that there are identical + /// input/output operands for this fixed register. For an input operand, this means that the + /// value will be clobbered by the instruction + FixedTied(RegUnit), + + /// This operand must be a value in a stack slot. + /// + /// The constraint's `regclass` field is the register class that would normally be used to load + /// and store values of this type. + Stack, +} + +/// Value operand constraints for an encoding recipe. +#[derive(PartialEq, Clone)] +pub struct RecipeConstraints { + /// Constraints for the instruction's fixed value operands. + /// + /// If the instruction takes a variable number of operands, the register constraints for those + /// operands must be computed dynamically. + /// + /// - For branches and jumps, block arguments must match the expectations of the destination block. + /// - For calls and returns, the calling convention ABI specifies constraints. + pub ins: &'static [OperandConstraint], + + /// Constraints for the instruction's fixed results. + /// + /// If the instruction produces a variable number of results, it's probably a call and the + /// constraints must be derived from the calling convention ABI. + pub outs: &'static [OperandConstraint], + + /// Are any of the input constraints `FixedReg` or `FixedTied`? + pub fixed_ins: bool, + + /// Are any of the output constraints `FixedReg` or `FixedTied`? + pub fixed_outs: bool, + + /// Are any of the input/output constraints `Tied` (but not `FixedTied`)? + pub tied_ops: bool, + + /// Does this instruction clobber the CPU flags? + /// + /// When true, SSA values of type `iflags` or `fflags` can not be live across the instruction. + pub clobbers_flags: bool, +} + +impl RecipeConstraints { + /// Check that these constraints are satisfied by the operands on `inst`. 
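A hypothetical caller-side check built on the method below; the `isa`, `enc`, `divert`, and `func` values are assumed to come from an encoding pass:

```rust
use cranelift_codegen::ir::{Function, Inst};
use cranelift_codegen::isa::{Encoding, TargetIsa};
use cranelift_codegen::regalloc::RegDiversions;

/// Hypothetical helper: do the operands of `inst` sit where encoding `enc` wants them?
fn operands_ok(
    isa: &dyn TargetIsa,
    func: &Function,
    inst: Inst,
    enc: Encoding,
    divert: &RegDiversions,
) -> bool {
    isa.encoding_info()
        .operand_constraints(enc)
        .map_or(true, |constraints| constraints.satisfied(inst, divert, func))
}
```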
+ pub fn satisfied(&self, inst: Inst, divert: &RegDiversions, func: &Function) -> bool { + for (&arg, constraint) in func.dfg.inst_args(inst).iter().zip(self.ins) { + let loc = divert.get(arg, &func.locations); + + if let ConstraintKind::Tied(out_index) = constraint.kind { + let out_val = func.dfg.inst_results(inst)[out_index as usize]; + let out_loc = func.locations[out_val]; + if loc != out_loc { + return false; + } + } + + if !constraint.satisfied(loc) { + return false; + } + } + + for (&arg, constraint) in func.dfg.inst_results(inst).iter().zip(self.outs) { + let loc = divert.get(arg, &func.locations); + if !constraint.satisfied(loc) { + return false; + } + } + + true + } +} + +/// Constraints on the range of a branch instruction. +/// +/// A branch instruction usually encodes its destination as a signed n-bit offset from an origin. +/// The origin depends on the ISA and the specific instruction: +/// +/// - RISC-V and ARM Aarch64 use the address of the branch instruction, `origin = 0`. +/// - x86 uses the address of the instruction following the branch, `origin = 2` for a 2-byte +/// branch instruction. +/// - ARM's A32 encoding uses the address of the branch instruction + 8 bytes, `origin = 8`. +#[derive(Clone, Copy, Debug)] +pub struct BranchRange { + /// Offset in bytes from the address of the branch instruction to the origin used for computing + /// the branch displacement. This is the destination of a branch that encodes a 0 displacement. + pub origin: u8, + + /// Number of bits in the signed byte displacement encoded in the instruction. This does not + /// account for branches that can only target aligned addresses. + pub bits: u8, +} + +impl BranchRange { + /// Determine if this branch range can represent the range from `branch` to `dest`, where + /// `branch` is the code offset of the branch instruction itself and `dest` is the code offset + /// of the destination block header. + /// + /// This method does not detect if the range is larger than 2 GB. + pub fn contains(self, branch: CodeOffset, dest: CodeOffset) -> bool { + let d = dest.wrapping_sub(branch + CodeOffset::from(self.origin)) as i32; + let s = 32 - self.bits; + d == d << s >> s + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn branch_range() { + // ARM T1 branch. + let t1 = BranchRange { origin: 4, bits: 9 }; + assert!(t1.contains(0, 0)); + assert!(t1.contains(0, 2)); + assert!(t1.contains(2, 0)); + assert!(t1.contains(1000, 1000)); + + // Forward limit. + assert!(t1.contains(1000, 1258)); + assert!(!t1.contains(1000, 1260)); + + // Backward limit + assert!(t1.contains(1000, 748)); + assert!(!t1.contains(1000, 746)); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs new file mode 100644 index 0000000000..e21557497e --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/enc_tables.rs @@ -0,0 +1,292 @@ +//! Support types for generated encoding tables. +//! +//! This module contains types and functions for working with the encoding tables generated by +//! `cranelift-codegen/meta/src/gen_encodings.rs`. + +use crate::constant_hash::{probe, Table}; +use crate::ir::{Function, InstructionData, Opcode, Type}; +use crate::isa::{Encoding, Legalize}; +use crate::settings::PredicateView; +use core::ops::Range; + +/// A recipe predicate. +/// +/// This is a predicate function capable of testing ISA and instruction predicates simultaneously. +/// +/// A None predicate is always satisfied. 
+pub type RecipePredicate = Option<fn(PredicateView, &InstructionData) -> bool>; + +/// An instruction predicate. +/// +/// This is a predicate function that needs to be tested in addition to the recipe predicate. It +/// can't depend on ISA settings. +pub type InstPredicate = fn(&Function, &InstructionData) -> bool; + +/// Legalization action to perform when no encoding can be found for an instruction. +/// +/// This is an index into an ISA-specific table of legalization actions. +pub type LegalizeCode = u8; + +/// Level 1 hash table entry. +/// +/// One level 1 hash table is generated per CPU mode. This table is keyed by the controlling type +/// variable, using `INVALID` for non-polymorphic instructions. +/// +/// The hash table values are references to level 2 hash tables, encoded as an offset in `LEVEL2` +/// where the table begins, and the binary logarithm of its length. All the level 2 hash tables +/// have a power-of-two size. +/// +/// Entries are generic over the offset type. It will typically be `u32` or `u16`, depending on the +/// size of the `LEVEL2` table. +/// +/// Empty entries are encoded with a `!0` value for `log2len` which will always be out of range. +/// Entries that have a `legalize` value but no level 2 table have an `offset` field that is out of +/// bounds. +pub struct Level1Entry<OffT: Into<u32> + Copy> { + pub ty: Type, + pub log2len: u8, + pub legalize: LegalizeCode, + pub offset: OffT, +} + +impl<OffT: Into<u32> + Copy> Level1Entry<OffT> { + /// Get the level 2 table range indicated by this entry. + fn range(&self) -> Range<usize> { + let b = self.offset.into() as usize; + b..b + (1 << self.log2len) + } +} + +impl<OffT: Into<u32> + Copy> Table<Type> for [Level1Entry<OffT>] { + fn len(&self) -> usize { + self.len() + } + + fn key(&self, idx: usize) -> Option<Type> { + if self[idx].log2len != !0 { + Some(self[idx].ty) + } else { + None + } + } +} + +/// Level 2 hash table entry. +/// +/// The second level hash tables are keyed by `Opcode`, and contain an offset into the `ENCLISTS` +/// table where the encoding recipes for the instruction are stored. +/// +/// Entries are generic over the offset type which depends on the size of `ENCLISTS`. A `u16` +/// offset allows the entries to be only 32 bits each. There is no benefit to dropping down to `u8` +/// for tiny ISAs. The entries won't shrink below 32 bits since the opcode is expected to be 16 +/// bits. +/// +/// Empty entries are encoded with a `NotAnOpcode` `opcode` field. +pub struct Level2Entry<OffT: Into<u32> + Copy> { + pub opcode: Option<Opcode>, + pub offset: OffT, +} + +impl<OffT: Into<u32> + Copy> Table<Opcode> for [Level2Entry<OffT>] { + fn len(&self) -> usize { + self.len() + } + + fn key(&self, idx: usize) -> Option<Opcode> { + self[idx].opcode + } +} + +/// Two-level hash table lookup and iterator construction. +/// +/// Given the controlling type variable and instruction opcode, find the corresponding encoding +/// list. +/// +/// Returns an iterator that produces legal encodings for `inst`. 
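Outside this module the same search is reached through `TargetIsa::legal_encodings` and `TargetIsa::encode`; a rough sketch of the intended call pattern (error handling elided, the arguments assumed to exist):

```rust
use cranelift_codegen::ir::{Function, InstructionData, Type};
use cranelift_codegen::isa::TargetIsa;

/// Hypothetical helper: report the first legal encoding for `inst`, if any.
fn report_encoding(isa: &dyn TargetIsa, func: &Function, inst: &InstructionData, ty: Type) {
    let mut encodings = isa.legal_encodings(func, inst, ty);
    match encodings.next() {
        Some(enc) => println!("encoded as {}", isa.encoding_info().display(enc)),
        None => {
            // `legalize()` may only be consulted once the iterator has returned `None`.
            let _action = encodings.legalize();
            println!("no encoding; legalization required");
        }
    }
}
```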
+pub fn lookup_enclist<'a, OffT1, OffT2>( + ctrl_typevar: Type, + inst: &'a InstructionData, + func: &'a Function, + level1_table: &'static [Level1Entry<OffT1>], + level2_table: &'static [Level2Entry<OffT2>], + enclist: &'static [EncListEntry], + legalize_actions: &'static [Legalize], + recipe_preds: &'static [RecipePredicate], + inst_preds: &'static [InstPredicate], + isa_preds: PredicateView<'a>, +) -> Encodings<'a> +where + OffT1: Into<u32> + Copy, + OffT2: Into<u32> + Copy, +{ + let (offset, legalize) = match probe(level1_table, ctrl_typevar, ctrl_typevar.index()) { + Err(l1idx) => { + // No level 1 entry found for the type. + // We have a sentinel entry with the default legalization code. + (!0, level1_table[l1idx].legalize) + } + Ok(l1idx) => { + // We have a valid level 1 entry for this type. + let l1ent = &level1_table[l1idx]; + let offset = match level2_table.get(l1ent.range()) { + Some(l2tab) => { + let opcode = inst.opcode(); + match probe(l2tab, opcode, opcode as usize) { + Ok(l2idx) => l2tab[l2idx].offset.into() as usize, + Err(_) => !0, + } + } + // The l1ent range is invalid. This means that we just have a customized + // legalization code for this type. The level 2 table is empty. + None => !0, + }; + (offset, l1ent.legalize) + } + }; + + // Now we have an offset into `enclist` that is `!0` when no encoding list could be found. + // The default legalization code is always valid. + Encodings::new( + offset, + legalize, + inst, + func, + enclist, + legalize_actions, + recipe_preds, + inst_preds, + isa_preds, + ) +} + +/// Encoding list entry. +/// +/// Encoding lists are represented as sequences of u16 words. +pub type EncListEntry = u16; + +/// Number of bits used to represent a predicate. c.f. `meta/src/gen_encodings.rs`. +const PRED_BITS: u8 = 12; +const PRED_MASK: usize = (1 << PRED_BITS) - 1; +/// First code word representing a predicate check. c.f. `meta/src/gen_encodings.rs`. +const PRED_START: usize = 0x1000; + +/// An iterator over legal encodings for the instruction. +pub struct Encodings<'a> { + // Current offset into `enclist`, or out of bounds after we've reached the end. + offset: usize, + // Legalization code to use of no encoding is found. + legalize: LegalizeCode, + inst: &'a InstructionData, + func: &'a Function, + enclist: &'static [EncListEntry], + legalize_actions: &'static [Legalize], + recipe_preds: &'static [RecipePredicate], + inst_preds: &'static [InstPredicate], + isa_preds: PredicateView<'a>, +} + +impl<'a> Encodings<'a> { + /// Creates a new instance of `Encodings`. + /// + /// This iterator provides search for encodings that applies to the given instruction. The + /// encoding lists are laid out such that first call to `next` returns valid entry in the list + /// or `None`. + pub fn new( + offset: usize, + legalize: LegalizeCode, + inst: &'a InstructionData, + func: &'a Function, + enclist: &'static [EncListEntry], + legalize_actions: &'static [Legalize], + recipe_preds: &'static [RecipePredicate], + inst_preds: &'static [InstPredicate], + isa_preds: PredicateView<'a>, + ) -> Self { + Encodings { + offset, + inst, + func, + legalize, + isa_preds, + recipe_preds, + inst_preds, + enclist, + legalize_actions, + } + } + + /// Get the legalization action that caused the enumeration of encodings to stop. + /// This can be the default legalization action for the type or a custom code for the + /// instruction. + /// + /// This method must only be called after the iterator returns `None`. 
+ pub fn legalize(&self) -> Legalize { + debug_assert_eq!(self.offset, !0, "Premature Encodings::legalize()"); + self.legalize_actions[self.legalize as usize] + } + + /// Check if the `rpred` recipe predicate is satisfied. + fn check_recipe(&self, rpred: RecipePredicate) -> bool { + match rpred { + Some(p) => p(self.isa_preds, self.inst), + None => true, + } + } + + /// Check an instruction or isa predicate. + fn check_pred(&self, pred: usize) -> bool { + if let Some(&p) = self.inst_preds.get(pred) { + p(self.func, self.inst) + } else { + let pred = pred - self.inst_preds.len(); + self.isa_preds.test(pred) + } + } +} + +impl<'a> Iterator for Encodings<'a> { + type Item = Encoding; + + fn next(&mut self) -> Option<Encoding> { + while let Some(entryref) = self.enclist.get(self.offset) { + let entry = *entryref as usize; + + // Check for "recipe+bits". + let recipe = entry >> 1; + if let Some(&rpred) = self.recipe_preds.get(recipe) { + let bits = self.offset + 1; + if entry & 1 == 0 { + self.offset += 2; // Next entry. + } else { + self.offset = !0; // Stop. + } + if self.check_recipe(rpred) { + return Some(Encoding::new(recipe as u16, self.enclist[bits])); + } + continue; + } + + // Check for "stop with legalize". + if entry < PRED_START { + self.legalize = (entry - 2 * self.recipe_preds.len()) as LegalizeCode; + self.offset = !0; // Stop. + return None; + } + + // Finally, this must be a predicate entry. + let pred_entry = entry - PRED_START; + let skip = pred_entry >> PRED_BITS; + let pred = pred_entry & PRED_MASK; + + if self.check_pred(pred) { + self.offset += 1; + } else if skip == 0 { + self.offset = !0; // Stop. + return None; + } else { + self.offset += 1 + skip; + } + } + None + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/encoding.rs b/third_party/rust/cranelift-codegen/src/isa/encoding.rs new file mode 100644 index 0000000000..99894cab2c --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/encoding.rs @@ -0,0 +1,163 @@ +//! The `Encoding` struct. + +use crate::binemit::CodeOffset; +use crate::ir::{Function, Inst}; +use crate::isa::constraints::{BranchRange, RecipeConstraints}; +use crate::regalloc::RegDiversions; +use core::fmt; + +/// Bits needed to encode an instruction as binary machine code. +/// +/// The encoding consists of two parts, both specific to the target ISA: An encoding *recipe*, and +/// encoding *bits*. The recipe determines the native instruction format and the mapping of +/// operands to encoded bits. The encoding bits provide additional information to the recipe, +/// typically parts of the opcode. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Encoding { + recipe: u16, + bits: u16, +} + +impl Encoding { + /// Create a new `Encoding` containing `(recipe, bits)`. + pub fn new(recipe: u16, bits: u16) -> Self { + Self { recipe, bits } + } + + /// Get the recipe number in this encoding. + pub fn recipe(self) -> usize { + self.recipe as usize + } + + /// Get the recipe-specific encoding bits. + pub fn bits(self) -> u16 { + self.bits + } + + /// Is this a legal encoding, or the default placeholder? + pub fn is_legal(self) -> bool { + self != Self::default() + } +} + +/// The default encoding is the illegal one. +impl Default for Encoding { + fn default() -> Self { + Self::new(0xffff, 0xffff) + } +} + +/// ISA-independent display of an encoding. 
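As a quick illustration of the format implemented just below: a legal encoding prints as `recipe#bits` (bits in hex), and the default, illegal encoding prints as a dash.

```rust
use cranelift_codegen::isa::Encoding;

fn main() {
    assert_eq!(Encoding::new(3, 0x04).to_string(), "3#04");
    assert_eq!(Encoding::default().to_string(), "-");
}
```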
+impl fmt::Display for Encoding { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_legal() { + write!(f, "{}#{:02x}", self.recipe, self.bits) + } else { + write!(f, "-") + } + } +} + +/// Temporary object that holds enough context to properly display an encoding. +/// This is meant to be created by `EncInfo::display()`. +pub struct DisplayEncoding { + pub encoding: Encoding, + pub recipe_names: &'static [&'static str], +} + +impl fmt::Display for DisplayEncoding { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.encoding.is_legal() { + write!( + f, + "{}#{:02x}", + self.recipe_names[self.encoding.recipe()], + self.encoding.bits + ) + } else { + write!(f, "-") + } + } +} + +type SizeCalculatorFn = fn(&RecipeSizing, Encoding, Inst, &RegDiversions, &Function) -> u8; + +/// Returns the base size of the Recipe, assuming it's fixed. This is the default for most +/// encodings; others can be variable and longer than this base size, depending on the registers +/// they're using and use a different function, specific per platform. +pub fn base_size( + sizing: &RecipeSizing, + _: Encoding, + _: Inst, + _: &RegDiversions, + _: &Function, +) -> u8 { + sizing.base_size +} + +/// Code size information for an encoding recipe. +/// +/// Encoding recipes may have runtime-determined instruction size. +pub struct RecipeSizing { + /// Minimum size in bytes of instructions encoded with this recipe. + pub base_size: u8, + + /// Method computing the instruction's real size, given inputs and outputs. + pub compute_size: SizeCalculatorFn, + + /// Allowed branch range in this recipe, if any. + /// + /// All encoding recipes for branches have exact branch range information. + pub branch_range: Option<BranchRange>, +} + +/// Information about all the encodings in this ISA. +#[derive(Clone)] +pub struct EncInfo { + /// Constraints on value operands per recipe. + pub constraints: &'static [RecipeConstraints], + + /// Code size information per recipe. + pub sizing: &'static [RecipeSizing], + + /// Names of encoding recipes. + pub names: &'static [&'static str], +} + +impl EncInfo { + /// Get the value operand constraints for `enc` if it is a legal encoding. + pub fn operand_constraints(&self, enc: Encoding) -> Option<&'static RecipeConstraints> { + self.constraints.get(enc.recipe()) + } + + /// Create an object that can display an ISA-dependent encoding properly. + pub fn display(&self, enc: Encoding) -> DisplayEncoding { + DisplayEncoding { + encoding: enc, + recipe_names: self.names, + } + } + + /// Get the size in bytes of `inst`, if it were encoded with `enc`. + /// + /// Returns 0 for illegal encodings. + pub fn byte_size( + &self, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, + ) -> CodeOffset { + self.sizing.get(enc.recipe()).map_or(0, |s| { + let compute_size = s.compute_size; + CodeOffset::from(compute_size(&s, enc, inst, divert, func)) + }) + } + + /// Get the branch range that is supported by `enc`, if any. + /// + /// This will never return `None` for a legal branch encoding. + pub fn branch_range(&self, enc: Encoding) -> Option<BranchRange> { + self.sizing.get(enc.recipe()).and_then(|s| s.branch_range) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/mod.rs b/third_party/rust/cranelift-codegen/src/isa/mod.rs new file mode 100644 index 0000000000..2e56c025d0 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/mod.rs @@ -0,0 +1,447 @@ +//! Instruction Set Architectures. +//! +//! 
The `isa` module provides a `TargetIsa` trait which provides the behavior specialization needed +//! by the ISA-independent code generator. The sub-modules of this module provide definitions for +//! the instruction sets that Cranelift can target. Each sub-module has it's own implementation of +//! `TargetIsa`. +//! +//! # Constructing a `TargetIsa` instance +//! +//! The target ISA is built from the following information: +//! +//! - The name of the target ISA as a string. Cranelift is a cross-compiler, so the ISA to target +//! can be selected dynamically. Individual ISAs can be left out when Cranelift is compiled, so a +//! string is used to identify the proper sub-module. +//! - Values for settings that apply to all ISAs. This is represented by a `settings::Flags` +//! instance. +//! - Values for ISA-specific settings. +//! +//! The `isa::lookup()` function is the main entry point which returns an `isa::Builder` +//! appropriate for the requested ISA: +//! +//! ``` +//! # extern crate cranelift_codegen; +//! # #[macro_use] extern crate target_lexicon; +//! use cranelift_codegen::isa; +//! use cranelift_codegen::settings::{self, Configurable}; +//! use std::str::FromStr; +//! use target_lexicon::Triple; +//! +//! let shared_builder = settings::builder(); +//! let shared_flags = settings::Flags::new(shared_builder); +//! +//! match isa::lookup(triple!("riscv32")) { +//! Err(_) => { +//! // The RISC-V target ISA is not available. +//! } +//! Ok(mut isa_builder) => { +//! isa_builder.set("supports_m", "on"); +//! let isa = isa_builder.finish(shared_flags); +//! } +//! } +//! ``` +//! +//! The configured target ISA trait object is a `Box<TargetIsa>` which can be used for multiple +//! concurrent function compilations. + +pub use crate::isa::call_conv::CallConv; +pub use crate::isa::constraints::{ + BranchRange, ConstraintKind, OperandConstraint, RecipeConstraints, +}; +pub use crate::isa::enc_tables::Encodings; +pub use crate::isa::encoding::{base_size, EncInfo, Encoding}; +pub use crate::isa::registers::{regs_overlap, RegClass, RegClassIndex, RegInfo, RegUnit}; +pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; + +use crate::binemit; +use crate::flowgraph; +use crate::ir; +#[cfg(feature = "unwind")] +use crate::isa::unwind::systemv::RegisterMappingError; +use crate::machinst::MachBackend; +use crate::regalloc; +use crate::result::CodegenResult; +use crate::settings; +use crate::settings::SetResult; +use crate::timing; +use alloc::borrow::Cow; +use alloc::boxed::Box; +use core::any::Any; +use core::fmt; +use core::fmt::{Debug, Formatter}; +use target_lexicon::{triple, Architecture, PointerWidth, Triple}; +use thiserror::Error; + +#[cfg(feature = "riscv")] +mod riscv; + +#[cfg(feature = "x86")] +mod x86; + +#[cfg(feature = "x64")] +mod x64; + +#[cfg(feature = "arm32")] +mod arm32; + +#[cfg(feature = "arm64")] +pub(crate) mod aarch64; + +pub mod unwind; + +mod call_conv; +mod constraints; +mod enc_tables; +mod encoding; +pub mod registers; +mod stack; + +#[cfg(test)] +mod test_utils; + +/// Returns a builder that can create a corresponding `TargetIsa` +/// or `Err(LookupError::SupportDisabled)` if not enabled. +macro_rules! isa_builder { + ($name: ident, $feature: tt, $triple: ident) => {{ + #[cfg(feature = $feature)] + { + Ok($name::isa_builder($triple)) + } + #[cfg(not(feature = $feature))] + { + Err(LookupError::SupportDisabled) + } + }}; +} + +/// Look for an ISA for the given `triple`. +/// Return a builder that can create a corresponding `TargetIsa`. 
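A small sketch of the two failure modes a caller of `lookup` should handle; the `riscv64` triple is only an example:

```rust
use cranelift_codegen::isa::{self, LookupError};
use std::str::FromStr;
use target_lexicon::Triple;

fn main() {
    match isa::lookup(Triple::from_str("riscv64").unwrap()) {
        Ok(_builder) => println!("riscv backend available"),
        // The architecture is known but its backend was compiled out.
        Err(LookupError::SupportDisabled) => println!("riscv support disabled in this build"),
        // The architecture is not supported at all.
        Err(LookupError::Unsupported) => println!("unsupported architecture"),
    }
}
```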
+pub fn lookup(triple: Triple) -> Result<Builder, LookupError> { + match triple.architecture { + Architecture::Riscv32 { .. } | Architecture::Riscv64 { .. } => { + isa_builder!(riscv, "riscv", triple) + } + Architecture::X86_32 { .. } | Architecture::X86_64 => { + if cfg!(feature = "x64") { + isa_builder!(x64, "x64", triple) + } else { + isa_builder!(x86, "x86", triple) + } + } + Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple), + Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple), + _ => Err(LookupError::Unsupported), + } +} + +/// Look for a supported ISA with the given `name`. +/// Return a builder that can create a corresponding `TargetIsa`. +pub fn lookup_by_name(name: &str) -> Result<Builder, LookupError> { + use alloc::str::FromStr; + lookup(triple!(name)) +} + +/// Describes reason for target lookup failure +#[derive(Error, PartialEq, Eq, Copy, Clone, Debug)] +pub enum LookupError { + /// Support for this target was disabled in the current build. + #[error("Support for this target is disabled")] + SupportDisabled, + + /// Support for this target has not yet been implemented. + #[error("Support for this target has not been implemented yet")] + Unsupported, +} + +/// Builder for a `TargetIsa`. +/// Modify the ISA-specific settings before creating the `TargetIsa` trait object with `finish`. +#[derive(Clone)] +pub struct Builder { + triple: Triple, + setup: settings::Builder, + constructor: fn(Triple, settings::Flags, settings::Builder) -> Box<dyn TargetIsa>, +} + +impl Builder { + /// Combine the ISA-specific settings with the provided ISA-independent settings and allocate a + /// fully configured `TargetIsa` trait object. + pub fn finish(self, shared_flags: settings::Flags) -> Box<dyn TargetIsa> { + (self.constructor)(self.triple, shared_flags, self.setup) + } +} + +impl settings::Configurable for Builder { + fn set(&mut self, name: &str, value: &str) -> SetResult<()> { + self.setup.set(name, value) + } + + fn enable(&mut self, name: &str) -> SetResult<()> { + self.setup.enable(name) + } +} + +/// After determining that an instruction doesn't have an encoding, how should we proceed to +/// legalize it? +/// +/// The `Encodings` iterator returns a legalization function to call. +pub type Legalize = + fn(ir::Inst, &mut ir::Function, &mut flowgraph::ControlFlowGraph, &dyn TargetIsa) -> bool; + +/// This struct provides information that a frontend may need to know about a target to +/// produce Cranelift IR for the target. +#[derive(Clone, Copy, Hash)] +pub struct TargetFrontendConfig { + /// The default calling convention of the target. + pub default_call_conv: CallConv, + + /// The pointer width of the target. + pub pointer_width: PointerWidth, +} + +impl TargetFrontendConfig { + /// Get the pointer type of this target. + pub fn pointer_type(self) -> ir::Type { + ir::Type::int(u16::from(self.pointer_bits())).unwrap() + } + + /// Get the width of pointers on this target, in units of bits. + pub fn pointer_bits(self) -> u8 { + self.pointer_width.bits() + } + + /// Get the width of pointers on this target, in units of bytes. + pub fn pointer_bytes(self) -> u8 { + self.pointer_width.bytes() + } +} + +/// Methods that are specialized to a target ISA. Implies a Display trait that shows the +/// shared flags, as well as any isa-specific flags. +pub trait TargetIsa: fmt::Display + Send + Sync { + /// Get the name of this ISA. + fn name(&self) -> &'static str; + + /// Get the target triple that was used to make this trait object. 
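The frontend-facing helpers above can be exercised without building a full ISA; a short sketch (the 64-bit pointer width and System V default are illustrative):

```rust
use cranelift_codegen::ir::types;
use cranelift_codegen::isa::{CallConv, TargetFrontendConfig};
use target_lexicon::PointerWidth;

fn main() {
    let config = TargetFrontendConfig {
        default_call_conv: CallConv::SystemV,
        pointer_width: PointerWidth::U64,
    };
    assert_eq!(config.pointer_type(), types::I64);
    assert_eq!(config.pointer_bytes(), 8);
}
```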
+ fn triple(&self) -> &Triple; + + /// Get the ISA-independent flags that were used to make this trait object. + fn flags(&self) -> &settings::Flags; + + /// Get the default calling convention of this target. + fn default_call_conv(&self) -> CallConv { + CallConv::triple_default(self.triple()) + } + + /// Get the pointer type of this ISA. + fn pointer_type(&self) -> ir::Type { + ir::Type::int(u16::from(self.pointer_bits())).unwrap() + } + + /// Get the width of pointers on this ISA. + fn pointer_width(&self) -> PointerWidth { + self.triple().pointer_width().unwrap() + } + + /// Get the width of pointers on this ISA, in units of bits. + fn pointer_bits(&self) -> u8 { + self.pointer_width().bits() + } + + /// Get the width of pointers on this ISA, in units of bytes. + fn pointer_bytes(&self) -> u8 { + self.pointer_width().bytes() + } + + /// Get the information needed by frontends producing Cranelift IR. + fn frontend_config(&self) -> TargetFrontendConfig { + TargetFrontendConfig { + default_call_conv: self.default_call_conv(), + pointer_width: self.pointer_width(), + } + } + + /// Does the CPU implement scalar comparisons using a CPU flags register? + fn uses_cpu_flags(&self) -> bool { + false + } + + /// Does the CPU implement multi-register addressing? + fn uses_complex_addresses(&self) -> bool { + false + } + + /// Get a data structure describing the registers in this ISA. + fn register_info(&self) -> RegInfo; + + #[cfg(feature = "unwind")] + /// Map a Cranelift register to its corresponding DWARF register. + fn map_dwarf_register(&self, _: RegUnit) -> Result<u16, RegisterMappingError> { + Err(RegisterMappingError::UnsupportedArchitecture) + } + + /// Returns an iterator over legal encodings for the instruction. + fn legal_encodings<'a>( + &'a self, + func: &'a ir::Function, + inst: &'a ir::InstructionData, + ctrl_typevar: ir::Type, + ) -> Encodings<'a>; + + /// Encode an instruction after determining it is legal. + /// + /// If `inst` can legally be encoded in this ISA, produce the corresponding `Encoding` object. + /// Otherwise, return `Legalize` action. + /// + /// This is also the main entry point for determining if an instruction is legal. + fn encode( + &self, + func: &ir::Function, + inst: &ir::InstructionData, + ctrl_typevar: ir::Type, + ) -> Result<Encoding, Legalize> { + let mut iter = self.legal_encodings(func, inst, ctrl_typevar); + iter.next().ok_or_else(|| iter.legalize()) + } + + /// Get a data structure describing the instruction encodings in this ISA. + fn encoding_info(&self) -> EncInfo; + + /// Legalize a function signature. + /// + /// This is used to legalize both the signature of the function being compiled and any called + /// functions. The signature should be modified by adding `ArgumentLoc` annotations to all + /// arguments and return values. + /// + /// Arguments with types that are not supported by the ABI can be expanded into multiple + /// arguments: + /// + /// - Integer types that are too large to fit in a register can be broken into multiple + /// arguments of a smaller integer type. + /// - Floating point types can be bit-cast to an integer type of the same size, and possible + /// broken into smaller integer types. + /// - Vector types can be bit-cast and broken down into smaller vectors or scalars. + /// + /// The legalizer will adapt argument and return values as necessary at all ABI boundaries. + /// + /// When this function is called to legalize the signature of the function currently being + /// compiled, `current` is true. 
The legalized signature can then also contain special purpose + /// arguments and return values such as: + /// + /// - A `link` argument representing the link registers on RISC architectures that don't push + /// the return address on the stack. + /// - A `link` return value which will receive the value that was passed to the `link` + /// argument. + /// - An `sret` argument can be added if one wasn't present already. This is necessary if the + /// signature returns more values than registers are available for returning values. + /// - An `sret` return value can be added if the ABI requires a function to return its `sret` + /// argument in a register. + /// + /// Arguments and return values for the caller's frame pointer and other callee-saved registers + /// should not be added by this function. These arguments are not added until after register + /// allocation. + fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool); + + /// Get the register class that should be used to represent an ABI argument or return value of + /// type `ty`. This should be the top-level register class that contains the argument + /// registers. + /// + /// This function can assume that it will only be asked to provide register classes for types + /// that `legalize_signature()` produces in `ArgumentLoc::Reg` entries. + fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass; + + /// Get the set of allocatable registers that can be used when compiling `func`. + /// + /// This set excludes reserved registers like the stack pointer and other special-purpose + /// registers. + fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet; + + /// Compute the stack layout and insert prologue and epilogue code into `func`. + /// + /// Return an error if the stack frame is too large. + fn prologue_epilogue(&self, func: &mut ir::Function) -> CodegenResult<()> { + let _tt = timing::prologue_epilogue(); + // This default implementation is unlikely to be good enough. + use crate::ir::stackslot::{StackOffset, StackSize}; + use crate::stack_layout::layout_stack; + + let word_size = StackSize::from(self.pointer_bytes()); + + // Account for the SpiderMonkey standard prologue pushes. + if func.signature.call_conv.extends_baldrdash() { + let bytes = StackSize::from(self.flags().baldrdash_prologue_words()) * word_size; + let mut ss = ir::StackSlotData::new(ir::StackSlotKind::IncomingArg, bytes); + ss.offset = Some(-(bytes as StackOffset)); + func.stack_slots.push(ss); + } + + let is_leaf = func.is_leaf(); + layout_stack(&mut func.stack_slots, is_leaf, word_size)?; + Ok(()) + } + + /// Emit binary machine code for a single instruction into the `sink` trait object. + /// + /// Note that this will call `put*` methods on the `sink` trait object via its vtable which + /// is not the fastest way of emitting code. + /// + /// This function is under the "testing_hooks" feature, and is only suitable for use by + /// test harnesses. It increases code size, and is inefficient. + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + func: &ir::Function, + inst: ir::Inst, + divert: &mut regalloc::RegDiversions, + sink: &mut dyn binemit::CodeSink, + ); + + /// Emit a whole function into memory. + fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut binemit::MemoryCodeSink); + + /// IntCC condition for Unsigned Addition Overflow (Carry). + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC; + + /// IntCC condition for Unsigned Subtraction Overflow (Borrow/Carry). 
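For reference, a backend maps these to concrete flag conditions; the sketch below mirrors the choices made by the arm32 backend earlier in this patch (carry-flag semantics):

```rust
use cranelift_codegen::ir::condcodes::IntCC;

// Unsigned add overflow sets the carry flag; unsigned subtract overflow (borrow) clears it.
fn unsigned_add_overflow_condition() -> IntCC {
    IntCC::UnsignedGreaterThanOrEqual // carry set
}

fn unsigned_sub_overflow_condition() -> IntCC {
    IntCC::UnsignedLessThan // carry clear
}

fn main() {
    assert_ne!(
        unsigned_add_overflow_condition(),
        unsigned_sub_overflow_condition()
    );
}
```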
+ fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC; + + /// Creates unwind information for the function. + /// + /// Returns `None` if there is no unwind information for the function. + #[cfg(feature = "unwind")] + fn create_unwind_info( + &self, + _func: &ir::Function, + ) -> CodegenResult<Option<unwind::UnwindInfo>> { + // By default, an ISA has no unwind information + Ok(None) + } + + /// Creates a new System V Common Information Entry for the ISA. + /// + /// Returns `None` if the ISA does not support System V unwind information. + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + // By default, an ISA cannot create a System V CIE + None + } + + /// Get the new-style MachBackend, if this is an adapter around one. + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + None + } + + /// Return an [Any] reference for downcasting to the ISA-specific implementation of this trait + /// with `isa.as_any().downcast_ref::<isa::foo::Isa>()`. + fn as_any(&self) -> &dyn Any; +} + +impl Debug for &dyn TargetIsa { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "TargetIsa {{ triple: {:?}, pointer_width: {:?}}}", + self.triple(), + self.pointer_width() + ) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/registers.rs b/third_party/rust/cranelift-codegen/src/isa/registers.rs new file mode 100644 index 0000000000..e67ae13453 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/registers.rs @@ -0,0 +1,360 @@ +//! Data structures describing the registers in an ISA. + +use crate::entity::EntityRef; +use core::fmt; + +/// Register units are the smallest units of register allocation. +/// +/// Normally there is a 1-1 correspondence between registers and register units, but when an ISA +/// has aliasing registers, the aliasing can be modeled with registers that cover multiple +/// register units. +/// +/// The register allocator will enforce that each register unit only gets used for one thing. +pub type RegUnit = u16; + +/// A bit mask indexed by register classes. +/// +/// The size of this type is determined by the ISA with the most register classes. +pub type RegClassMask = u32; + +/// A bit mask indexed by register units. +/// +/// The size of this type is determined by the target ISA that has the most register units defined. +/// Currently that is arm32 which has 64+16 units. +pub type RegUnitMask = [RegClassMask; 3]; + +/// The register units in a target ISA are divided into disjoint register banks. Each bank covers a +/// contiguous range of register units. +/// +/// The `RegBank` struct provides a static description of a register bank. +pub struct RegBank { + /// The name of this register bank as defined in the ISA's DSL definition. + pub name: &'static str, + + /// The first register unit in this bank. + pub first_unit: RegUnit, + + /// The total number of register units in this bank. + pub units: RegUnit, + + /// Array of specially named register units. This array can be shorter than the number of units + /// in the bank. + pub names: &'static [&'static str], + + /// Name prefix to use for those register units in the bank not covered by the `names` array. + /// The remaining register units will be named this prefix followed by their decimal offset in + /// the bank. So with a prefix `r`, registers will be named `r8`, `r9`, ... + pub prefix: &'static str, + + /// Index of the first top-level register class in this bank. 
+ pub first_toprc: usize, + + /// Number of top-level register classes in this bank. + /// + /// The top-level register classes in a bank are guaranteed to be numbered sequentially from + /// `first_toprc`, and all top-level register classes across banks come before any sub-classes. + pub num_toprcs: usize, + + /// Is register pressure tracking enabled for this bank? + pub pressure_tracking: bool, +} + +impl RegBank { + /// Does this bank contain `regunit`? + fn contains(&self, regunit: RegUnit) -> bool { + regunit >= self.first_unit && regunit - self.first_unit < self.units + } + + /// Try to parse a regunit name. The name is not expected to begin with `%`. + fn parse_regunit(&self, name: &str) -> Option<RegUnit> { + match self.names.iter().position(|&x| x == name) { + Some(offset) => { + // This is one of the special-cased names. + Some(offset as RegUnit) + } + None => { + // Try a regular prefixed name. + if name.starts_with(self.prefix) { + name[self.prefix.len()..].parse().ok() + } else { + None + } + } + } + .and_then(|offset| { + if offset < self.units { + Some(offset + self.first_unit) + } else { + None + } + }) + } + + /// Write `regunit` to `w`, assuming that it belongs to this bank. + /// All regunits are written with a `%` prefix. + fn write_regunit(&self, f: &mut fmt::Formatter, regunit: RegUnit) -> fmt::Result { + let offset = regunit - self.first_unit; + assert!(offset < self.units); + if (offset as usize) < self.names.len() { + write!(f, "%{}", self.names[offset as usize]) + } else { + write!(f, "%{}{}", self.prefix, offset) + } + } +} + +/// A register class reference. +/// +/// All register classes are statically defined in tables generated from the meta descriptions. +pub type RegClass = &'static RegClassData; + +/// Data about a register class. +/// +/// A register class represents a subset of the registers in a bank. It describes the set of +/// permitted registers for a register operand in a given encoding of an instruction. +/// +/// A register class can be a subset of another register class. The top-level register classes are +/// disjoint. +pub struct RegClassData { + /// The name of the register class. + pub name: &'static str, + + /// The index of this class in the ISA's RegInfo description. + pub index: u8, + + /// How many register units to allocate per register. + pub width: u8, + + /// Index of the register bank this class belongs to. + pub bank: u8, + + /// Index of the top-level register class contains this one. + pub toprc: u8, + + /// The first register unit in this class. + pub first: RegUnit, + + /// Bit-mask of sub-classes of this register class, including itself. + /// + /// Bits correspond to RC indexes. + pub subclasses: RegClassMask, + + /// Mask of register units in the class. If `width > 1`, the mask only has a bit set for the + /// first register unit in each allocatable register. + pub mask: RegUnitMask, + + /// The global `RegInfo` instance containing this register class. + pub info: &'static RegInfo, + + /// The "pinned" register of the associated register bank. + /// + /// This register must be non-volatile (callee-preserved) and must not be the fixed + /// output register of any instruction. + pub pinned_reg: Option<RegUnit>, +} + +impl RegClassData { + /// Get the register class index corresponding to the intersection of `self` and `other`. + /// + /// This register class is guaranteed to exist if the register classes overlap. If the register + /// classes don't overlap, returns `None`. 
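A hypothetical helper built on the intersection methods below; because register class indexes are topologically ordered, the result is always the largest common sub-class:

```rust
use cranelift_codegen::isa::RegClass;

/// Hypothetical helper: name of the largest common sub-class of two register classes, if any.
fn common_subclass_name(a: RegClass, b: RegClass) -> Option<&'static str> {
    // `intersect` resolves the index through the shared `RegInfo`, so the result is a real class.
    a.intersect(b).map(|rc| rc.name)
}
```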
+ pub fn intersect_index(&self, other: RegClass) -> Option<RegClassIndex> { + // Compute the set of common subclasses. + let mask = self.subclasses & other.subclasses; + + if mask == 0 { + // No overlap. + None + } else { + // Register class indexes are topologically ordered, so the largest common subclass has + // the smallest index. + Some(RegClassIndex(mask.trailing_zeros() as u8)) + } + } + + /// Get the intersection of `self` and `other`. + pub fn intersect(&self, other: RegClass) -> Option<RegClass> { + self.intersect_index(other).map(|rci| self.info.rc(rci)) + } + + /// Returns true if `other` is a subclass of this register class. + /// A register class is considered to be a subclass of itself. + pub fn has_subclass<RCI: Into<RegClassIndex>>(&self, other: RCI) -> bool { + self.subclasses & (1 << other.into().0) as u32 != 0 + } + + /// Get the top-level register class containing this class. + pub fn toprc(&self) -> RegClass { + self.info.rc(RegClassIndex(self.toprc)) + } + + /// Get a specific register unit in this class. + pub fn unit(&self, offset: usize) -> RegUnit { + let uoffset = offset * usize::from(self.width); + self.first + uoffset as RegUnit + } + + /// Does this register class contain `regunit`? + pub fn contains(&self, regunit: RegUnit) -> bool { + self.mask[(regunit / 32) as usize] & (1u32 << (regunit % 32) as u32) != 0 + } + + /// If the pinned register is used, is the given regunit the pinned register of this class? + #[inline] + pub fn is_pinned_reg(&self, enabled: bool, regunit: RegUnit) -> bool { + enabled + && self + .pinned_reg + .map_or(false, |pinned_reg| pinned_reg == regunit) + } + + /// Calculate the index of the register inside the class. + pub fn index_of(&self, regunit: RegUnit) -> u16 { + assert!( + self.contains(regunit), + "the {} register class does not contain {}", + self.name, + regunit + ); + regunit - self.first + } +} + +impl fmt::Display for RegClassData { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.name) + } +} + +impl fmt::Debug for RegClassData { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.name) + } +} + +/// Within an ISA, register classes are uniquely identified by their index. +impl PartialEq for RegClassData { + fn eq(&self, other: &Self) -> bool { + self.index == other.index + } +} + +/// A small reference to a register class. +/// +/// Use this when storing register classes in compact data structures. The `RegInfo::rc()` method +/// can be used to get the real register class reference back. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct RegClassIndex(u8); + +impl EntityRef for RegClassIndex { + fn new(idx: usize) -> Self { + Self(idx as u8) + } + + fn index(self) -> usize { + usize::from(self.0) + } +} + +impl From<RegClass> for RegClassIndex { + fn from(rc: RegClass) -> Self { + Self(rc.index) + } +} + +impl fmt::Display for RegClassIndex { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "rci{}", self.0) + } +} + +/// Test of two registers overlap. +/// +/// A register is identified as a `(RegClass, RegUnit)` pair. The register class is needed to +/// determine the width (in regunits) of the register. +pub fn regs_overlap(rc1: RegClass, reg1: RegUnit, rc2: RegClass, reg2: RegUnit) -> bool { + let end1 = reg1 + RegUnit::from(rc1.width); + let end2 = reg2 + RegUnit::from(rc2.width); + !(end1 <= reg2 || end2 <= reg1) +} + +/// Information about the registers in an ISA. 
+/// +/// The `RegUnit` data structure collects all relevant static information about the registers in an +/// ISA. +#[derive(Clone)] +pub struct RegInfo { + /// All register banks, ordered by their `first_unit`. The register banks are disjoint, but + /// there may be holes of unused register unit numbers between banks due to alignment. + pub banks: &'static [RegBank], + + /// All register classes ordered topologically so a sub-class always follows its parent. + pub classes: &'static [RegClass], +} + +impl RegInfo { + /// Get the register bank holding `regunit`. + pub fn bank_containing_regunit(&self, regunit: RegUnit) -> Option<&RegBank> { + // We could do a binary search, but most ISAs have only two register banks... + self.banks.iter().find(|b| b.contains(regunit)) + } + + /// Try to parse a regunit name. The name is not expected to begin with `%`. + pub fn parse_regunit(&self, name: &str) -> Option<RegUnit> { + self.banks + .iter() + .filter_map(|b| b.parse_regunit(name)) + .next() + } + + /// Make a temporary object that can display a register unit. + pub fn display_regunit(&self, regunit: RegUnit) -> DisplayRegUnit { + DisplayRegUnit { + regunit, + reginfo: self, + } + } + + /// Get the register class corresponding to `idx`. + pub fn rc(&self, idx: RegClassIndex) -> RegClass { + self.classes[idx.index()] + } + + /// Get the top-level register class containing the `idx` class. + pub fn toprc(&self, idx: RegClassIndex) -> RegClass { + self.classes[self.rc(idx).toprc as usize] + } +} + +/// Temporary object that holds enough information to print a register unit. +pub struct DisplayRegUnit<'a> { + regunit: RegUnit, + reginfo: &'a RegInfo, +} + +impl<'a> fmt::Display for DisplayRegUnit<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.reginfo.bank_containing_regunit(self.regunit) { + Some(b) => b.write_regunit(f, self.regunit), + None => write!(f, "%INVALID{}", self.regunit), + } + } +} + +#[test] +fn assert_sizes() { + use cranelift_codegen_shared::constants; + use std::mem::size_of; + + // In these tests, size_of returns number of bytes: we actually want the number of bits, so + // multiply these by 8. + assert!( + (size_of::<RegClassMask>() * 8) <= constants::MAX_NUM_REG_CLASSES, + "need to bump MAX_NUM_REG_CLASSES or change RegClassMask type" + ); + + assert!( + constants::MAX_NUM_REG_CLASSES < (1 << (size_of::<RegClassIndex>() * 8)), + "need to change RegClassIndex's type to a wider type" + ); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs new file mode 100644 index 0000000000..44c5f36afe --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/abi.rs @@ -0,0 +1,149 @@ +//! RISC-V ABI implementation. +//! +//! This module implements the RISC-V calling convention through the primary `legalize_signature()` +//! entry point. +//! +//! This doesn't support the soft-float ABI at the moment. 
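Concretely, the assigner in this file places the first eight arguments (six under RV32E) in registers starting at `x10` for integers and `f10` for floats, and spills the rest to pointer-aligned stack slots; the rounding helper it relies on is the usual power-of-two round-up:

```rust
/// Round `value` up to the next multiple of `to`, which must be a power of two.
fn align(value: u32, to: u32) -> u32 {
    (value + to - 1) & !(to - 1)
}

fn main() {
    assert_eq!(align(5, 8), 8); // 5 bytes of stack arguments round up to 8.
    assert_eq!(align(16, 8), 16); // already-aligned values are unchanged.
}
```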
+ +use super::registers::{FPR, GPR}; +use super::settings; +use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion}; +use crate::ir::{self, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, Type}; +use crate::isa::RegClass; +use crate::regalloc::RegisterSet; +use alloc::borrow::Cow; +use core::i32; +use target_lexicon::Triple; + +struct Args { + pointer_bits: u8, + pointer_bytes: u8, + pointer_type: Type, + regs: u32, + reg_limit: u32, + offset: u32, +} + +impl Args { + fn new(bits: u8, enable_e: bool) -> Self { + Self { + pointer_bits: bits, + pointer_bytes: bits / 8, + pointer_type: Type::int(u16::from(bits)).unwrap(), + regs: 0, + reg_limit: if enable_e { 6 } else { 8 }, + offset: 0, + } + } +} + +impl ArgAssigner for Args { + fn assign(&mut self, arg: &AbiParam) -> ArgAction { + fn align(value: u32, to: u32) -> u32 { + (value + to - 1) & !(to - 1) + } + + let ty = arg.value_type; + + // Check for a legal type. + // RISC-V doesn't have SIMD at all, so break all vectors down. + if ty.is_vector() { + return ValueConversion::VectorSplit.into(); + } + + // Large integers and booleans are broken down to fit in a register. + if !ty.is_float() && ty.bits() > u16::from(self.pointer_bits) { + // Align registers and stack to a multiple of two pointers. + self.regs = align(self.regs, 2); + self.offset = align(self.offset, 2 * u32::from(self.pointer_bytes)); + return ValueConversion::IntSplit.into(); + } + + // Small integers are extended to the size of a pointer register. + if ty.is_int() && ty.bits() < u16::from(self.pointer_bits) { + match arg.extension { + ArgumentExtension::None => {} + ArgumentExtension::Uext => return ValueConversion::Uext(self.pointer_type).into(), + ArgumentExtension::Sext => return ValueConversion::Sext(self.pointer_type).into(), + } + } + + if self.regs < self.reg_limit { + // Assign to a register. + let reg = if ty.is_float() { + FPR.unit(10 + self.regs as usize) + } else { + GPR.unit(10 + self.regs as usize) + }; + self.regs += 1; + ArgumentLoc::Reg(reg).into() + } else { + // Assign a stack location. + let loc = ArgumentLoc::Stack(self.offset as i32); + self.offset += u32::from(self.pointer_bytes); + debug_assert!(self.offset <= i32::MAX as u32); + loc.into() + } + } +} + +/// Legalize `sig` for RISC-V. +pub fn legalize_signature( + sig: &mut Cow<ir::Signature>, + triple: &Triple, + isa_flags: &settings::Flags, + current: bool, +) { + let bits = triple.pointer_width().unwrap().bits(); + + let mut args = Args::new(bits, isa_flags.enable_e()); + if let Some(new_params) = legalize_args(&sig.params, &mut args) { + sig.to_mut().params = new_params; + } + + let mut rets = Args::new(bits, isa_flags.enable_e()); + if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) { + sig.to_mut().returns = new_returns; + } + + if current { + let ptr = Type::int(u16::from(bits)).unwrap(); + + // Add the link register as an argument and return value. + // + // The `jalr` instruction implementing a return can technically accept the return address + // in any register, but a micro-architecture with a return address predictor will only + // recognize it as a return if the address is in `x1`. + let link = AbiParam::special_reg(ptr, ArgumentPurpose::Link, GPR.unit(1)); + sig.to_mut().params.push(link); + sig.to_mut().returns.push(link); + } +} + +/// Get register class for a type appearing in a legalized signature. 
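A doctest-style sketch of the mapping implemented below; it only type-checks inside this module, since `FPR` and `GPR` are backend-internal register classes:

```rust
#[test]
fn regclass_mapping_sketch() {
    use crate::ir::types;
    assert_eq!(regclass_for_abi_type(types::F64), FPR);
    assert_eq!(regclass_for_abi_type(types::I32), GPR);
}
```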
+pub fn regclass_for_abi_type(ty: Type) -> RegClass { + if ty.is_float() { + FPR + } else { + GPR + } +} + +pub fn allocatable_registers(_func: &ir::Function, isa_flags: &settings::Flags) -> RegisterSet { + let mut regs = RegisterSet::new(); + regs.take(GPR, GPR.unit(0)); // Hard-wired 0. + // %x1 is the link register which is available for allocation. + regs.take(GPR, GPR.unit(2)); // Stack pointer. + regs.take(GPR, GPR.unit(3)); // Global pointer. + regs.take(GPR, GPR.unit(4)); // Thread pointer. + // TODO: %x8 is the frame pointer. Reserve it? + + // Remove %x16 and up for RV32E. + if isa_flags.enable_e() { + for u in 16..32 { + regs.take(GPR, GPR.unit(u)); + } + } + + regs +} diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs new file mode 100644 index 0000000000..a1d2b82e12 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/binemit.rs @@ -0,0 +1,182 @@ +//! Emitting binary RISC-V machine code. + +use crate::binemit::{bad_encoding, CodeSink, Reloc}; +use crate::ir::{Function, Inst, InstructionData}; +use crate::isa::{RegUnit, StackBaseMask, StackRef, TargetIsa}; +use crate::predicates::is_signed_int; +use crate::regalloc::RegDiversions; +use core::u32; + +include!(concat!(env!("OUT_DIR"), "/binemit-riscv.rs")); + +/// R-type instructions. +/// +/// 31 24 19 14 11 6 +/// funct7 rs2 rs1 funct3 rd opcode +/// 25 20 15 12 7 0 +/// +/// Encoding bits: `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`. +fn put_r<CS: CodeSink + ?Sized>(bits: u16, rs1: RegUnit, rs2: RegUnit, rd: RegUnit, sink: &mut CS) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let funct3 = (bits >> 5) & 0x7; + let funct7 = (bits >> 8) & 0x7f; + let rs1 = u32::from(rs1) & 0x1f; + let rs2 = u32::from(rs2) & 0x1f; + let rd = u32::from(rd) & 0x1f; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= rd << 7; + i |= funct3 << 12; + i |= rs1 << 15; + i |= rs2 << 20; + i |= funct7 << 25; + + sink.put4(i); +} + +/// R-type instructions with a shift amount instead of rs2. +/// +/// 31 25 19 14 11 6 +/// funct7 shamt rs1 funct3 rd opcode +/// 25 20 15 12 7 0 +/// +/// Both funct7 and shamt contribute to bit 25. In RV64, shamt uses it for shifts > 31. +/// +/// Encoding bits: `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`. +fn put_rshamt<CS: CodeSink + ?Sized>( + bits: u16, + rs1: RegUnit, + shamt: i64, + rd: RegUnit, + sink: &mut CS, +) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let funct3 = (bits >> 5) & 0x7; + let funct7 = (bits >> 8) & 0x7f; + let rs1 = u32::from(rs1) & 0x1f; + let shamt = shamt as u32 & 0x3f; + let rd = u32::from(rd) & 0x1f; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= rd << 7; + i |= funct3 << 12; + i |= rs1 << 15; + i |= shamt << 20; + i |= funct7 << 25; + + sink.put4(i); +} + +/// I-type instructions. +/// +/// 31 19 14 11 6 +/// imm rs1 funct3 rd opcode +/// 20 15 12 7 0 +/// +/// Encoding bits: `opcode[6:2] | (funct3 << 5)` +fn put_i<CS: CodeSink + ?Sized>(bits: u16, rs1: RegUnit, imm: i64, rd: RegUnit, sink: &mut CS) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let funct3 = (bits >> 5) & 0x7; + let rs1 = u32::from(rs1) & 0x1f; + let rd = u32::from(rd) & 0x1f; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= rd << 7; + i |= funct3 << 12; + i |= rs1 << 15; + i |= (imm << 20) as u32; + + sink.put4(i); +} + +/// U-type instructions. 
+/// +/// 31 11 6 +/// imm rd opcode +/// 12 7 0 +/// +/// Encoding bits: `opcode[6:2] | (funct3 << 5)` +fn put_u<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rd: RegUnit, sink: &mut CS) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let rd = u32::from(rd) & 0x1f; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= rd << 7; + i |= imm as u32 & 0xfffff000; + + sink.put4(i); +} + +/// SB-type branch instructions. +/// +/// 31 24 19 14 11 6 +/// imm rs2 rs1 funct3 imm opcode +/// 25 20 15 12 7 0 +/// +/// Encoding bits: `opcode[6:2] | (funct3 << 5)` +fn put_sb<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rs1: RegUnit, rs2: RegUnit, sink: &mut CS) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let funct3 = (bits >> 5) & 0x7; + let rs1 = u32::from(rs1) & 0x1f; + let rs2 = u32::from(rs2) & 0x1f; + + debug_assert!(is_signed_int(imm, 13, 1), "SB out of range {:#x}", imm); + let imm = imm as u32; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= funct3 << 12; + i |= rs1 << 15; + i |= rs2 << 20; + + // The displacement is completely hashed up. + i |= ((imm >> 11) & 0x1) << 7; + i |= ((imm >> 1) & 0xf) << 8; + i |= ((imm >> 5) & 0x3f) << 25; + i |= ((imm >> 12) & 0x1) << 31; + + sink.put4(i); +} + +/// UJ-type jump instructions. +/// +/// 31 11 6 +/// imm rd opcode +/// 12 7 0 +/// +/// Encoding bits: `opcode[6:2]` +fn put_uj<CS: CodeSink + ?Sized>(bits: u16, imm: i64, rd: RegUnit, sink: &mut CS) { + let bits = u32::from(bits); + let opcode5 = bits & 0x1f; + let rd = u32::from(rd) & 0x1f; + + debug_assert!(is_signed_int(imm, 21, 1), "UJ out of range {:#x}", imm); + let imm = imm as u32; + + // 0-6: opcode + let mut i = 0x3; + i |= opcode5 << 2; + i |= rd << 7; + + // The displacement is completely hashed up. + i |= imm & 0xff000; + i |= ((imm >> 11) & 0x1) << 20; + i |= ((imm >> 1) & 0x3ff) << 21; + i |= ((imm >> 20) & 0x1) << 31; + + sink.put4(i); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs new file mode 100644 index 0000000000..76184ad727 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/enc_tables.rs @@ -0,0 +1,18 @@ +//! Encoding tables for RISC-V. + +use super::registers::*; +use crate::ir; +use crate::isa; +use crate::isa::constraints::*; +use crate::isa::enc_tables::*; +use crate::isa::encoding::{base_size, RecipeSizing}; +use crate::predicates; + +// Include the generated encoding tables: +// - `LEVEL1_RV32` +// - `LEVEL1_RV64` +// - `LEVEL2` +// - `ENCLIST` +// - `INFO` +include!(concat!(env!("OUT_DIR"), "/encoding-riscv.rs")); +include!(concat!(env!("OUT_DIR"), "/legalize-riscv.rs")); diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs new file mode 100644 index 0000000000..e69a3a0e12 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/mod.rs @@ -0,0 +1,295 @@ +//! RISC-V Instruction Set Architecture. 
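A condensed sketch of how this backend is constructed and queried from the outside, following the tests later in this file; it assumes the crate was built with the `riscv` feature:

```rust
use cranelift_codegen::{isa, settings};
use std::str::FromStr;
use target_lexicon::Triple;

fn main() {
    let shared_flags = settings::Flags::new(settings::builder());
    let isa = isa::lookup(Triple::from_str("riscv64").unwrap())
        .expect("riscv backend enabled in this build")
        .finish(shared_flags);
    assert_eq!(isa.name(), "riscv");
}
```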
+ +mod abi; +mod binemit; +mod enc_tables; +mod registers; +pub mod settings; + +use super::super::settings as shared_settings; +#[cfg(feature = "testing_hooks")] +use crate::binemit::CodeSink; +use crate::binemit::{emit_function, MemoryCodeSink}; +use crate::ir; +use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encodings}; +use crate::isa::Builder as IsaBuilder; +use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa}; +use crate::regalloc; +use alloc::borrow::Cow; +use alloc::boxed::Box; +use core::any::Any; +use core::fmt; +use target_lexicon::{PointerWidth, Triple}; + +#[allow(dead_code)] +struct Isa { + triple: Triple, + shared_flags: shared_settings::Flags, + isa_flags: settings::Flags, + cpumode: &'static [shared_enc_tables::Level1Entry<u16>], +} + +/// Get an ISA builder for creating RISC-V targets. +pub fn isa_builder(triple: Triple) -> IsaBuilder { + IsaBuilder { + triple, + setup: settings::builder(), + constructor: isa_constructor, + } +} + +fn isa_constructor( + triple: Triple, + shared_flags: shared_settings::Flags, + builder: shared_settings::Builder, +) -> Box<dyn TargetIsa> { + let level1 = match triple.pointer_width().unwrap() { + PointerWidth::U16 => panic!("16-bit RISC-V unrecognized"), + PointerWidth::U32 => &enc_tables::LEVEL1_RV32[..], + PointerWidth::U64 => &enc_tables::LEVEL1_RV64[..], + }; + Box::new(Isa { + triple, + isa_flags: settings::Flags::new(&shared_flags, builder), + shared_flags, + cpumode: level1, + }) +} + +impl TargetIsa for Isa { + fn name(&self) -> &'static str { + "riscv" + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &shared_settings::Flags { + &self.shared_flags + } + + fn register_info(&self) -> RegInfo { + registers::INFO.clone() + } + + fn encoding_info(&self) -> EncInfo { + enc_tables::INFO.clone() + } + + fn legal_encodings<'a>( + &'a self, + func: &'a ir::Function, + inst: &'a ir::InstructionData, + ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + lookup_enclist( + ctrl_typevar, + inst, + func, + self.cpumode, + &enc_tables::LEVEL2[..], + &enc_tables::ENCLISTS[..], + &enc_tables::LEGALIZE_ACTIONS[..], + &enc_tables::RECIPE_PREDICATES[..], + &enc_tables::INST_PREDICATES[..], + self.isa_flags.predicate_view(), + ) + } + + fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) { + abi::legalize_signature(sig, &self.triple, &self.isa_flags, current) + } + + fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass { + abi::regclass_for_abi_type(ty) + } + + fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet { + abi::allocatable_registers(func, &self.isa_flags) + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + func: &ir::Function, + inst: ir::Inst, + divert: &mut regalloc::RegDiversions, + sink: &mut dyn CodeSink, + ) { + binemit::emit_inst(func, inst, divert, sink, self) + } + + fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) { + emit_function(func, binemit::emit_inst, sink, self) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + unimplemented!() + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + unimplemented!() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} + +#[cfg(test)] +mod tests { + use crate::ir::{immediates, types}; + use crate::ir::{Function, InstructionData, Opcode}; + use crate::isa; + use crate::settings::{self, Configurable}; + use alloc::string::{String, ToString}; + use core::str::FromStr; + use 
target_lexicon::triple; + + fn encstr(isa: &dyn isa::TargetIsa, enc: Result<isa::Encoding, isa::Legalize>) -> String { + match enc { + Ok(e) => isa.encoding_info().display(e).to_string(), + Err(_) => "no encoding".to_string(), + } + } + + #[test] + fn test_64bitenc() { + let shared_builder = settings::builder(); + let shared_flags = settings::Flags::new(shared_builder); + let isa = isa::lookup(triple!("riscv64")) + .unwrap() + .finish(shared_flags); + + let mut func = Function::new(); + let block = func.dfg.make_block(); + let arg64 = func.dfg.append_block_param(block, types::I64); + let arg32 = func.dfg.append_block_param(block, types::I32); + + // Try to encode iadd_imm.i64 v1, -10. + let inst64 = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg64, + imm: immediates::Imm64::new(-10), + }; + + // ADDI is I/0b00100 + assert_eq!( + encstr(&*isa, isa.encode(&func, &inst64, types::I64)), + "Ii#04" + ); + + // Try to encode iadd_imm.i64 v1, -10000. + let inst64_large = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg64, + imm: immediates::Imm64::new(-10000), + }; + + // Immediate is out of range for ADDI. + assert!(isa.encode(&func, &inst64_large, types::I64).is_err()); + + // Create an iadd_imm.i32 which is encodable in RV64. + let inst32 = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg32, + imm: immediates::Imm64::new(10), + }; + + // ADDIW is I/0b00110 + assert_eq!( + encstr(&*isa, isa.encode(&func, &inst32, types::I32)), + "Ii#06" + ); + } + + // Same as above, but for RV32. + #[test] + fn test_32bitenc() { + let shared_builder = settings::builder(); + let shared_flags = settings::Flags::new(shared_builder); + let isa = isa::lookup(triple!("riscv32")) + .unwrap() + .finish(shared_flags); + + let mut func = Function::new(); + let block = func.dfg.make_block(); + let arg64 = func.dfg.append_block_param(block, types::I64); + let arg32 = func.dfg.append_block_param(block, types::I32); + + // Try to encode iadd_imm.i64 v1, -10. + let inst64 = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg64, + imm: immediates::Imm64::new(-10), + }; + + // In 32-bit mode, an i64 bit add should be narrowed. + assert!(isa.encode(&func, &inst64, types::I64).is_err()); + + // Try to encode iadd_imm.i64 v1, -10000. + let inst64_large = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg64, + imm: immediates::Imm64::new(-10000), + }; + + // In 32-bit mode, an i64 bit add should be narrowed. + assert!(isa.encode(&func, &inst64_large, types::I64).is_err()); + + // Create an iadd_imm.i32 which is encodable in RV32. + let inst32 = InstructionData::BinaryImm64 { + opcode: Opcode::IaddImm, + arg: arg32, + imm: immediates::Imm64::new(10), + }; + + // ADDI is I/0b00100 + assert_eq!( + encstr(&*isa, isa.encode(&func, &inst32, types::I32)), + "Ii#04" + ); + + // Create an imul.i32 which is encodable in RV32, but only when use_m is true. + let mul32 = InstructionData::Binary { + opcode: Opcode::Imul, + args: [arg32, arg32], + }; + + assert!(isa.encode(&func, &mul32, types::I32).is_err()); + } + + #[test] + fn test_rv32m() { + let shared_builder = settings::builder(); + let shared_flags = settings::Flags::new(shared_builder); + + // Set the supports_m stting which in turn enables the use_m predicate that unlocks + // encodings for imul. 
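+        // (Judging by the flag list in this ISA's settings module, `use_m` is presumably
+        // derived from `supports_m` together with the default-on `enable_m` flag.)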
+ let mut isa_builder = isa::lookup(triple!("riscv32")).unwrap(); + isa_builder.enable("supports_m").unwrap(); + + let isa = isa_builder.finish(shared_flags); + + let mut func = Function::new(); + let block = func.dfg.make_block(); + let arg32 = func.dfg.append_block_param(block, types::I32); + + // Create an imul.i32 which is encodable in RV32M. + let mul32 = InstructionData::Binary { + opcode: Opcode::Imul, + args: [arg32, arg32], + }; + assert_eq!( + encstr(&*isa, isa.encode(&func, &mul32, types::I32)), + "R#10c" + ); + } +} + +impl fmt::Display for Isa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}\n{}", self.shared_flags, self.isa_flags) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs new file mode 100644 index 0000000000..9043b7f65f --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/registers.rs @@ -0,0 +1,50 @@ +//! RISC-V register descriptions. + +use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit}; + +include!(concat!(env!("OUT_DIR"), "/registers-riscv.rs")); + +#[cfg(test)] +mod tests { + use super::{FPR, GPR, INFO}; + use crate::isa::RegUnit; + use alloc::string::{String, ToString}; + + #[test] + fn unit_encodings() { + assert_eq!(INFO.parse_regunit("x0"), Some(0)); + assert_eq!(INFO.parse_regunit("x31"), Some(31)); + assert_eq!(INFO.parse_regunit("f0"), Some(32)); + assert_eq!(INFO.parse_regunit("f31"), Some(63)); + + assert_eq!(INFO.parse_regunit("x32"), None); + assert_eq!(INFO.parse_regunit("f32"), None); + } + + #[test] + fn unit_names() { + fn uname(ru: RegUnit) -> String { + INFO.display_regunit(ru).to_string() + } + + assert_eq!(uname(0), "%x0"); + assert_eq!(uname(1), "%x1"); + assert_eq!(uname(31), "%x31"); + assert_eq!(uname(32), "%f0"); + assert_eq!(uname(33), "%f1"); + assert_eq!(uname(63), "%f31"); + assert_eq!(uname(64), "%INVALID64"); + } + + #[test] + fn classes() { + assert!(GPR.contains(GPR.unit(0))); + assert!(GPR.contains(GPR.unit(31))); + assert!(!FPR.contains(GPR.unit(0))); + assert!(!FPR.contains(GPR.unit(31))); + assert!(!GPR.contains(FPR.unit(0))); + assert!(!GPR.contains(FPR.unit(31))); + assert!(FPR.contains(FPR.unit(0))); + assert!(FPR.contains(FPR.unit(31))); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs b/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs new file mode 100644 index 0000000000..40aa3bed2b --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/riscv/settings.rs @@ -0,0 +1,56 @@ +//! RISC-V Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/riscv/mod.rs`. +include!(concat!(env!("OUT_DIR"), "/settings-riscv.rs")); + +#[cfg(test)] +mod tests { + use super::{builder, Flags}; + use crate::settings::{self, Configurable}; + use alloc::string::ToString; + + #[test] + fn display_default() { + let shared = settings::Flags::new(settings::builder()); + let b = builder(); + let f = Flags::new(&shared, b); + assert_eq!( + f.to_string(), + "[riscv]\n\ + supports_m = false\n\ + supports_a = false\n\ + supports_f = false\n\ + supports_d = false\n\ + enable_m = true\n\ + enable_e = false\n" + ); + // Predicates are not part of the Display output. 
+ assert_eq!(f.full_float(), false); + } + + #[test] + fn predicates() { + let mut sb = settings::builder(); + sb.set("enable_simd", "true").unwrap(); + let shared = settings::Flags::new(sb); + let mut b = builder(); + b.enable("supports_f").unwrap(); + b.enable("supports_d").unwrap(); + let f = Flags::new(&shared, b); + assert_eq!(f.full_float(), true); + + let mut sb = settings::builder(); + sb.set("enable_simd", "false").unwrap(); + let shared = settings::Flags::new(sb); + let mut b = builder(); + b.enable("supports_f").unwrap(); + b.enable("supports_d").unwrap(); + let f = Flags::new(&shared, b); + assert_eq!(f.full_float(), false); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/stack.rs b/third_party/rust/cranelift-codegen/src/isa/stack.rs new file mode 100644 index 0000000000..ae093bed28 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/stack.rs @@ -0,0 +1,95 @@ +//! Low-level details of stack accesses. +//! +//! The `ir::StackSlots` type deals with stack slots and stack frame layout. The `StackRef` type +//! defined in this module expresses the low-level details of accessing a stack slot from an +//! encoded instruction. + +use crate::ir::stackslot::{StackOffset, StackSlotKind, StackSlots}; +use crate::ir::StackSlot; + +/// A method for referencing a stack slot in the current stack frame. +/// +/// Stack slots are addressed with a constant offset from a base register. The base can be the +/// stack pointer, the frame pointer, or (in the future) a zone register pointing to an inner zone +/// of a large stack frame. +#[derive(Clone, Copy, Debug)] +pub struct StackRef { + /// The base register to use for addressing. + pub base: StackBase, + + /// Immediate offset from the base register to the first byte of the stack slot. + pub offset: StackOffset, +} + +impl StackRef { + /// Get a reference to the stack slot `ss` using one of the base pointers in `mask`. + pub fn masked(ss: StackSlot, mask: StackBaseMask, frame: &StackSlots) -> Option<Self> { + // Try an SP-relative reference. + if mask.contains(StackBase::SP) { + return Some(Self::sp(ss, frame)); + } + + // No reference possible with this mask. + None + } + + /// Get a reference to `ss` using the stack pointer as a base. + pub fn sp(ss: StackSlot, frame: &StackSlots) -> Self { + let size = frame + .layout_info + .expect("Stack layout must be computed before referencing stack slots") + .frame_size; + let slot = &frame[ss]; + let offset = if slot.kind == StackSlotKind::OutgoingArg { + // Outgoing argument slots have offsets relative to our stack pointer. + slot.offset.unwrap() + } else { + // All other slots have offsets relative to our caller's stack frame. + // Offset where SP is pointing. (All ISAs have stacks growing downwards.) + let sp_offset = -(size as StackOffset); + slot.offset.unwrap() - sp_offset + }; + Self { + base: StackBase::SP, + offset, + } + } +} + +/// Generic base register for referencing stack slots. +/// +/// Most ISAs have a stack pointer and an optional frame pointer, so provide generic names for +/// those two base pointers. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum StackBase { + /// Use the stack pointer. + SP = 0, + + /// Use the frame pointer (if one is present). + FP = 1, + + /// Use an explicit zone pointer in a general-purpose register. + /// + /// This feature is not yet implemented. + Zone = 2, +} + +/// Bit mask of supported stack bases. 
+/// +/// Many instruction encodings can use different base registers while others only work with the +/// stack pointer, say. A `StackBaseMask` is a bit mask of supported stack bases for a given +/// instruction encoding. +/// +/// This behaves like a set of `StackBase` variants. +/// +/// The internal representation as a `u8` is public because stack base masks are used in constant +/// tables generated from the meta-language encoding definitions. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct StackBaseMask(pub u8); + +impl StackBaseMask { + /// Check if this mask contains the `base` variant. + pub fn contains(self, base: StackBase) -> bool { + self.0 & (1 << base as usize) != 0 + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/test_utils.rs b/third_party/rust/cranelift-codegen/src/isa/test_utils.rs new file mode 100644 index 0000000000..01c500d6ca --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/test_utils.rs @@ -0,0 +1,86 @@ +// This is unused when no platforms with the new backend are enabled. +#![allow(dead_code)] + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::Value; +use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode}; +use crate::isa::TargetIsa; + +use alloc::vec::Vec; +use std::string::String; + +pub struct TestCodeSink { + bytes: Vec<u8>, +} + +impl TestCodeSink { + /// Create a new TestCodeSink. + pub fn new() -> TestCodeSink { + TestCodeSink { bytes: vec![] } + } + + /// Return the code emitted to this sink as a hex string. + pub fn stringify(&self) -> String { + // This is pretty lame, but whatever .. + use std::fmt::Write; + let mut s = String::with_capacity(self.bytes.len() * 2); + for b in &self.bytes { + write!(&mut s, "{:02X}", b).unwrap(); + } + s + } +} + +impl CodeSink for TestCodeSink { + fn offset(&self) -> CodeOffset { + self.bytes.len() as CodeOffset + } + + fn put1(&mut self, x: u8) { + self.bytes.push(x); + } + + fn put2(&mut self, x: u16) { + self.bytes.push((x >> 0) as u8); + self.bytes.push((x >> 8) as u8); + } + + fn put4(&mut self, mut x: u32) { + for _ in 0..4 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn put8(&mut self, mut x: u64) { + for _ in 0..8 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn reloc_external( + &mut self, + _srcloc: SourceLoc, + _rel: Reloc, + _name: &ExternalName, + _addend: Addend, + ) { + } + + fn reloc_constant(&mut self, _rel: Reloc, _constant_offset: ConstantOffset) {} + + fn reloc_jt(&mut self, _rel: Reloc, _jt: JumpTable) {} + + fn trap(&mut self, _code: TrapCode, _srcloc: SourceLoc) {} + + fn begin_jumptables(&mut self) {} + + fn begin_rodata(&mut self) {} + + fn end_codegen(&mut self) {} + + fn add_stack_map(&mut self, _val_list: &[Value], _func: &Function, _isa: &dyn TargetIsa) {} + + fn add_call_site(&mut self, _opcode: Opcode, _srcloc: SourceLoc) {} +} diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/unwind.rs new file mode 100644 index 0000000000..a4c5f0b6b7 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/unwind.rs @@ -0,0 +1,88 @@ +//! Represents information relating to function unwinding. +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "unwind")] +pub mod systemv; + +#[cfg(feature = "unwind")] +pub mod winx64; + +/// Represents unwind information for a single function. 
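+///
+/// A consumer typically dispatches on the variant; a minimal sketch (assuming the
+/// `unwind` feature is enabled, and eliding what each arm actually does):
+///
+/// ```ignore
+/// match info {
+///     UnwindInfo::WindowsX64(ui) => { /* write Windows UNWIND_INFO, e.g. ui.emit(&mut buf) */ }
+///     UnwindInfo::SystemV(ui) => { /* build a DWARF FDE, e.g. ui.to_fde(addr) */ }
+/// }
+/// ```
+///
+/// The enum is `#[non_exhaustive]`, so matches outside this crate also need a wildcard arm.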
+#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +#[non_exhaustive] +pub enum UnwindInfo { + /// Windows x64 ABI unwind information. + #[cfg(feature = "unwind")] + WindowsX64(winx64::UnwindInfo), + /// System V ABI unwind information. + #[cfg(feature = "unwind")] + SystemV(systemv::UnwindInfo), +} + +/// Intermediate representation for the unwind information +/// generated by a backend. +pub mod input { + use crate::binemit::CodeOffset; + use alloc::vec::Vec; + #[cfg(feature = "enable-serde")] + use serde::{Deserialize, Serialize}; + + /// Elementary operation in the unwind operations. + #[derive(Clone, Debug, PartialEq, Eq)] + #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] + pub enum UnwindCode<Reg> { + /// Defines that a register is saved at the specified offset. + SaveRegister { + /// The saved register. + reg: Reg, + /// The specified offset relative to the stack pointer. + stack_offset: u32, + }, + /// Defines that a register is as defined before call. + RestoreRegister { + /// The restored register. + reg: Reg, + }, + /// The stack pointer was adjusted to allocate the stack. + StackAlloc { + /// Size to allocate. + size: u32, + }, + /// The stack pointer was adjusted to free the stack. + StackDealloc { + /// Size to deallocate. + size: u32, + }, + /// The alternative register was assigned as frame pointer base. + SetFramePointer { + /// The specified register. + reg: Reg, + }, + /// Restores a frame pointer base to default register. + RestoreFramePointer, + /// Saves the state. + RememberState, + /// Restores the state. + RestoreState, + } + + /// Unwind information as generated by a backend. + #[derive(Clone, Debug, PartialEq, Eq)] + #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] + pub struct UnwindInfo<Reg> { + /// Size of the prologue. + pub prologue_size: CodeOffset, + /// Unwind codes for prologue. + pub prologue_unwind_codes: Vec<(CodeOffset, UnwindCode<Reg>)>, + /// Unwind codes for epilogues. + pub epilogues_unwind_codes: Vec<Vec<(CodeOffset, UnwindCode<Reg>)>>, + /// Entire function size. + pub function_size: CodeOffset, + /// Platform word size in bytes. + pub word_size: u8, + /// Initial stack pointer offset. + pub initial_sp_offset: u8, + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs new file mode 100644 index 0000000000..dfb2ef5936 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/unwind/systemv.rs @@ -0,0 +1,313 @@ +//! System V ABI unwind information. + +use crate::isa::unwind::input; +use crate::result::{CodegenError, CodegenResult}; +use alloc::vec::Vec; +use gimli::write::{Address, FrameDescriptionEntry}; +use thiserror::Error; + +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +type Register = u16; + +/// Enumerate the errors possible in mapping Cranelift registers to their DWARF equivalent. +#[allow(missing_docs)] +#[derive(Error, Debug, PartialEq, Eq)] +pub enum RegisterMappingError { + #[error("unable to find bank for register info")] + MissingBank, + #[error("register mapping is currently only implemented for x86_64")] + UnsupportedArchitecture, + #[error("unsupported register bank: {0}")] + UnsupportedRegisterBank(&'static str), +} + +// This mirrors gimli's CallFrameInstruction, but is serializable +// This excludes CfaExpression, Expression, ValExpression due to +// https://github.com/gimli-rs/gimli/issues/513. 
+// TODO: if gimli ever adds serialization support, remove this type +#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub(crate) enum CallFrameInstruction { + Cfa(Register, i32), + CfaRegister(Register), + CfaOffset(i32), + Restore(Register), + Undefined(Register), + SameValue(Register), + Offset(Register, i32), + ValOffset(Register, i32), + Register(Register, Register), + RememberState, + RestoreState, + ArgsSize(u32), +} + +impl From<gimli::write::CallFrameInstruction> for CallFrameInstruction { + fn from(cfi: gimli::write::CallFrameInstruction) -> Self { + use gimli::write::CallFrameInstruction; + + match cfi { + CallFrameInstruction::Cfa(reg, offset) => Self::Cfa(reg.0, offset), + CallFrameInstruction::CfaRegister(reg) => Self::CfaRegister(reg.0), + CallFrameInstruction::CfaOffset(offset) => Self::CfaOffset(offset), + CallFrameInstruction::Restore(reg) => Self::Restore(reg.0), + CallFrameInstruction::Undefined(reg) => Self::Undefined(reg.0), + CallFrameInstruction::SameValue(reg) => Self::SameValue(reg.0), + CallFrameInstruction::Offset(reg, offset) => Self::Offset(reg.0, offset), + CallFrameInstruction::ValOffset(reg, offset) => Self::ValOffset(reg.0, offset), + CallFrameInstruction::Register(reg1, reg2) => Self::Register(reg1.0, reg2.0), + CallFrameInstruction::RememberState => Self::RememberState, + CallFrameInstruction::RestoreState => Self::RestoreState, + CallFrameInstruction::ArgsSize(size) => Self::ArgsSize(size), + _ => { + // Cranelift's unwind support does not generate `CallFrameInstruction`s with + // Expression at this moment, and it is not trivial to + // serialize such instructions. + panic!("CallFrameInstruction with Expression not supported"); + } + } + } +} + +impl Into<gimli::write::CallFrameInstruction> for CallFrameInstruction { + fn into(self) -> gimli::write::CallFrameInstruction { + use gimli::{write::CallFrameInstruction, Register}; + + match self { + Self::Cfa(reg, offset) => CallFrameInstruction::Cfa(Register(reg), offset), + Self::CfaRegister(reg) => CallFrameInstruction::CfaRegister(Register(reg)), + Self::CfaOffset(offset) => CallFrameInstruction::CfaOffset(offset), + Self::Restore(reg) => CallFrameInstruction::Restore(Register(reg)), + Self::Undefined(reg) => CallFrameInstruction::Undefined(Register(reg)), + Self::SameValue(reg) => CallFrameInstruction::SameValue(Register(reg)), + Self::Offset(reg, offset) => CallFrameInstruction::Offset(Register(reg), offset), + Self::ValOffset(reg, offset) => CallFrameInstruction::ValOffset(Register(reg), offset), + Self::Register(reg1, reg2) => { + CallFrameInstruction::Register(Register(reg1), Register(reg2)) + } + Self::RememberState => CallFrameInstruction::RememberState, + Self::RestoreState => CallFrameInstruction::RestoreState, + Self::ArgsSize(size) => CallFrameInstruction::ArgsSize(size), + } + } +} + +/// Maps UnwindInfo register to gimli's index space. +pub(crate) trait RegisterMapper<Reg> { + /// Maps Reg. + fn map(&self, reg: Reg) -> Result<Register, RegisterMappingError>; + /// Gets stack pointer register. + fn sp(&self) -> Register; +} + +/// Represents unwind information for a single System V ABI function. +/// +/// This representation is not ISA specific. 
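+///
+/// Illustrative use (the address value here is made up): convert the per-function unwind
+/// info into a DWARF frame description entry for code placed at a known constant address.
+///
+/// ```ignore
+/// let fde = unwind_info.to_fde(gimli::write::Address::Constant(0x1000));
+/// ```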
+#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct UnwindInfo { + instructions: Vec<(u32, CallFrameInstruction)>, + len: u32, +} + +impl UnwindInfo { + pub(crate) fn build<'b, Reg: PartialEq + Copy>( + unwind: input::UnwindInfo<Reg>, + map_reg: &'b dyn RegisterMapper<Reg>, + ) -> CodegenResult<Self> { + use input::UnwindCode; + let mut builder = InstructionBuilder::new(unwind.initial_sp_offset, map_reg); + + for (offset, c) in unwind.prologue_unwind_codes.iter().chain( + unwind + .epilogues_unwind_codes + .iter() + .map(|c| c.iter()) + .flatten(), + ) { + match c { + UnwindCode::SaveRegister { reg, stack_offset } => { + builder + .save_reg(*offset, *reg, *stack_offset) + .map_err(CodegenError::RegisterMappingError)?; + } + UnwindCode::StackAlloc { size } => { + builder.adjust_sp_down_imm(*offset, *size as i64); + } + UnwindCode::StackDealloc { size } => { + builder.adjust_sp_up_imm(*offset, *size as i64); + } + UnwindCode::RestoreRegister { reg } => { + builder + .restore_reg(*offset, *reg) + .map_err(CodegenError::RegisterMappingError)?; + } + UnwindCode::SetFramePointer { reg } => { + builder + .set_cfa_reg(*offset, *reg) + .map_err(CodegenError::RegisterMappingError)?; + } + UnwindCode::RestoreFramePointer => { + builder.restore_cfa(*offset); + } + UnwindCode::RememberState => { + builder.remember_state(*offset); + } + UnwindCode::RestoreState => { + builder.restore_state(*offset); + } + } + } + + let instructions = builder.instructions; + let len = unwind.function_size; + + Ok(Self { instructions, len }) + } + + /// Converts the unwind information into a `FrameDescriptionEntry`. + pub fn to_fde(&self, address: Address) -> gimli::write::FrameDescriptionEntry { + let mut fde = FrameDescriptionEntry::new(address, self.len); + + for (offset, inst) in &self.instructions { + fde.add_instruction(*offset, inst.clone().into()); + } + + fde + } +} + +struct InstructionBuilder<'a, Reg: PartialEq + Copy> { + sp_offset: i32, + frame_register: Option<Reg>, + saved_state: Option<(i32, Option<Reg>)>, + map_reg: &'a dyn RegisterMapper<Reg>, + instructions: Vec<(u32, CallFrameInstruction)>, +} + +impl<'a, Reg: PartialEq + Copy> InstructionBuilder<'a, Reg> { + fn new(sp_offset: u8, map_reg: &'a (dyn RegisterMapper<Reg> + 'a)) -> Self { + Self { + sp_offset: sp_offset as i32, // CFA offset starts at the specified offset to account for the return address on stack + saved_state: None, + frame_register: None, + map_reg, + instructions: Vec::new(), + } + } + + fn save_reg( + &mut self, + offset: u32, + reg: Reg, + stack_offset: u32, + ) -> Result<(), RegisterMappingError> { + // Pushes in the prologue are register saves, so record an offset of the save + self.instructions.push(( + offset, + CallFrameInstruction::Offset( + self.map_reg.map(reg)?, + stack_offset as i32 - self.sp_offset, + ), + )); + + Ok(()) + } + + fn adjust_sp_down_imm(&mut self, offset: u32, imm: i64) { + assert!(imm <= core::u32::MAX as i64); + + self.sp_offset += imm as i32; + + // Don't adjust the CFA if we're using a frame pointer + if self.frame_register.is_some() { + return; + } + + self.instructions + .push((offset, CallFrameInstruction::CfaOffset(self.sp_offset))); + } + + fn adjust_sp_up_imm(&mut self, offset: u32, imm: i64) { + assert!(imm <= core::u32::MAX as i64); + + self.sp_offset -= imm as i32; + + // Don't adjust the CFA if we're using a frame pointer + if self.frame_register.is_some() { + return; + } + + let cfa_inst_ofs = { + // Scan to find and merge 
with CFA instruction with the same offset. + let mut it = self.instructions.iter_mut(); + loop { + match it.next_back() { + Some((i_offset, i)) if *i_offset == offset => { + if let CallFrameInstruction::Cfa(_, o) = i { + break Some(o); + } + } + _ => { + break None; + } + } + } + }; + + if let Some(o) = cfa_inst_ofs { + // Update previous CFA instruction. + *o = self.sp_offset; + } else { + // Add just CFA offset instruction. + self.instructions + .push((offset, CallFrameInstruction::CfaOffset(self.sp_offset))); + } + } + + fn set_cfa_reg(&mut self, offset: u32, reg: Reg) -> Result<(), RegisterMappingError> { + self.instructions.push(( + offset, + CallFrameInstruction::CfaRegister(self.map_reg.map(reg)?), + )); + self.frame_register = Some(reg); + Ok(()) + } + + fn restore_cfa(&mut self, offset: u32) { + // Restore SP and its offset. + self.instructions.push(( + offset, + CallFrameInstruction::Cfa(self.map_reg.sp(), self.sp_offset), + )); + self.frame_register = None; + } + + fn restore_reg(&mut self, offset: u32, reg: Reg) -> Result<(), RegisterMappingError> { + // Pops in the epilogue are register restores, so record a "same value" for the register + self.instructions.push(( + offset, + CallFrameInstruction::SameValue(self.map_reg.map(reg)?), + )); + + Ok(()) + } + + fn remember_state(&mut self, offset: u32) { + self.saved_state = Some((self.sp_offset, self.frame_register)); + + self.instructions + .push((offset, CallFrameInstruction::RememberState)); + } + + fn restore_state(&mut self, offset: u32) { + let (sp_offset, frame_register) = self.saved_state.take().unwrap(); + self.sp_offset = sp_offset; + self.frame_register = frame_register; + + self.instructions + .push((offset, CallFrameInstruction::RestoreState)); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs b/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs new file mode 100644 index 0000000000..b3c21fc473 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/unwind/winx64.rs @@ -0,0 +1,294 @@ +//! Windows x64 ABI unwind information. + +use crate::isa::{unwind::input, RegUnit}; +use crate::result::{CodegenError, CodegenResult}; +use alloc::vec::Vec; +use byteorder::{ByteOrder, LittleEndian}; +use log::warn; +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +/// Maximum (inclusive) size of a "small" stack allocation +const SMALL_ALLOC_MAX_SIZE: u32 = 128; +/// Maximum (inclusive) size of a "large" stack allocation that can represented in 16-bits +const LARGE_ALLOC_16BIT_MAX_SIZE: u32 = 524280; + +struct Writer<'a> { + buf: &'a mut [u8], + offset: usize, +} + +impl<'a> Writer<'a> { + pub fn new(buf: &'a mut [u8]) -> Self { + Self { buf, offset: 0 } + } + + fn write_u8(&mut self, v: u8) { + self.buf[self.offset] = v; + self.offset += 1; + } + + fn write_u16<T: ByteOrder>(&mut self, v: u16) { + T::write_u16(&mut self.buf[self.offset..(self.offset + 2)], v); + self.offset += 2; + } + + fn write_u32<T: ByteOrder>(&mut self, v: u32) { + T::write_u32(&mut self.buf[self.offset..(self.offset + 4)], v); + self.offset += 4; + } +} + +/// The supported unwind codes for the x64 Windows ABI. +/// +/// See: https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64 +/// Only what is needed to describe the prologues generated by the Cranelift x86 ISA are represented here. +/// Note: the Cranelift x86 ISA RU enum matches the Windows unwind GPR encoding values. 
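+///
+/// Each variant serializes to one or more 16-bit UNWIND_CODE nodes: a prologue-offset byte
+/// followed by an operation/info byte, plus extra nodes for large values. For example,
+/// `PushRegister { offset: 5, reg: 5 }` (a push of RBP at prologue offset 5, in the Windows
+/// GPR numbering) is emitted as the two bytes `0x05, 0x50`.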
+#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub(crate) enum UnwindCode { + PushRegister { + offset: u8, + reg: u8, + }, + SaveXmm { + offset: u8, + reg: u8, + stack_offset: u32, + }, + StackAlloc { + offset: u8, + size: u32, + }, +} + +impl UnwindCode { + fn emit(&self, writer: &mut Writer) { + enum UnwindOperation { + PushNonvolatileRegister = 0, + LargeStackAlloc = 1, + SmallStackAlloc = 2, + SaveXmm128 = 8, + SaveXmm128Far = 9, + } + + match self { + Self::PushRegister { offset, reg } => { + writer.write_u8(*offset); + writer.write_u8((*reg << 4) | (UnwindOperation::PushNonvolatileRegister as u8)); + } + Self::SaveXmm { + offset, + reg, + stack_offset, + } => { + writer.write_u8(*offset); + let scaled_stack_offset = stack_offset / 16; + if scaled_stack_offset <= core::u16::MAX as u32 { + writer.write_u8((*reg << 4) | (UnwindOperation::SaveXmm128 as u8)); + writer.write_u16::<LittleEndian>(scaled_stack_offset as u16); + } else { + writer.write_u8((*reg << 4) | (UnwindOperation::SaveXmm128Far as u8)); + writer.write_u16::<LittleEndian>(*stack_offset as u16); + writer.write_u16::<LittleEndian>((stack_offset >> 16) as u16); + } + } + Self::StackAlloc { offset, size } => { + // Stack allocations on Windows must be a multiple of 8 and be at least 1 slot + assert!(*size >= 8); + assert!((*size % 8) == 0); + + writer.write_u8(*offset); + if *size <= SMALL_ALLOC_MAX_SIZE { + writer.write_u8( + ((((*size - 8) / 8) as u8) << 4) | UnwindOperation::SmallStackAlloc as u8, + ); + } else if *size <= LARGE_ALLOC_16BIT_MAX_SIZE { + writer.write_u8(UnwindOperation::LargeStackAlloc as u8); + writer.write_u16::<LittleEndian>((*size / 8) as u16); + } else { + writer.write_u8((1 << 4) | (UnwindOperation::LargeStackAlloc as u8)); + writer.write_u32::<LittleEndian>(*size); + } + } + }; + } + + fn node_count(&self) -> usize { + match self { + Self::StackAlloc { size, .. } => { + if *size <= SMALL_ALLOC_MAX_SIZE { + 1 + } else if *size <= LARGE_ALLOC_16BIT_MAX_SIZE { + 2 + } else { + 3 + } + } + Self::SaveXmm { stack_offset, .. } => { + if *stack_offset <= core::u16::MAX as u32 { + 2 + } else { + 3 + } + } + _ => 1, + } + } +} + +pub(crate) enum MappedRegister { + Int(u8), + Xmm(u8), +} + +/// Maps UnwindInfo register to Windows x64 unwind data. +pub(crate) trait RegisterMapper { + /// Maps RegUnit. + fn map(reg: RegUnit) -> MappedRegister; +} + +/// Represents Windows x64 unwind information. +/// +/// For information about Windows x64 unwind info, see: +/// https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64 +#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct UnwindInfo { + pub(crate) flags: u8, + pub(crate) prologue_size: u8, + pub(crate) frame_register: Option<u8>, + pub(crate) frame_register_offset: u8, + pub(crate) unwind_codes: Vec<UnwindCode>, +} + +impl UnwindInfo { + /// Gets the emit size of the unwind information, in bytes. 
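+    ///
+    /// For example, unwind codes totalling 3 nodes take `4 + 3 * 2 + 2` = 12 bytes, the
+    /// trailing 2 being alignment padding for the odd node count.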
+ pub fn emit_size(&self) -> usize { + let node_count = self.node_count(); + + // Calculation of the size requires no SEH handler or chained info + assert!(self.flags == 0); + + // Size of fixed part of UNWIND_INFO is 4 bytes + // Then comes the UNWIND_CODE nodes (2 bytes each) + // Then comes 2 bytes of padding for the unwind codes if necessary + // Next would come the SEH data, but we assert above that the function doesn't have SEH data + + 4 + (node_count * 2) + if (node_count & 1) == 1 { 2 } else { 0 } + } + + /// Emits the unwind information into the given mutable byte slice. + /// + /// This function will panic if the slice is not at least `emit_size` in length. + pub fn emit(&self, buf: &mut [u8]) { + const UNWIND_INFO_VERSION: u8 = 1; + + let node_count = self.node_count(); + assert!(node_count <= 256); + + let mut writer = Writer::new(buf); + + writer.write_u8((self.flags << 3) | UNWIND_INFO_VERSION); + writer.write_u8(self.prologue_size); + writer.write_u8(node_count as u8); + + if let Some(reg) = self.frame_register { + writer.write_u8((self.frame_register_offset << 4) | reg); + } else { + writer.write_u8(0); + } + + // Unwind codes are written in reverse order (prologue offset descending) + for code in self.unwind_codes.iter().rev() { + code.emit(&mut writer); + } + + // To keep a 32-bit alignment, emit 2 bytes of padding if there's an odd number of 16-bit nodes + if (node_count & 1) == 1 { + writer.write_u16::<LittleEndian>(0); + } + + // Ensure the correct number of bytes was emitted + assert_eq!(writer.offset, self.emit_size()); + } + + fn node_count(&self) -> usize { + self.unwind_codes + .iter() + .fold(0, |nodes, c| nodes + c.node_count()) + } + + pub(crate) fn build<MR: RegisterMapper>( + unwind: input::UnwindInfo<RegUnit>, + ) -> CodegenResult<Self> { + use crate::isa::unwind::input::UnwindCode as InputUnwindCode; + + let word_size: u32 = unwind.word_size.into(); + let mut unwind_codes = Vec::new(); + for (offset, c) in unwind.prologue_unwind_codes.iter() { + match c { + InputUnwindCode::SaveRegister { reg, stack_offset } => { + let reg = MR::map(*reg); + let offset = ensure_unwind_offset(*offset)?; + match reg { + MappedRegister::Int(reg) => { + // Attempt to convert sequence of the `InputUnwindCode`: + // `StackAlloc { size = word_size }`, `SaveRegister { stack_offset: 0 }` + // to the shorter `UnwindCode::PushRegister`. + let push_reg_sequence = if let Some(UnwindCode::StackAlloc { + offset: alloc_offset, + size, + }) = unwind_codes.last() + { + *size == word_size && offset == *alloc_offset && *stack_offset == 0 + } else { + false + }; + if push_reg_sequence { + *unwind_codes.last_mut().unwrap() = + UnwindCode::PushRegister { offset, reg }; + } else { + // TODO add `UnwindCode::SaveRegister` to handle multiple register + // pushes with single `UnwindCode::StackAlloc`. 
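+                            // Any other integer-register save pattern (i.e. one not preceded
+                            // by a matching word-sized `StackAlloc`) is currently rejected.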
+ return Err(CodegenError::Unsupported( + "Unsupported UnwindCode::PushRegister sequence".into(), + )); + } + } + MappedRegister::Xmm(reg) => { + unwind_codes.push(UnwindCode::SaveXmm { + offset, + reg, + stack_offset: *stack_offset, + }); + } + } + } + InputUnwindCode::StackAlloc { size } => { + unwind_codes.push(UnwindCode::StackAlloc { + offset: ensure_unwind_offset(*offset)?, + size: *size, + }); + } + _ => {} + } + } + + Ok(Self { + flags: 0, // this assumes cranelift functions have no SEH handlers + prologue_size: ensure_unwind_offset(unwind.prologue_size)?, + frame_register: None, + frame_register_offset: 0, + unwind_codes, + }) + } +} + +fn ensure_unwind_offset(offset: u32) -> CodegenResult<u8> { + if offset > 255 { + warn!("function prologues cannot exceed 255 bytes in size for Windows x64"); + return Err(CodegenError::CodeTooLarge); + } + Ok(offset as u8) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs new file mode 100644 index 0000000000..f4c7624f36 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs @@ -0,0 +1,794 @@ +//! Implementation of the standard x64 ABI. + +use crate::ir::types::*; +use crate::ir::{self, types, MemFlags, TrapCode, Type}; +use crate::isa; +use crate::isa::{x64::inst::*, CallConv}; +use crate::machinst::abi_impl::*; +use crate::machinst::*; +use crate::settings; +use crate::{CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use args::*; +use regalloc::{RealReg, Reg, RegClass, Set, Writable}; +use smallvec::{smallvec, SmallVec}; +use std::convert::TryFrom; + +/// This is the limit for the size of argument and return-value areas on the +/// stack. We place a reasonable limit here to avoid integer overflow issues +/// with 32-bit arithmetic: for now, 128 MB. +static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024; + +/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0; +/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8; + +/// Try to fill a Baldrdash register, returning it if it was found. +fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option<ABIArg> { + if call_conv.extends_baldrdash() { + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext => { + // This is SpiderMonkey's `WasmTlsReg`. + Some(ABIArg::Reg( + regs::r14().to_real_reg(), + types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::SignatureId => { + // This is SpiderMonkey's `WasmTableCallSigReg`. + Some(ABIArg::Reg( + regs::r10().to_real_reg(), + types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::CalleeTLS => { + // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLEE_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + &ir::ArgumentPurpose::CallerTLS => { + // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLER_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + _ => None, + } + } else { + None + } +} + +/// Support for the x64 ABI from the callee side (within a function body). 
+pub(crate) type X64ABICallee = ABICalleeImpl<X64ABIMachineSpec>; + +/// Support for the x64 ABI from the caller side (at a callsite). +pub(crate) type X64ABICaller = ABICallerImpl<X64ABIMachineSpec>; + +/// Implementation of ABI primitives for x64. +pub(crate) struct X64ABIMachineSpec; + +impl ABIMachineSpec for X64ABIMachineSpec { + type I = Inst; + + fn word_bits() -> u32 { + 64 + } + + /// Return required stack alignment in bytes. + fn stack_align(_call_conv: isa::CallConv) -> u32 { + 16 + } + + fn compute_arg_locs( + call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> { + let is_baldrdash = call_conv.extends_baldrdash(); + let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020; + + let mut next_gpr = 0; + let mut next_vreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + + if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls { + // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and + // caller TLS-register values, respectively. + next_stack = 16; + } + + for i in 0..params.len() { + // Process returns backward, according to the SpiderMonkey ABI (which we + // adopt internally if `is_baldrdash` is set). + let param = match (args_or_rets, is_baldrdash) { + (ArgsOrRets::Args, _) => ¶ms[i], + (ArgsOrRets::Rets, false) => ¶ms[i], + (ArgsOrRets::Rets, true) => ¶ms[params.len() - 1 - i], + }; + + // Validate "purpose". + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::StackLimit + | &ir::ArgumentPurpose::SignatureId + | &ir::ArgumentPurpose::CalleeTLS + | &ir::ArgumentPurpose::CallerTLS => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + + let intreg = in_int_reg(param.value_type); + let vecreg = in_vec_reg(param.value_type); + debug_assert!(intreg || vecreg); + debug_assert!(!(intreg && vecreg)); + + let (next_reg, candidate) = if intreg { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr), + ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i), + }; + debug_assert!(candidate + .map(|r| r.get_class() == RegClass::I64) + .unwrap_or(true)); + (&mut next_gpr, candidate) + } else { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg), + ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i), + }; + debug_assert!(candidate + .map(|r| r.get_class() == RegClass::V128) + .unwrap_or(true)); + (&mut next_vreg, candidate) + }; + + if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { + assert!(intreg); + ret.push(param); + } else if let Some(reg) = candidate { + ret.push(ABIArg::Reg( + reg.to_real_reg(), + param.value_type, + param.extension, + param.purpose, + )); + *next_reg += 1; + } else { + // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte + // stack alignment happens separately after all args.) + let size = (param.value_type.bits() / 8) as u64; + let size = std::cmp::max(size, 8); + // Align. 
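+                // (E.g. with `size = 8` and `next_stack = 20`, `(20 + 8 - 1) & !(8 - 1)`
+                // rounds `next_stack` up to 24.)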
+ debug_assert!(size.is_power_of_two()); + next_stack = (next_stack + size - 1) & !(size - 1); + ret.push(ABIArg::Stack( + next_stack as i64, + param.value_type, + param.extension, + param.purpose, + )); + next_stack += size; + } + } + + if args_or_rets == ArgsOrRets::Rets && is_baldrdash { + ret.reverse(); + } + + let extra_arg = if add_ret_area_ptr { + debug_assert!(args_or_rets == ArgsOrRets::Args); + if let Some(reg) = get_intreg_for_arg_systemv(&call_conv, next_gpr) { + ret.push(ABIArg::Reg( + reg.to_real_reg(), + types::I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + } else { + ret.push(ABIArg::Stack( + next_stack as i64, + types::I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + next_stack += 8; + } + Some(ret.len() - 1) + } else { + None + }; + + next_stack = (next_stack + 15) & !15; + + // To avoid overflow issues, limit the arg/return size to something reasonable. + if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((ret, next_stack as i64, extra_arg)) + } + + fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 { + if call_conv.extends_baldrdash() { + let num_words = flags.baldrdash_prologue_words() as i64; + debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words"); + num_words * 8 + } else { + 16 // frame pointer + return address. + } + } + + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I { + let ext_kind = match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 => ExtKind::SignExtend, + types::B64 | types::I64 | types::R64 | types::F32 | types::F64 => ExtKind::None, + _ if ty.bytes() == 16 => ExtKind::None, + _ => panic!("load_stack({})", ty), + }; + Inst::load(ty, mem, into_reg, ext_kind) + } + + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I { + Inst::store(ty, from_reg, mem) + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I { + Inst::gen_move(to_reg, from_reg, ty) + } + + /// Generate an integer-extend operation. + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + is_signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Self::I { + let ext_mode = ExtMode::new(from_bits as u16, to_bits as u16) + .expect(&format!("invalid extension: {} -> {}", from_bits, to_bits)); + if is_signed { + Inst::movsx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg) + } else { + Inst::movzx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg) + } + } + + fn gen_ret() -> Self::I { + Inst::ret() + } + + fn gen_epilogue_placeholder() -> Self::I { + Inst::epilogue_placeholder() + } + + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]> { + let mut ret = SmallVec::new(); + if from_reg != into_reg.to_reg() { + ret.push(Inst::gen_move(into_reg, from_reg, I64)); + } + ret.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(imm), + into_reg, + )); + ret + } + + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]> { + smallvec![ + Inst::cmp_rmi_r(/* bytes = */ 8, RegMemImm::reg(regs::rsp()), limit_reg), + Inst::TrapIf { + // NBE == "> unsigned"; args above are reversed; this tests limit_reg > rsp. 
+ cc: CC::NBE, + trap_code: TrapCode::StackOverflow, + }, + ] + } + + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Self::I { + let mem: SyntheticAmode = mem.into(); + Inst::lea(mem, into_reg) + } + + fn get_stacklimit_reg() -> Reg { + debug_assert!( + !is_callee_save_systemv(regs::r10().to_real_reg()) + && !is_callee_save_baldrdash(regs::r10().to_real_reg()) + ); + + // As per comment on trait definition, we must return a caller-save + // register here. + regs::r10() + } + + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I { + // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed. + assert_eq!(ty, I64); + let simm32 = offset as u32; + let mem = Amode::imm_reg(simm32, base); + Inst::load(ty, mem, into_reg, ExtKind::None) + } + + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I { + let simm32 = offset as u32; + let mem = Amode::imm_reg(simm32, base); + Inst::store(ty, from_reg, mem) + } + + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]> { + let (alu_op, amount) = if amount >= 0 { + (AluRmiROpcode::Add, amount) + } else { + (AluRmiROpcode::Sub, -amount) + }; + + let amount = amount as u32; + + smallvec![Inst::alu_rmi_r( + true, + alu_op, + RegMemImm::imm(amount), + Writable::from_reg(regs::rsp()), + )] + } + + fn gen_nominal_sp_adj(offset: i32) -> Self::I { + Inst::VirtualSPOffsetAdj { + offset: offset as i64, + } + } + + fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]> { + let r_rsp = regs::rsp(); + let r_rbp = regs::rbp(); + let w_rbp = Writable::from_reg(r_rbp); + let mut insts = SmallVec::new(); + // RSP before the call will be 0 % 16. So here, it is 8 % 16. + insts.push(Inst::push64(RegMemImm::reg(r_rbp))); + // RSP is now 0 % 16 + insts.push(Inst::mov_r_r(true, r_rsp, w_rbp)); + insts + } + + fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]> { + let mut insts = SmallVec::new(); + insts.push(Inst::mov_r_r( + true, + regs::rbp(), + Writable::from_reg(regs::rsp()), + )); + insts.push(Inst::pop64(Writable::from_reg(regs::rbp()))); + insts + } + + fn gen_clobber_save( + call_conv: isa::CallConv, + _: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> (u64, SmallVec<[Self::I; 16]>) { + let mut insts = SmallVec::new(); + // Find all clobbered registers that are callee-save. These are only I64 + // registers (all XMM registers are caller-save) so we can compute the + // total size of the needed stack space easily. + let clobbered = get_callee_saves(&call_conv, clobbers); + let clobbered_size = 8 * clobbered.len() as u32; + let stack_size = clobbered_size + fixed_frame_storage_size; + // Align to 16 bytes. + let stack_size = (stack_size + 15) & !15; + // Adjust the stack pointer downward with one `sub rsp, IMM` + // instruction. + if stack_size > 0 { + insts.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::imm(stack_size), + Writable::from_reg(regs::rsp()), + )); + } + // Store each clobbered register in order at offsets from RSP. + let mut cur_offset = 0; + for reg in &clobbered { + let r_reg = reg.to_reg(); + match r_reg.get_class() { + RegClass::I64 => { + insts.push(Inst::mov_r_m( + /* bytes = */ 8, + r_reg.to_reg(), + Amode::imm_reg(cur_offset, regs::rsp()), + )); + cur_offset += 8; + } + // No XMM regs are callee-save, so we do not need to implement + // this. 
+ _ => unimplemented!(), + } + } + + (clobbered_size as u64, insts) + } + + fn gen_clobber_restore( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + _fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> SmallVec<[Self::I; 16]> { + let mut insts = SmallVec::new(); + + let clobbered = get_callee_saves(&call_conv, clobbers); + let stack_size = 8 * clobbered.len() as u32; + let stack_size = (stack_size + 15) & !15; + + // Restore regs by loading from offsets of RSP. + let mut cur_offset = 0; + for reg in &clobbered { + let rreg = reg.to_reg(); + match rreg.get_class() { + RegClass::I64 => { + insts.push(Inst::mov64_m_r( + Amode::imm_reg(cur_offset, regs::rsp()), + Writable::from_reg(rreg.to_reg()), + )); + cur_offset += 8; + } + _ => unimplemented!(), + } + } + // Adjust RSP back upward. + if stack_size > 0 { + insts.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(stack_size), + Writable::from_reg(regs::rsp()), + )); + } + + // If this is Baldrdash-2020, restore the callee (i.e., our) TLS + // register. We may have allocated it for something else and clobbered + // it, but the ABI expects us to leave the TLS register unchanged. + if call_conv == isa::CallConv::Baldrdash2020 { + let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags); + insts.push(Inst::mov64_m_r( + Amode::imm_reg(off as u32, regs::rbp()), + Writable::from_reg(regs::r14()), + )); + } + + insts + } + + /// Generate a call instruction/sequence. + fn gen_call( + dest: &CallDest, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: ir::Opcode, + tmp: Writable<Reg>, + _callee_conv: isa::CallConv, + _caller_conv: isa::CallConv, + ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]> { + let mut insts = SmallVec::new(); + match dest { + &CallDest::ExtName(ref name, RelocDistance::Near) => { + insts.push(( + InstIsSafepoint::Yes, + Inst::call_known(name.clone(), uses, defs, opcode), + )); + } + &CallDest::ExtName(ref name, RelocDistance::Far) => { + insts.push(( + InstIsSafepoint::No, + Inst::LoadExtName { + dst: tmp, + name: Box::new(name.clone()), + offset: 0, + }, + )); + insts.push(( + InstIsSafepoint::Yes, + Inst::call_unknown(RegMem::reg(tmp.to_reg()), uses, defs, opcode), + )); + } + &CallDest::Reg(reg) => { + insts.push(( + InstIsSafepoint::Yes, + Inst::call_unknown(RegMem::reg(reg), uses, defs, opcode), + )); + } + } + insts + } + + fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 { + // We allocate in terms of 8-byte slots. + match (rc, ty) { + (RegClass::I64, _) => 1, + (RegClass::V128, types::F32) | (RegClass::V128, types::F64) => 1, + (RegClass::V128, _) => 2, + _ => panic!("Unexpected register class!"), + } + } + + fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64 { + s.virtual_sp_offset + } + + fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64 { + s.nominal_sp_to_fp + } + + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> { + let mut caller_saved = vec![ + // Systemv calling convention: + // - GPR: all except RBX, RBP, R12 to R15 (which are callee-saved). + Writable::from_reg(regs::rsi()), + Writable::from_reg(regs::rdi()), + Writable::from_reg(regs::rax()), + Writable::from_reg(regs::rcx()), + Writable::from_reg(regs::rdx()), + Writable::from_reg(regs::r8()), + Writable::from_reg(regs::r9()), + Writable::from_reg(regs::r10()), + Writable::from_reg(regs::r11()), + // - XMM: all the registers! 
+ Writable::from_reg(regs::xmm0()), + Writable::from_reg(regs::xmm1()), + Writable::from_reg(regs::xmm2()), + Writable::from_reg(regs::xmm3()), + Writable::from_reg(regs::xmm4()), + Writable::from_reg(regs::xmm5()), + Writable::from_reg(regs::xmm6()), + Writable::from_reg(regs::xmm7()), + Writable::from_reg(regs::xmm8()), + Writable::from_reg(regs::xmm9()), + Writable::from_reg(regs::xmm10()), + Writable::from_reg(regs::xmm11()), + Writable::from_reg(regs::xmm12()), + Writable::from_reg(regs::xmm13()), + Writable::from_reg(regs::xmm14()), + Writable::from_reg(regs::xmm15()), + ]; + + if call_conv_of_callee.extends_baldrdash() { + caller_saved.push(Writable::from_reg(regs::r12())); + caller_saved.push(Writable::from_reg(regs::r13())); + // Not r14; implicitly preserved in the entry. + caller_saved.push(Writable::from_reg(regs::r15())); + caller_saved.push(Writable::from_reg(regs::rbx())); + } + + caller_saved + } +} + +impl From<StackAMode> for SyntheticAmode { + fn from(amode: StackAMode) -> Self { + // We enforce a 128 MB stack-frame size limit above, so these + // `expect()`s should never fail. + match amode { + StackAMode::FPOffset(off, _ty) => { + let off = i32::try_from(off) + .expect("Offset in FPOffset is greater than 2GB; should hit impl limit first"); + let simm32 = off as u32; + SyntheticAmode::Real(Amode::ImmReg { + simm32, + base: regs::rbp(), + flags: MemFlags::trusted(), + }) + } + StackAMode::NominalSPOffset(off, _ty) => { + let off = i32::try_from(off).expect( + "Offset in NominalSPOffset is greater than 2GB; should hit impl limit first", + ); + let simm32 = off as u32; + SyntheticAmode::nominal_sp_offset(simm32) + } + StackAMode::SPOffset(off, _ty) => { + let off = i32::try_from(off) + .expect("Offset in SPOffset is greater than 2GB; should hit impl limit first"); + let simm32 = off as u32; + SyntheticAmode::Real(Amode::ImmReg { + simm32, + base: regs::rsp(), + flags: MemFlags::trusted(), + }) + } + } + } +} + +fn in_int_reg(ty: types::Type) -> bool { + match ty { + types::I8 + | types::I16 + | types::I32 + | types::I64 + | types::B1 + | types::B8 + | types::B16 + | types::B32 + | types::B64 + | types::R64 => true, + types::R32 => panic!("unexpected 32-bits refs on x64!"), + _ => false, + } +} + +fn in_vec_reg(ty: types::Type) -> bool { + match ty { + types::F32 | types::F64 => true, + _ if ty.is_vector() => true, + _ => false, + } +} + +fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> { + match call_conv { + CallConv::Fast + | CallConv::Cold + | CallConv::SystemV + | CallConv::BaldrdashSystemV + | CallConv::Baldrdash2020 => {} + _ => panic!("int args only supported for SysV calling convention"), + }; + match idx { + 0 => Some(regs::rdi()), + 1 => Some(regs::rsi()), + 2 => Some(regs::rdx()), + 3 => Some(regs::rcx()), + 4 => Some(regs::r8()), + 5 => Some(regs::r9()), + _ => None, + } +} + +fn get_fltreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> { + match call_conv { + CallConv::Fast + | CallConv::Cold + | CallConv::SystemV + | CallConv::BaldrdashSystemV + | CallConv::Baldrdash2020 => {} + _ => panic!("float args only supported for SysV calling convention"), + }; + match idx { + 0 => Some(regs::xmm0()), + 1 => Some(regs::xmm1()), + 2 => Some(regs::xmm2()), + 3 => Some(regs::xmm3()), + 4 => Some(regs::xmm4()), + 5 => Some(regs::xmm5()), + 6 => Some(regs::xmm6()), + 7 => Some(regs::xmm7()), + _ => None, + } +} + +fn get_intreg_for_retval_systemv( + call_conv: &CallConv, + intreg_idx: usize, + retval_idx: usize, +) -> 
Option<Reg> { + match call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx { + 0 => Some(regs::rax()), + 1 => Some(regs::rdx()), + _ => None, + }, + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => { + if intreg_idx == 0 && retval_idx == 0 { + Some(regs::rax()) + } else { + None + } + } + CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(), + } +} + +fn get_fltreg_for_retval_systemv( + call_conv: &CallConv, + fltreg_idx: usize, + retval_idx: usize, +) -> Option<Reg> { + match call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx { + 0 => Some(regs::xmm0()), + 1 => Some(regs::xmm1()), + _ => None, + }, + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => { + if fltreg_idx == 0 && retval_idx == 0 { + Some(regs::xmm0()) + } else { + None + } + } + CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(), + } +} + +fn is_callee_save_systemv(r: RealReg) -> bool { + use regs::*; + match r.get_class() { + RegClass::I64 => match r.get_hw_encoding() as u8 { + ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true, + _ => false, + }, + RegClass::V128 => false, + _ => unimplemented!(), + } +} + +fn is_callee_save_baldrdash(r: RealReg) -> bool { + use regs::*; + match r.get_class() { + RegClass::I64 => { + if r.get_hw_encoding() as u8 == ENC_R14 { + // r14 is the WasmTlsReg and is preserved implicitly. + false + } else { + // Defer to native for the other ones. + is_callee_save_systemv(r) + } + } + RegClass::V128 => false, + _ => unimplemented!(), + } +} + +fn get_callee_saves(call_conv: &CallConv, regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> { + let mut regs: Vec<Writable<RealReg>> = match call_conv { + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => regs + .iter() + .cloned() + .filter(|r| is_callee_save_baldrdash(r.to_reg())) + .collect(), + CallConv::BaldrdashWindows => { + todo!("baldrdash windows"); + } + CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs + .iter() + .cloned() + .filter(|r| is_callee_save_systemv(r.to_reg())) + .collect(), + CallConv::WindowsFastcall => todo!("windows fastcall"), + CallConv::Probestack => todo!("probestack?"), + }; + // Sort registers for deterministic code output. We can do an unstable sort because the + // registers will be unique (there are no dups). + regs.sort_unstable_by_key(|r| r.to_reg().get_index()); + regs +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs new file mode 100644 index 0000000000..6a8f65feb3 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs @@ -0,0 +1,1215 @@ +//! Instruction operand sub-components (aka "parts"): definitions and printing. + +use super::regs::{self, show_ireg_sized}; +use super::EmitState; +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::MemFlags; +use crate::machinst::*; +use regalloc::{ + PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector, + RegUsageMapper, Writable, +}; +use std::fmt; +use std::string::String; + +/// A possible addressing mode (amode) that can be used in instructions. +/// These denote a 64-bit value only. +#[derive(Clone, Debug)] +pub enum Amode { + /// Immediate sign-extended and a Register. 
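+    /// That is, the effective address is sign-extend-32-to-64(Immediate) + Register.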
+ ImmReg { + simm32: u32, + base: Reg, + flags: MemFlags, + }, + + /// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift) + ImmRegRegShift { + simm32: u32, + base: Reg, + index: Reg, + shift: u8, /* 0 .. 3 only */ + flags: MemFlags, + }, + + /// sign-extend-32-to-64(Immediate) + RIP (instruction pointer). + /// To wit: not supported in 32-bits mode. + RipRelative { target: MachLabel }, +} + +impl Amode { + pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self { + debug_assert!(base.get_class() == RegClass::I64); + Self::ImmReg { + simm32, + base, + flags: MemFlags::trusted(), + } + } + + pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self { + debug_assert!(base.get_class() == RegClass::I64); + debug_assert!(index.get_class() == RegClass::I64); + debug_assert!(shift <= 3); + Self::ImmRegRegShift { + simm32, + base, + index, + shift, + flags: MemFlags::trusted(), + } + } + + pub(crate) fn rip_relative(target: MachLabel) -> Self { + Self::RipRelative { target } + } + + pub(crate) fn with_flags(&self, flags: MemFlags) -> Self { + match self { + &Self::ImmReg { simm32, base, .. } => Self::ImmReg { + simm32, + base, + flags, + }, + &Self::ImmRegRegShift { + simm32, + base, + index, + shift, + .. + } => Self::ImmRegRegShift { + simm32, + base, + index, + shift, + flags, + }, + _ => panic!("Amode {:?} cannot take memflags", self), + } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + Amode::ImmReg { base, .. } => { + collector.add_use(*base); + } + Amode::ImmRegRegShift { base, index, .. } => { + collector.add_use(*base); + collector.add_use(*index); + } + Amode::RipRelative { .. } => { + // RIP isn't involved in regalloc. + } + } + } + + pub(crate) fn get_flags(&self) -> MemFlags { + match self { + Amode::ImmReg { flags, .. } => *flags, + Amode::ImmRegRegShift { flags, .. } => *flags, + Amode::RipRelative { .. } => MemFlags::trusted(), + } + } + + pub(crate) fn can_trap(&self) -> bool { + !self.get_flags().notrap() + } +} + +impl PrettyPrint for Amode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + Amode::ImmReg { simm32, base, .. } => { + format!("{}({})", *simm32 as i32, base.show_rru(mb_rru)) + } + Amode::ImmRegRegShift { + simm32, + base, + index, + shift, + .. + } => format!( + "{}({},{},{})", + *simm32 as i32, + base.show_rru(mb_rru), + index.show_rru(mb_rru), + 1 << shift + ), + Amode::RipRelative { ref target } => format!("label{}(%rip)", target.get()), + } + } +} + +/// A Memory Address. These denote a 64-bit value only. +/// Used for usual addressing modes as well as addressing modes used during compilation, when the +/// moving SP offset is not known. +#[derive(Clone)] +pub enum SyntheticAmode { + /// A real amode. + Real(Amode), + + /// A (virtual) offset to the "nominal SP" value, which will be recomputed as we push and pop + /// within the function. + NominalSPOffset { simm32: u32 }, +} + +impl SyntheticAmode { + pub(crate) fn nominal_sp_offset(simm32: u32) -> Self { + SyntheticAmode::NominalSPOffset { simm32 } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + SyntheticAmode::Real(addr) => addr.get_regs_as_uses(collector), + SyntheticAmode::NominalSPOffset { .. } => { + // Nothing to do; the base is SP and isn't involved in regalloc. 
+ } + } + } + + pub(crate) fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + SyntheticAmode::Real(addr) => addr.map_uses(map), + SyntheticAmode::NominalSPOffset { .. } => { + // Nothing to do. + } + } + } + + pub(crate) fn finalize(&self, state: &mut EmitState) -> Amode { + match self { + SyntheticAmode::Real(addr) => addr.clone(), + SyntheticAmode::NominalSPOffset { simm32 } => { + let off = *simm32 as i64 + state.virtual_sp_offset; + // TODO will require a sequence of add etc. + assert!( + off <= u32::max_value() as i64, + "amode finalize: add sequence NYI" + ); + Amode::imm_reg(off as u32, regs::rsp()) + } + } + } +} + +impl Into<SyntheticAmode> for Amode { + fn into(self) -> SyntheticAmode { + SyntheticAmode::Real(self) + } +} + +impl PrettyPrint for SyntheticAmode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + SyntheticAmode::Real(addr) => addr.show_rru(mb_rru), + SyntheticAmode::NominalSPOffset { simm32 } => { + format!("rsp({} + virtual offset)", *simm32 as i32) + } + } + } +} + +/// An operand which is either an integer Register, a value in Memory or an Immediate. This can +/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only +/// the lower 8 or 16 bits of `simm32` is relevant. In the 64-bit case, the value denoted by +/// `simm32` is its sign-extension out to 64 bits. +#[derive(Clone)] +pub enum RegMemImm { + Reg { reg: Reg }, + Mem { addr: SyntheticAmode }, + Imm { simm32: u32 }, +} + +impl RegMemImm { + pub(crate) fn reg(reg: Reg) -> Self { + debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128); + Self::Reg { reg } + } + pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self { + Self::Mem { addr: addr.into() } + } + pub(crate) fn imm(simm32: u32) -> Self { + Self::Imm { simm32 } + } + + /// Asserts that in register mode, the reg class is the one that's expected. + pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) { + if let Self::Reg { reg } = self { + debug_assert_eq!(reg.get_class(), expected_reg_class); + } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + Self::Reg { reg } => collector.add_use(*reg), + Self::Mem { addr } => addr.get_regs_as_uses(collector), + Self::Imm { .. } => {} + } + } + + pub(crate) fn to_reg(&self) -> Option<Reg> { + match self { + Self::Reg { reg } => Some(*reg), + _ => None, + } + } +} + +impl PrettyPrint for RegMemImm { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.show_rru_sized(mb_rru, 8) + } +} + +impl PrettyPrintSized for RegMemImm { + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + match self { + Self::Reg { reg } => show_ireg_sized(*reg, mb_rru, size), + Self::Mem { addr } => addr.show_rru(mb_rru), + Self::Imm { simm32 } => format!("${}", *simm32 as i32), + } + } +} + +/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16, +/// 32, 64, or 128 bit value. 
+#[derive(Clone)] +pub enum RegMem { + Reg { reg: Reg }, + Mem { addr: SyntheticAmode }, +} + +impl RegMem { + pub(crate) fn reg(reg: Reg) -> Self { + debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128); + Self::Reg { reg } + } + pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self { + Self::Mem { addr: addr.into() } + } + /// Asserts that in register mode, the reg class is the one that's expected. + pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) { + if let Self::Reg { reg } = self { + debug_assert_eq!(reg.get_class(), expected_reg_class); + } + } + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + RegMem::Reg { reg } => collector.add_use(*reg), + RegMem::Mem { addr, .. } => addr.get_regs_as_uses(collector), + } + } + pub(crate) fn to_reg(&self) -> Option<Reg> { + match self { + RegMem::Reg { reg } => Some(*reg), + _ => None, + } + } +} + +impl From<Writable<Reg>> for RegMem { + fn from(r: Writable<Reg>) -> Self { + RegMem::reg(r.to_reg()) + } +} + +impl PrettyPrint for RegMem { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.show_rru_sized(mb_rru, 8) + } +} + +impl PrettyPrintSized for RegMem { + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + match self { + RegMem::Reg { reg } => show_ireg_sized(*reg, mb_rru, size), + RegMem::Mem { addr, .. } => addr.show_rru(mb_rru), + } + } +} + +/// Some basic ALU operations. TODO: maybe add Adc, Sbb. +#[derive(Copy, Clone, PartialEq)] +pub enum AluRmiROpcode { + Add, + Sub, + And, + Or, + Xor, + /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. + Mul, +} + +impl fmt::Debug for AluRmiROpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + AluRmiROpcode::Add => "add", + AluRmiROpcode::Sub => "sub", + AluRmiROpcode::And => "and", + AluRmiROpcode::Or => "or", + AluRmiROpcode::Xor => "xor", + AluRmiROpcode::Mul => "imul", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for AluRmiROpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +#[derive(Clone, PartialEq)] +pub enum UnaryRmROpcode { + /// Bit-scan reverse. + Bsr, + /// Bit-scan forward. + Bsf, +} + +impl fmt::Debug for UnaryRmROpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + UnaryRmROpcode::Bsr => write!(fmt, "bsr"), + UnaryRmROpcode::Bsf => write!(fmt, "bsf"), + } + } +} + +impl fmt::Display for UnaryRmROpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +pub(crate) enum InstructionSet { + SSE, + SSE2, + SSSE3, + SSE41, + SSE42, +} + +/// Some SSE operations requiring 2 operands r/m and r. 
+#[derive(Clone, Copy, PartialEq)] +pub enum SseOpcode { + Addps, + Addpd, + Addss, + Addsd, + Andps, + Andpd, + Andnps, + Andnpd, + Comiss, + Comisd, + Cmpps, + Cmppd, + Cmpss, + Cmpsd, + Cvtdq2ps, + Cvtsd2ss, + Cvtsd2si, + Cvtsi2ss, + Cvtsi2sd, + Cvtss2si, + Cvtss2sd, + Cvttps2dq, + Cvttss2si, + Cvttsd2si, + Divps, + Divpd, + Divss, + Divsd, + Insertps, + Maxps, + Maxpd, + Maxss, + Maxsd, + Minps, + Minpd, + Minss, + Minsd, + Movaps, + Movapd, + Movd, + Movdqa, + Movdqu, + Movlhps, + Movmskps, + Movmskpd, + Movq, + Movss, + Movsd, + Movups, + Movupd, + Mulps, + Mulpd, + Mulss, + Mulsd, + Orps, + Orpd, + Pabsb, + Pabsw, + Pabsd, + Packsswb, + Paddb, + Paddd, + Paddq, + Paddw, + Paddsb, + Paddsw, + Paddusb, + Paddusw, + Pand, + Pandn, + Pavgb, + Pavgw, + Pcmpeqb, + Pcmpeqw, + Pcmpeqd, + Pcmpeqq, + Pcmpgtb, + Pcmpgtw, + Pcmpgtd, + Pcmpgtq, + Pextrb, + Pextrw, + Pextrd, + Pinsrb, + Pinsrw, + Pinsrd, + Pmaxsb, + Pmaxsw, + Pmaxsd, + Pmaxub, + Pmaxuw, + Pmaxud, + Pminsb, + Pminsw, + Pminsd, + Pminub, + Pminuw, + Pminud, + Pmovmskb, + Pmulld, + Pmullw, + Pmuludq, + Por, + Pshufb, + Pshufd, + Psllw, + Pslld, + Psllq, + Psraw, + Psrad, + Psrlw, + Psrld, + Psrlq, + Psubb, + Psubd, + Psubq, + Psubw, + Psubsb, + Psubsw, + Psubusb, + Psubusw, + Ptest, + Pxor, + Rcpss, + Roundss, + Roundsd, + Rsqrtss, + Sqrtps, + Sqrtpd, + Sqrtss, + Sqrtsd, + Subps, + Subpd, + Subss, + Subsd, + Ucomiss, + Ucomisd, + Xorps, + Xorpd, +} + +impl SseOpcode { + /// Which `InstructionSet` is the first supporting this opcode? + pub(crate) fn available_from(&self) -> InstructionSet { + use InstructionSet::*; + match self { + SseOpcode::Addps + | SseOpcode::Addss + | SseOpcode::Andps + | SseOpcode::Andnps + | SseOpcode::Comiss + | SseOpcode::Cmpps + | SseOpcode::Cmpss + | SseOpcode::Cvtsi2ss + | SseOpcode::Cvtss2si + | SseOpcode::Cvttss2si + | SseOpcode::Divps + | SseOpcode::Divss + | SseOpcode::Maxps + | SseOpcode::Maxss + | SseOpcode::Minps + | SseOpcode::Minss + | SseOpcode::Movaps + | SseOpcode::Movlhps + | SseOpcode::Movmskps + | SseOpcode::Movss + | SseOpcode::Movups + | SseOpcode::Mulps + | SseOpcode::Mulss + | SseOpcode::Orps + | SseOpcode::Rcpss + | SseOpcode::Rsqrtss + | SseOpcode::Sqrtps + | SseOpcode::Sqrtss + | SseOpcode::Subps + | SseOpcode::Subss + | SseOpcode::Ucomiss + | SseOpcode::Xorps => SSE, + + SseOpcode::Addpd + | SseOpcode::Addsd + | SseOpcode::Andpd + | SseOpcode::Andnpd + | SseOpcode::Cmppd + | SseOpcode::Cmpsd + | SseOpcode::Comisd + | SseOpcode::Cvtdq2ps + | SseOpcode::Cvtsd2ss + | SseOpcode::Cvtsd2si + | SseOpcode::Cvtsi2sd + | SseOpcode::Cvtss2sd + | SseOpcode::Cvttps2dq + | SseOpcode::Cvttsd2si + | SseOpcode::Divpd + | SseOpcode::Divsd + | SseOpcode::Maxpd + | SseOpcode::Maxsd + | SseOpcode::Minpd + | SseOpcode::Minsd + | SseOpcode::Movapd + | SseOpcode::Movd + | SseOpcode::Movmskpd + | SseOpcode::Movq + | SseOpcode::Movsd + | SseOpcode::Movupd + | SseOpcode::Movdqa + | SseOpcode::Movdqu + | SseOpcode::Mulpd + | SseOpcode::Mulsd + | SseOpcode::Orpd + | SseOpcode::Packsswb + | SseOpcode::Paddb + | SseOpcode::Paddd + | SseOpcode::Paddq + | SseOpcode::Paddw + | SseOpcode::Paddsb + | SseOpcode::Paddsw + | SseOpcode::Paddusb + | SseOpcode::Paddusw + | SseOpcode::Pand + | SseOpcode::Pandn + | SseOpcode::Pavgb + | SseOpcode::Pavgw + | SseOpcode::Pcmpeqb + | SseOpcode::Pcmpeqw + | SseOpcode::Pcmpeqd + | SseOpcode::Pcmpgtb + | SseOpcode::Pcmpgtw + | SseOpcode::Pcmpgtd + | SseOpcode::Pextrw + | SseOpcode::Pinsrw + | SseOpcode::Pmaxsw + | SseOpcode::Pmaxub + | SseOpcode::Pminsw + | SseOpcode::Pminub + | 
SseOpcode::Pmovmskb + | SseOpcode::Pmullw + | SseOpcode::Pmuludq + | SseOpcode::Por + | SseOpcode::Pshufd + | SseOpcode::Psllw + | SseOpcode::Pslld + | SseOpcode::Psllq + | SseOpcode::Psraw + | SseOpcode::Psrad + | SseOpcode::Psrlw + | SseOpcode::Psrld + | SseOpcode::Psrlq + | SseOpcode::Psubb + | SseOpcode::Psubd + | SseOpcode::Psubq + | SseOpcode::Psubw + | SseOpcode::Psubsb + | SseOpcode::Psubsw + | SseOpcode::Psubusb + | SseOpcode::Psubusw + | SseOpcode::Pxor + | SseOpcode::Sqrtpd + | SseOpcode::Sqrtsd + | SseOpcode::Subpd + | SseOpcode::Subsd + | SseOpcode::Ucomisd + | SseOpcode::Xorpd => SSE2, + + SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3, + + SseOpcode::Insertps + | SseOpcode::Pcmpeqq + | SseOpcode::Pextrb + | SseOpcode::Pextrd + | SseOpcode::Pinsrb + | SseOpcode::Pinsrd + | SseOpcode::Pmaxsb + | SseOpcode::Pmaxsd + | SseOpcode::Pmaxuw + | SseOpcode::Pmaxud + | SseOpcode::Pminsb + | SseOpcode::Pminsd + | SseOpcode::Pminuw + | SseOpcode::Pminud + | SseOpcode::Pmulld + | SseOpcode::Ptest + | SseOpcode::Roundss + | SseOpcode::Roundsd => SSE41, + + SseOpcode::Pcmpgtq => SSE42, + } + } + + /// Returns the src operand size for an instruction. + pub(crate) fn src_size(&self) -> u8 { + match self { + SseOpcode::Movd => 4, + _ => 8, + } + } +} + +impl fmt::Debug for SseOpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + SseOpcode::Addps => "addps", + SseOpcode::Addpd => "addpd", + SseOpcode::Addss => "addss", + SseOpcode::Addsd => "addsd", + SseOpcode::Andpd => "andpd", + SseOpcode::Andps => "andps", + SseOpcode::Andnps => "andnps", + SseOpcode::Andnpd => "andnpd", + SseOpcode::Cmpps => "cmpps", + SseOpcode::Cmppd => "cmppd", + SseOpcode::Cmpss => "cmpss", + SseOpcode::Cmpsd => "cmpsd", + SseOpcode::Comiss => "comiss", + SseOpcode::Comisd => "comisd", + SseOpcode::Cvtdq2ps => "cvtdq2ps", + SseOpcode::Cvtsd2ss => "cvtsd2ss", + SseOpcode::Cvtsd2si => "cvtsd2si", + SseOpcode::Cvtsi2ss => "cvtsi2ss", + SseOpcode::Cvtsi2sd => "cvtsi2sd", + SseOpcode::Cvtss2si => "cvtss2si", + SseOpcode::Cvtss2sd => "cvtss2sd", + SseOpcode::Cvttps2dq => "cvttps2dq", + SseOpcode::Cvttss2si => "cvttss2si", + SseOpcode::Cvttsd2si => "cvttsd2si", + SseOpcode::Divps => "divps", + SseOpcode::Divpd => "divpd", + SseOpcode::Divss => "divss", + SseOpcode::Divsd => "divsd", + SseOpcode::Insertps => "insertps", + SseOpcode::Maxps => "maxps", + SseOpcode::Maxpd => "maxpd", + SseOpcode::Maxss => "maxss", + SseOpcode::Maxsd => "maxsd", + SseOpcode::Minps => "minps", + SseOpcode::Minpd => "minpd", + SseOpcode::Minss => "minss", + SseOpcode::Minsd => "minsd", + SseOpcode::Movaps => "movaps", + SseOpcode::Movapd => "movapd", + SseOpcode::Movd => "movd", + SseOpcode::Movdqa => "movdqa", + SseOpcode::Movdqu => "movdqu", + SseOpcode::Movlhps => "movlhps", + SseOpcode::Movmskps => "movmskps", + SseOpcode::Movmskpd => "movmskpd", + SseOpcode::Movq => "movq", + SseOpcode::Movss => "movss", + SseOpcode::Movsd => "movsd", + SseOpcode::Movups => "movups", + SseOpcode::Movupd => "movupd", + SseOpcode::Mulps => "mulps", + SseOpcode::Mulpd => "mulpd", + SseOpcode::Mulss => "mulss", + SseOpcode::Mulsd => "mulsd", + SseOpcode::Orpd => "orpd", + SseOpcode::Orps => "orps", + SseOpcode::Pabsb => "pabsb", + SseOpcode::Pabsw => "pabsw", + SseOpcode::Pabsd => "pabsd", + SseOpcode::Packsswb => "packsswb", + SseOpcode::Paddb => "paddb", + SseOpcode::Paddd => "paddd", + SseOpcode::Paddq => "paddq", + SseOpcode::Paddw => "paddw", + SseOpcode::Paddsb => "paddsb", + 
SseOpcode::Paddsw => "paddsw", + SseOpcode::Paddusb => "paddusb", + SseOpcode::Paddusw => "paddusw", + SseOpcode::Pand => "pand", + SseOpcode::Pandn => "pandn", + SseOpcode::Pavgb => "pavgb", + SseOpcode::Pavgw => "pavgw", + SseOpcode::Pcmpeqb => "pcmpeqb", + SseOpcode::Pcmpeqw => "pcmpeqw", + SseOpcode::Pcmpeqd => "pcmpeqd", + SseOpcode::Pcmpeqq => "pcmpeqq", + SseOpcode::Pcmpgtb => "pcmpgtb", + SseOpcode::Pcmpgtw => "pcmpgtw", + SseOpcode::Pcmpgtd => "pcmpgtd", + SseOpcode::Pcmpgtq => "pcmpgtq", + SseOpcode::Pextrb => "pextrb", + SseOpcode::Pextrw => "pextrw", + SseOpcode::Pextrd => "pextrd", + SseOpcode::Pinsrb => "pinsrb", + SseOpcode::Pinsrw => "pinsrw", + SseOpcode::Pinsrd => "pinsrd", + SseOpcode::Pmaxsb => "pmaxsb", + SseOpcode::Pmaxsw => "pmaxsw", + SseOpcode::Pmaxsd => "pmaxsd", + SseOpcode::Pmaxub => "pmaxub", + SseOpcode::Pmaxuw => "pmaxuw", + SseOpcode::Pmaxud => "pmaxud", + SseOpcode::Pminsb => "pminsb", + SseOpcode::Pminsw => "pminsw", + SseOpcode::Pminsd => "pminsd", + SseOpcode::Pminub => "pminub", + SseOpcode::Pminuw => "pminuw", + SseOpcode::Pminud => "pminud", + SseOpcode::Pmovmskb => "pmovmskb", + SseOpcode::Pmulld => "pmulld", + SseOpcode::Pmullw => "pmullw", + SseOpcode::Pmuludq => "pmuludq", + SseOpcode::Por => "por", + SseOpcode::Pshufb => "pshufb", + SseOpcode::Pshufd => "pshufd", + SseOpcode::Psllw => "psllw", + SseOpcode::Pslld => "pslld", + SseOpcode::Psllq => "psllq", + SseOpcode::Psraw => "psraw", + SseOpcode::Psrad => "psrad", + SseOpcode::Psrlw => "psrlw", + SseOpcode::Psrld => "psrld", + SseOpcode::Psrlq => "psrlq", + SseOpcode::Psubb => "psubb", + SseOpcode::Psubd => "psubd", + SseOpcode::Psubq => "psubq", + SseOpcode::Psubw => "psubw", + SseOpcode::Psubsb => "psubsb", + SseOpcode::Psubsw => "psubsw", + SseOpcode::Psubusb => "psubusb", + SseOpcode::Psubusw => "psubusw", + SseOpcode::Ptest => "ptest", + SseOpcode::Pxor => "pxor", + SseOpcode::Rcpss => "rcpss", + SseOpcode::Roundss => "roundss", + SseOpcode::Roundsd => "roundsd", + SseOpcode::Rsqrtss => "rsqrtss", + SseOpcode::Sqrtps => "sqrtps", + SseOpcode::Sqrtpd => "sqrtpd", + SseOpcode::Sqrtss => "sqrtss", + SseOpcode::Sqrtsd => "sqrtsd", + SseOpcode::Subps => "subps", + SseOpcode::Subpd => "subpd", + SseOpcode::Subss => "subss", + SseOpcode::Subsd => "subsd", + SseOpcode::Ucomiss => "ucomiss", + SseOpcode::Ucomisd => "ucomisd", + SseOpcode::Xorps => "xorps", + SseOpcode::Xorpd => "xorpd", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for SseOpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// This defines the ways a value can be extended: either signed- or zero-extension, or none for +/// types that are not extended. Contrast with [ExtMode], which defines the widths from and to which +/// values can be extended. +#[derive(Clone, PartialEq)] +pub enum ExtKind { + None, + SignExtend, + ZeroExtend, +} + +/// These indicate ways of extending (widening) a value, using the Intel +/// naming: B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64 +#[derive(Clone, PartialEq)] +pub enum ExtMode { + /// Byte -> Longword. + BL, + /// Byte -> Quadword. + BQ, + /// Word -> Longword. + WL, + /// Word -> Quadword. + WQ, + /// Longword -> Quadword. + LQ, +} + +impl ExtMode { + /// Calculate the `ExtMode` from passed bit lengths of the from/to types. 
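+    /// For example, `new(8, 64)` is `Some(ExtMode::BQ)`, `new(16, 32)` is `Some(ExtMode::WL)`,
+    /// and an unsupported pair such as `new(8, 8)` is `None`.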
+ pub(crate) fn new(from_bits: u16, to_bits: u16) -> Option<ExtMode> { + match (from_bits, to_bits) { + (1, 8) | (1, 16) | (1, 32) | (8, 16) | (8, 32) => Some(ExtMode::BL), + (1, 64) | (8, 64) => Some(ExtMode::BQ), + (16, 32) => Some(ExtMode::WL), + (16, 64) => Some(ExtMode::WQ), + (32, 64) => Some(ExtMode::LQ), + _ => None, + } + } + + /// Return the source register size in bytes. + pub(crate) fn src_size(&self) -> u8 { + match self { + ExtMode::BL | ExtMode::BQ => 1, + ExtMode::WL | ExtMode::WQ => 2, + ExtMode::LQ => 4, + } + } + + /// Return the destination register size in bytes. + pub(crate) fn dst_size(&self) -> u8 { + match self { + ExtMode::BL | ExtMode::WL => 4, + ExtMode::BQ | ExtMode::WQ | ExtMode::LQ => 8, + } + } +} + +impl fmt::Debug for ExtMode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + ExtMode::BL => "bl", + ExtMode::BQ => "bq", + ExtMode::WL => "wl", + ExtMode::WQ => "wq", + ExtMode::LQ => "lq", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for ExtMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right. +#[derive(Clone)] +pub enum ShiftKind { + ShiftLeft, + /// Inserts zeros in the most significant bits. + ShiftRightLogical, + /// Replicates the sign bit in the most significant bits. + ShiftRightArithmetic, + RotateLeft, + RotateRight, +} + +impl fmt::Debug for ShiftKind { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + ShiftKind::ShiftLeft => "shl", + ShiftKind::ShiftRightLogical => "shr", + ShiftKind::ShiftRightArithmetic => "sar", + ShiftKind::RotateLeft => "rol", + ShiftKind::RotateRight => "ror", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for ShiftKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// What kind of division or remainer instruction this is? +#[derive(Clone)] +pub enum DivOrRemKind { + SignedDiv, + UnsignedDiv, + SignedRem, + UnsignedRem, +} + +impl DivOrRemKind { + pub(crate) fn is_signed(&self) -> bool { + match self { + DivOrRemKind::SignedDiv | DivOrRemKind::SignedRem => true, + _ => false, + } + } + + pub(crate) fn is_div(&self) -> bool { + match self { + DivOrRemKind::SignedDiv | DivOrRemKind::UnsignedDiv => true, + _ => false, + } + } +} + +/// These indicate condition code tests. Not all are represented since not all are useful in +/// compiler-generated code. 
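+/// The discriminant of each variant is the condition-code nibble used in the `Jcc`, `SETcc` and
+/// `CMOVcc` encodings; for instance, a near `jz` is emitted as `0F 84`, i.e. `0x80 + CC::Z`.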
+#[derive(Copy, Clone)] +#[repr(u8)] +pub enum CC { + /// overflow + O = 0, + /// no overflow + NO = 1, + + /// < unsigned + B = 2, + /// >= unsigned + NB = 3, + + /// zero + Z = 4, + /// not-zero + NZ = 5, + + /// <= unsigned + BE = 6, + /// > unsigned + NBE = 7, + + /// negative + S = 8, + /// not-negative + NS = 9, + + /// < signed + L = 12, + /// >= signed + NL = 13, + + /// <= signed + LE = 14, + /// > signed + NLE = 15, + + /// parity + P = 10, + + /// not parity + NP = 11, +} + +impl CC { + pub(crate) fn from_intcc(intcc: IntCC) -> Self { + match intcc { + IntCC::Equal => CC::Z, + IntCC::NotEqual => CC::NZ, + IntCC::SignedGreaterThanOrEqual => CC::NL, + IntCC::SignedGreaterThan => CC::NLE, + IntCC::SignedLessThanOrEqual => CC::LE, + IntCC::SignedLessThan => CC::L, + IntCC::UnsignedGreaterThanOrEqual => CC::NB, + IntCC::UnsignedGreaterThan => CC::NBE, + IntCC::UnsignedLessThanOrEqual => CC::BE, + IntCC::UnsignedLessThan => CC::B, + IntCC::Overflow => CC::O, + IntCC::NotOverflow => CC::NO, + } + } + + pub(crate) fn invert(&self) -> Self { + match self { + CC::O => CC::NO, + CC::NO => CC::O, + + CC::B => CC::NB, + CC::NB => CC::B, + + CC::Z => CC::NZ, + CC::NZ => CC::Z, + + CC::BE => CC::NBE, + CC::NBE => CC::BE, + + CC::S => CC::NS, + CC::NS => CC::S, + + CC::L => CC::NL, + CC::NL => CC::L, + + CC::LE => CC::NLE, + CC::NLE => CC::LE, + + CC::P => CC::NP, + CC::NP => CC::P, + } + } + + pub(crate) fn from_floatcc(floatcc: FloatCC) -> Self { + match floatcc { + FloatCC::Ordered => CC::NP, + FloatCC::Unordered => CC::P, + // Alias for NE + FloatCC::OrderedNotEqual => CC::NZ, + // Alias for E + FloatCC::UnorderedOrEqual => CC::Z, + // Alias for A + FloatCC::GreaterThan => CC::NBE, + // Alias for AE + FloatCC::GreaterThanOrEqual => CC::NB, + FloatCC::UnorderedOrLessThan => CC::B, + FloatCC::UnorderedOrLessThanOrEqual => CC::BE, + FloatCC::Equal + | FloatCC::NotEqual + | FloatCC::LessThan + | FloatCC::LessThanOrEqual + | FloatCC::UnorderedOrGreaterThan + | FloatCC::UnorderedOrGreaterThanOrEqual => panic!( + "{:?} can't be lowered to a CC code; treat as special case.", + floatcc + ), + } + } + + pub(crate) fn get_enc(self) -> u8 { + self as u8 + } +} + +impl fmt::Debug for CC { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + CC::O => "o", + CC::NO => "no", + CC::B => "b", + CC::NB => "nb", + CC::Z => "z", + CC::NZ => "nz", + CC::BE => "be", + CC::NBE => "nbe", + CC::S => "s", + CC::NS => "ns", + CC::L => "l", + CC::NL => "nl", + CC::LE => "le", + CC::NLE => "nle", + CC::P => "p", + CC::NP => "np", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for CC { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// Encode the ways that floats can be compared. This is used in float comparisons such as `cmpps`, +/// e.g.; it is distinguished from other float comparisons (e.g. `ucomiss`) in that those use EFLAGS +/// whereas [FcmpImm] is used as an immediate. 
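+/// For example, an immediate of `0x02` (`LessThanOrEqual`) selects the "less than or equal"
+/// predicate in `cmpps`/`cmpsd` and the other members of that family.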
+pub(crate) enum FcmpImm { + Equal = 0x00, + LessThan = 0x01, + LessThanOrEqual = 0x02, + Unordered = 0x03, + NotEqual = 0x04, + UnorderedOrGreaterThanOrEqual = 0x05, + UnorderedOrGreaterThan = 0x06, + Ordered = 0x07, +} + +impl FcmpImm { + pub(crate) fn encode(self) -> u8 { + self as u8 + } +} + +impl From<FloatCC> for FcmpImm { + fn from(cond: FloatCC) -> Self { + match cond { + FloatCC::Equal => FcmpImm::Equal, + FloatCC::LessThan => FcmpImm::LessThan, + FloatCC::LessThanOrEqual => FcmpImm::LessThanOrEqual, + FloatCC::Unordered => FcmpImm::Unordered, + FloatCC::NotEqual => FcmpImm::NotEqual, + FloatCC::UnorderedOrGreaterThanOrEqual => FcmpImm::UnorderedOrGreaterThanOrEqual, + FloatCC::UnorderedOrGreaterThan => FcmpImm::UnorderedOrGreaterThan, + FloatCC::Ordered => FcmpImm::Ordered, + _ => panic!("unable to create comparison predicate for {}", cond), + } + } +} + +/// An operand's size in bits. +#[derive(Clone, Copy, PartialEq)] +pub enum OperandSize { + Size32, + Size64, +} + +impl OperandSize { + pub(crate) fn from_bytes(num_bytes: u32) -> Self { + match num_bytes { + 1 | 2 | 4 => OperandSize::Size32, + 8 => OperandSize::Size64, + _ => unreachable!(), + } + } + + pub(crate) fn to_bytes(&self) -> u8 { + match self { + Self::Size32 => 4, + Self::Size64 => 8, + } + } + + pub(crate) fn to_bits(&self) -> u8 { + match self { + Self::Size32 => 32, + Self::Size64 => 64, + } + } +} + +/// An x64 memory fence kind. +#[derive(Clone)] +#[allow(dead_code)] +pub enum FenceKind { + /// `mfence` instruction ("Memory Fence") + MFence, + /// `lfence` instruction ("Load Fence") + LFence, + /// `sfence` instruction ("Store Fence") + SFence, +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs new file mode 100644 index 0000000000..dd4125a2da --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs @@ -0,0 +1,2819 @@ +use crate::binemit::{Addend, Reloc}; +use crate::ir::immediates::{Ieee32, Ieee64}; +use crate::ir::TrapCode; +use crate::isa::x64::inst::args::*; +use crate::isa::x64::inst::*; +use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel}; +use core::convert::TryInto; +use log::debug; +use regalloc::{Reg, RegClass, Writable}; + +fn low8_will_sign_extend_to_64(x: u32) -> bool { + let xs = (x as i32) as i64; + xs == ((xs << 56) >> 56) +} + +fn low8_will_sign_extend_to_32(x: u32) -> bool { + let xs = x as i32; + xs == ((xs << 24) >> 24) +} + +//============================================================================= +// Instructions and subcomponents: emission + +// For all of the routines that take both a memory-or-reg operand (sometimes +// called "E" in the Intel documentation) and a reg-only operand ("G" in +// Intelese), the order is always G first, then E. +// +// "enc" in the following means "hardware register encoding number". + +#[inline(always)] +fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 { + debug_assert!(m0d < 4); + debug_assert!(enc_reg_g < 8); + debug_assert!(rm_e < 8); + ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7) +} + +#[inline(always)] +fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 { + debug_assert!(shift < 4); + debug_assert!(enc_index < 8); + debug_assert!(enc_base < 8); + ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7) +} + +/// Get the encoding number of a GPR. 
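+/// (rax = 0, rcx = 1, rdx = 2, rbx = 3, rsp = 4, rbp = 5, rsi = 6, rdi = 7, r8..r15 = 8..15.)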
+#[inline(always)] +fn int_reg_enc(reg: Reg) -> u8 { + debug_assert!(reg.is_real()); + debug_assert_eq!(reg.get_class(), RegClass::I64); + reg.get_hw_encoding() +} + +/// Get the encoding number of any register. +#[inline(always)] +fn reg_enc(reg: Reg) -> u8 { + debug_assert!(reg.is_real()); + reg.get_hw_encoding() +} + +/// A small bit field to record a REX prefix specification: +/// - bit 0 set to 1 indicates REX.W must be 0 (cleared). +/// - bit 1 set to 1 indicates the REX prefix must always be emitted. +#[repr(transparent)] +#[derive(Clone, Copy)] +struct RexFlags(u8); + +impl RexFlags { + /// By default, set the W field, and don't always emit. + #[inline(always)] + fn set_w() -> Self { + Self(0) + } + /// Creates a new RexPrefix for which the REX.W bit will be cleared. + #[inline(always)] + fn clear_w() -> Self { + Self(1) + } + + #[inline(always)] + fn always_emit(&mut self) -> &mut Self { + self.0 = self.0 | 2; + self + } + + #[inline(always)] + fn must_clear_w(&self) -> bool { + (self.0 & 1) != 0 + } + #[inline(always)] + fn must_always_emit(&self) -> bool { + (self.0 & 2) != 0 + } + + #[inline(always)] + fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) { + let w = if self.must_clear_w() { 0 } else { 1 }; + let r = (enc_g >> 3) & 1; + let x = 0; + let b = (enc_e >> 3) & 1; + let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b; + if rex != 0x40 || self.must_always_emit() { + sink.put1(rex); + } + } + + #[inline(always)] + fn emit_three_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_index: u8, enc_base: u8) { + let w = if self.must_clear_w() { 0 } else { 1 }; + let r = (enc_g >> 3) & 1; + let x = (enc_index >> 3) & 1; + let b = (enc_base >> 3) & 1; + let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b; + if rex != 0x40 || self.must_always_emit() { + sink.put1(rex); + } + } +} + +/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum +/// covers only the small set of possibilities that we actually need. +enum LegacyPrefixes { + /// No prefix bytes + None, + /// Operand Size Override -- here, denoting "16-bit operation" + _66, + /// The Lock prefix + _F0, + /// Operand size override and Lock + _66F0, + /// REPNE, but no specific meaning here -- is just an opcode extension + _F2, + /// REP/REPE, but no specific meaning here -- is just an opcode extension + _F3, +} + +impl LegacyPrefixes { + #[inline(always)] + fn emit(&self, sink: &mut MachBuffer<Inst>) { + match self { + LegacyPrefixes::_66 => sink.put1(0x66), + LegacyPrefixes::_F0 => sink.put1(0xF0), + LegacyPrefixes::_66F0 => { + // I don't think the order matters, but in any case, this is the same order that + // the GNU assembler uses. + sink.put1(0x66); + sink.put1(0xF0); + } + LegacyPrefixes::_F2 => sink.put1(0xF2), + LegacyPrefixes::_F3 => sink.put1(0xF3), + LegacyPrefixes::None => (), + } + } +} + +/// This is the core 'emit' function for instructions that reference memory. +/// +/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`, +/// create and emit: +/// - first the legacy prefixes, if any +/// - then the REX prefix, if needed +/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`), +/// - then the MOD/RM byte, +/// - then optionally, a SIB byte, +/// - and finally optionally an immediate that will be derived from the `mem_e` operand. 
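+///
+/// By way of illustration (worked out by hand rather than taken from this file's tests): a
+/// 64-bit `add %rax, 16(%rdi)` (opcode byte 0x01, G = rax, E = [rdi + 16], no legacy prefixes)
+/// is emitted as `48 01 47 10`: REX.W, the opcode, a ModRM byte with mod=01/reg=rax/rm=rdi, and
+/// the 8-bit displacement 0x10.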
+/// +/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is +/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed +/// instructions will require their own emitter functions. +/// +/// This will also work for 32-bits x86 instructions, assuming no REX prefix is provided. +/// +/// The opcodes are written bigendianly for the convenience of callers. For example, if the opcode +/// bytes to be emitted are, in this order, F3 0F 27, then the caller should pass `opcodes` == +/// 0xF3_0F_27 and `num_opcodes` == 3. +/// +/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`. +/// `rex` can specify special handling for the REX prefix. By default, the REX prefix will +/// indicate a 64-bit operation and will be deleted if it is redundant (0x40). Note that for a +/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to +/// indicate a 64-bit operation. +fn emit_std_enc_mem( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + prefixes: LegacyPrefixes, + opcodes: u32, + mut num_opcodes: usize, + enc_g: u8, + mem_e: &Amode, + rex: RexFlags, +) { + // General comment for this function: the registers in `mem_e` must be + // 64-bit integer registers, because they are part of an address + // expression. But `enc_g` can be derived from a register of any class. + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && mem_e.can_trap() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + prefixes.emit(sink); + + match mem_e { + Amode::ImmReg { simm32, base, .. } => { + // First, the REX byte. + let enc_e = int_reg_enc(*base); + rex.emit_two_op(sink, enc_g, enc_e); + + // Now the opcode(s). These include any other prefixes the caller + // hands to us. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // Now the mod/rm and associated immediates. This is + // significantly complicated due to the multiple special cases. + if *simm32 == 0 + && enc_e != regs::ENC_RSP + && enc_e != regs::ENC_RBP + && enc_e != regs::ENC_R12 + && enc_e != regs::ENC_R13 + { + // FIXME JRS 2020Feb11: those four tests can surely be + // replaced by a single mask-and-compare check. We should do + // that because this routine is likely to be hot. + sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7)); + } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) { + sink.put1(encode_modrm(0, enc_g & 7, 4)); + sink.put1(0x24); + } else if low8_will_sign_extend_to_32(*simm32) + && enc_e != regs::ENC_RSP + && enc_e != regs::ENC_R12 + { + sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7)); + sink.put1((simm32 & 0xFF) as u8); + } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 { + sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7)); + sink.put4(*simm32); + } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) + && low8_will_sign_extend_to_32(*simm32) + { + // REX.B distinguishes RSP from R12 + sink.put1(encode_modrm(1, enc_g & 7, 4)); + sink.put1(0x24); + sink.put1((simm32 & 0xFF) as u8); + } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP { + //.. wait for test case for RSP case + // REX.B distinguishes RSP from R12 + sink.put1(encode_modrm(2, enc_g & 7, 4)); + sink.put1(0x24); + sink.put4(*simm32); + } else { + unreachable!("ImmReg"); + } + } + + Amode::ImmRegRegShift { + simm32, + base: reg_base, + index: reg_index, + shift, + .. 
+ } => { + let enc_base = int_reg_enc(*reg_base); + let enc_index = int_reg_enc(*reg_index); + + // The rex byte. + rex.emit_three_op(sink, enc_g, enc_index, enc_base); + + // All other prefixes and opcodes. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // modrm, SIB, immediates. + if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP { + sink.put1(encode_modrm(1, enc_g & 7, 4)); + sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7)); + sink.put1(*simm32 as u8); + } else if enc_index != regs::ENC_RSP { + sink.put1(encode_modrm(2, enc_g & 7, 4)); + sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7)); + sink.put4(*simm32); + } else { + panic!("ImmRegRegShift"); + } + } + + Amode::RipRelative { ref target } => { + // First, the REX byte, with REX.B = 0. + rex.emit_two_op(sink, enc_g, 0); + + // Now the opcode(s). These include any other prefixes the caller + // hands to us. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // RIP-relative is mod=00, rm=101. + sink.put1(encode_modrm(0, enc_g & 7, 0b101)); + + let offset = sink.cur_offset(); + sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32); + sink.put4(0); + } + } +} + +/// This is the core 'emit' function for instructions that do not reference memory. +/// +/// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E +/// operand is a register rather than memory. Hence it is much simpler. +fn emit_std_enc_enc( + sink: &mut MachBuffer<Inst>, + prefixes: LegacyPrefixes, + opcodes: u32, + mut num_opcodes: usize, + enc_g: u8, + enc_e: u8, + rex: RexFlags, +) { + // EncG and EncE can be derived from registers of any class, and they + // don't even have to be from the same class. For example, for an + // integer-to-FP conversion insn, one might be RegClass::I64 and the other + // RegClass::V128. + + // The legacy prefixes. + prefixes.emit(sink); + + // The rex byte. + rex.emit_two_op(sink, enc_g, enc_e); + + // All other prefixes and opcodes. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // Now the mod/rm byte. The instruction we're generating doesn't access + // memory, so there is no SIB byte or immediate -- we're done. + sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7)); +} + +// These are merely wrappers for the above two functions that facilitate passing +// actual `Reg`s rather than their encodings. + +fn emit_std_reg_mem( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + prefixes: LegacyPrefixes, + opcodes: u32, + num_opcodes: usize, + reg_g: Reg, + mem_e: &Amode, + rex: RexFlags, +) { + let enc_g = reg_enc(reg_g); + emit_std_enc_mem( + sink, + state, + prefixes, + opcodes, + num_opcodes, + enc_g, + mem_e, + rex, + ); +} + +fn emit_std_reg_reg( + sink: &mut MachBuffer<Inst>, + prefixes: LegacyPrefixes, + opcodes: u32, + num_opcodes: usize, + reg_g: Reg, + reg_e: Reg, + rex: RexFlags, +) { + let enc_g = reg_enc(reg_g); + let enc_e = reg_enc(reg_e); + emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex); +} + +/// Write a suitable number of bits from an imm64 to the sink. +fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) { + match size { + 8 | 4 => sink.put4(simm32), + 2 => sink.put2(simm32 as u16), + 1 => sink.put1(simm32 as u8), + _ => unreachable!(), + } +} + +/// A small helper to generate a signed conversion instruction. 
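+/// (Concretely: a `cvtsi2ss` or `cvtsi2sd` whose source is a 64-bit GPR.)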
+fn emit_signed_cvt( + sink: &mut MachBuffer<Inst>, + info: &EmitInfo, + state: &mut EmitState, + src: Reg, + dst: Writable<Reg>, + to_f64: bool, +) { + // Handle an unsigned int, which is the "easy" case: a signed conversion will do the + // right thing. + let op = if to_f64 { + SseOpcode::Cvtsi2sd + } else { + SseOpcode::Cvtsi2ss + }; + let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst); + inst.emit(sink, info, state); +} + +/// Emits a one way conditional jump if CC is set (true). +fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) { + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32); + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + sink.put4(0x0); +} + +/// Emits a relocation, attaching the current source location as well. +fn emit_reloc( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + kind: Reloc, + name: &ExternalName, + addend: Addend, +) { + let srcloc = state.cur_srcloc(); + sink.add_reloc(srcloc, kind, name, addend); +} + +/// The top-level emit function. +/// +/// Important! Do not add improved (shortened) encoding cases to existing +/// instructions without also adding tests for those improved encodings. That +/// is a dangerous game that leads to hard-to-track-down errors in the emitted +/// code. +/// +/// For all instructions, make sure to have test coverage for all of the +/// following situations. Do this by creating the cross product resulting from +/// applying the following rules to each operand: +/// +/// (1) for any insn that mentions a register: one test using a register from +/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one +/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15]. +/// This helps detect incorrect REX prefix construction. +/// +/// (2) for any insn that mentions a byte register: one test for each of the +/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil], +/// [r8b .. r11b] and [r12b .. r15b]. This checks that +/// apparently-redundant REX prefixes are retained when required. +/// +/// (3) for any insn that contains an immediate field, check the following +/// cases: field is zero, field is in simm8 range (-128 .. 127), field is +/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some +/// instructions that require a 32-bit immediate have a short-form encoding +/// when the imm is in simm8 range. +/// +/// Rules (1), (2) and (3) don't apply for registers within address expressions +/// (`Addr`s). Those are already pretty well tested, and the registers in them +/// don't have any effect on the containing instruction (apart from possibly +/// require REX prefix bits). +/// +/// When choosing registers for a test, avoid using registers with the same +/// offset within a given group. For example, don't use rax and r8, since they +/// both have the lowest 3 bits as 000, and so the test won't detect errors +/// where those 3-bit register sub-fields are confused by the emitter. Instead +/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl +/// and bpl since they have the same offset in their group; use instead (eg) cl +/// and sil. +/// +/// For all instructions, also add a test that uses only low-half registers +/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX +/// prefixes are correctly omitted. 
This low-half restriction must apply to +/// _all_ registers in the insn, even those in address expressions. +/// +/// Following these rules creates large numbers of test cases, but it's the +/// only way to make the emitter reliable. +/// +/// Known possible improvements: +/// +/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we +/// care?) +pub(crate) fn emit( + inst: &Inst, + sink: &mut MachBuffer<Inst>, + info: &EmitInfo, + state: &mut EmitState, +) { + if let Some(iset_requirement) = inst.isa_requirement() { + match iset_requirement { + // Cranelift assumes SSE2 at least. + InstructionSet::SSE | InstructionSet::SSE2 => {} + InstructionSet::SSSE3 => assert!(info.isa_flags.has_ssse3()), + InstructionSet::SSE41 => assert!(info.isa_flags.has_sse41()), + InstructionSet::SSE42 => assert!(info.isa_flags.has_sse42()), + } + } + + match inst { + Inst::AluRmiR { + is_64, + op, + src, + dst: reg_g, + } => { + let rex = if *is_64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + + if *op == AluRmiROpcode::Mul { + // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so + // we have to special-case it. + match src { + RegMemImm::Reg { reg: reg_e } => { + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + 0x0FAF, + 2, + reg_g.to_reg(), + *reg_e, + rex, + ); + } + + RegMemImm::Mem { addr } => { + let amode = addr.finalize(state); + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x0FAF, + 2, + reg_g.to_reg(), + &amode, + rex, + ); + } + + RegMemImm::Imm { simm32 } => { + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + let opcode = if use_imm8 { 0x6B } else { 0x69 }; + // Yes, really, reg_g twice. + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcode, + 1, + reg_g.to_reg(), + reg_g.to_reg(), + rex, + ); + emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32); + } + } + } else { + let (opcode_r, opcode_m, subopcode_i) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0), + AluRmiROpcode::Sub => (0x29, 0x2B, 5), + AluRmiROpcode::And => (0x21, 0x23, 4), + AluRmiROpcode::Or => (0x09, 0x0B, 1), + AluRmiROpcode::Xor => (0x31, 0x33, 6), + AluRmiROpcode::Mul => panic!("unreachable"), + }; + + match src { + RegMemImm::Reg { reg: reg_e } => { + // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R + // duality). Do this too, so as to be able to compare generated machine + // code easily. + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcode_r, + 1, + *reg_e, + reg_g.to_reg(), + rex, + ); + // NB: if this is ever extended to handle byte size ops, be sure to retain + // redundant REX prefixes. + } + + RegMemImm::Mem { addr } => { + // Here we revert to the "normal" G-E ordering. + let amode = addr.finalize(state); + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcode_m, + 1, + reg_g.to_reg(), + &amode, + rex, + ); + } + + RegMemImm::Imm { simm32 } => { + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + let opcode = if use_imm8 { 0x83 } else { 0x81 }; + // And also here we use the "normal" G-E ordering. 
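+                        // In the 0x81/0x83 group-1 forms the ModRM reg field carries the
+                        // sub-opcode and the r/m field carries the destination register,
+                        // which is why `subopcode_i` is passed in the "G" position here.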
+ let enc_g = int_reg_enc(reg_g.to_reg()); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + opcode, + 1, + subopcode_i, + enc_g, + rex, + ); + emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32); + } + } + } + } + + Inst::UnaryRmR { size, op, src, dst } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!(), + }; + + let (opcode, num_opcodes) = match op { + UnaryRmROpcode::Bsr => (0x0fbd, 2), + UnaryRmROpcode::Bsf => (0x0fbc, 2), + }; + + match src { + RegMem::Reg { reg: src } => emit_std_reg_reg( + sink, + prefix, + opcode, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ), + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + num_opcodes, + dst.to_reg(), + &amode, + rex_flags, + ); + } + } + } + + Inst::Not { size, src } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let subopcode = 2; + let src = int_reg_enc(src.to_reg()); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + + Inst::Neg { size, src } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let subopcode = 3; + let src = int_reg_enc(src.to_reg()); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + + Inst::Div { + size, + signed, + divisor, + } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let loc = state.cur_srcloc(); + sink.add_trap(loc, TrapCode::IntegerDivisionByZero); + + let subopcode = if *signed { 7 } else { 6 }; + match divisor { + RegMem::Reg { reg } => { + let src = int_reg_enc(*reg); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_enc_mem(sink, state, prefix, opcode, 1, subopcode, &amode, rex_flags); + } + } + } + + Inst::MulHi { size, signed, rhs } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!(), + }; + + let subopcode = if *signed { 5 } else { 4 }; + match rhs { + RegMem::Reg { reg } => { + let src = int_reg_enc(*reg); + emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags) + } + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_enc_mem(sink, state, prefix, 0xF7, 1, subopcode, &amode, rex_flags); + } + } + } + + Inst::SignExtendData { size } => match size { + 1 => { + sink.put1(0x66); + sink.put1(0x98); + } + 2 => { + sink.put1(0x66); + sink.put1(0x99); + } + 4 => sink.put1(0x99), + 8 => { 
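+                // CQO (`48 99`): sign-extend RAX into RDX:RAX.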
+ sink.put1(0x48); + sink.put1(0x99); + } + _ => unreachable!(), + }, + + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + tmp, + } => { + // Generates the following code sequence: + // + // ;; check divide by zero: + // cmp 0 %divisor + // jnz $after_trap + // ud2 + // $after_trap: + // + // ;; for signed modulo/div: + // cmp -1 %divisor + // jnz $do_op + // ;; for signed modulo, result is 0 + // mov #0, %rdx + // j $done + // ;; for signed div, check for integer overflow against INT_MIN of the right size + // cmp INT_MIN, %rax + // jnz $do_op + // ud2 + // + // $do_op: + // ;; if signed + // cdq ;; sign-extend from rax into rdx + // ;; else + // mov #0, %rdx + // idiv %divisor + // + // $done: + debug_assert!(info.flags().avoid_div_traps()); + + // Check if the divisor is zero, first. + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0), divisor.to_reg()); + inst.emit(sink, info, state); + + let inst = Inst::trap_if(CC::Z, TrapCode::IntegerDivisionByZero); + inst.emit(sink, info, state); + + let (do_op, done_label) = if kind.is_signed() { + // Now check if the divisor is -1. + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0xffffffff), divisor.to_reg()); + inst.emit(sink, info, state); + + let do_op = sink.get_label(); + + // If not equal, jump to do-op. + one_way_jmp(sink, CC::NZ, do_op); + + // Here, divisor == -1. + if !kind.is_div() { + // x % -1 = 0; put the result into the destination, $rdx. + let done_label = sink.get_label(); + + let inst = Inst::imm( + OperandSize::from_bytes(*size as u32), + 0, + Writable::from_reg(regs::rdx()), + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done_label); + inst.emit(sink, info, state); + + (Some(do_op), Some(done_label)) + } else { + // Check for integer overflow. + if *size == 8 { + let tmp = tmp.expect("temporary for i64 sdiv"); + + let inst = Inst::imm(OperandSize::Size64, 0x8000000000000000, tmp); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(8, RegMemImm::reg(tmp.to_reg()), regs::rax()); + inst.emit(sink, info, state); + } else { + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0x80000000), regs::rax()); + inst.emit(sink, info, state); + } + + // If not equal, jump over the trap. + let inst = Inst::trap_if(CC::Z, TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + + (Some(do_op), None) + } + } else { + (None, None) + }; + + if let Some(do_op) = do_op { + sink.bind_label(do_op); + } + + assert!( + *size > 1, + "CheckedDivOrRemSeq for i8 is not yet implemented" + ); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of rax into rdx, for signed opcodes. + let inst = Inst::sign_extend_data(*size); + inst.emit(sink, info, state); + } else { + // zero for unsigned opcodes. + let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx())); + inst.emit(sink, info, state); + } + + let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor.to_reg())); + inst.emit(sink, info, state); + + // Lowering takes care of moving the result back into the right register, see comment + // there. + + if let Some(done) = done_label { + sink.bind_label(done); + } + } + + Inst::Imm { + dst_is_64, + simm64, + dst, + } => { + let enc_dst = int_reg_enc(dst.to_reg()); + if *dst_is_64 { + if low32_will_sign_extend_to_64(*simm64) { + // Sign-extended move imm32. 
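+                    // With REX.W, C7 /0 id moves a sign-extended 32-bit immediate into the
+                    // 64-bit destination.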
+ emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xC7, + 1, + /* subopcode */ 0, + enc_dst, + RexFlags::set_w(), + ); + sink.put4(*simm64 as u32); + } else { + sink.put1(0x48 | ((enc_dst >> 3) & 1)); + sink.put1(0xB8 | (enc_dst & 7)); + sink.put8(*simm64); + } + } else { + if ((enc_dst >> 3) & 1) == 1 { + sink.put1(0x41); + } + sink.put1(0xB8 | (enc_dst & 7)); + sink.put4(*simm64 as u32); + } + } + + Inst::MovRR { is_64, src, dst } => { + let rex = if *is_64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex); + } + + Inst::MovzxRmR { ext_mode, src, dst } => { + let (opcodes, num_opcodes, mut rex_flags) = match ext_mode { + ExtMode::BL => { + // MOVZBL is (REX.W==0) 0F B6 /r + (0x0FB6, 2, RexFlags::clear_w()) + } + ExtMode::BQ => { + // MOVZBQ is (REX.W==1) 0F B6 /r + // I'm not sure why the Intel manual offers different + // encodings for MOVZBQ than for MOVZBL. AIUI they should + // achieve the same, since MOVZBL is just going to zero out + // the upper half of the destination anyway. + (0x0FB6, 2, RexFlags::set_w()) + } + ExtMode::WL => { + // MOVZWL is (REX.W==0) 0F B7 /r + (0x0FB7, 2, RexFlags::clear_w()) + } + ExtMode::WQ => { + // MOVZWQ is (REX.W==1) 0F B7 /r + (0x0FB7, 2, RexFlags::set_w()) + } + ExtMode::LQ => { + // This is just a standard 32 bit load, and we rely on the + // default zero-extension rule to perform the extension. + // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we + // don't do here, since it's the same encoding size. + // MOV r/m32, r32 is (REX.W==0) 8B /r + (0x8B, 1, RexFlags::clear_w()) + } + }; + + match src { + RegMem::Reg { reg: src } => { + match ext_mode { + ExtMode::BL | ExtMode::BQ => { + // A redundant REX prefix must be emitted for certain register inputs. + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + } + _ => {} + } + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ) + } + + RegMem::Mem { addr: src } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + src, + rex_flags, + ) + } + } + } + + Inst::Mov64MR { src, dst } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x8B, + 1, + dst.to_reg(), + src, + RexFlags::set_w(), + ) + } + + Inst::LoadEffectiveAddress { addr, dst } => { + let amode = addr.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x8D, + 1, + dst.to_reg(), + &amode, + RexFlags::set_w(), + ); + } + + Inst::MovsxRmR { ext_mode, src, dst } => { + let (opcodes, num_opcodes, mut rex_flags) = match ext_mode { + ExtMode::BL => { + // MOVSBL is (REX.W==0) 0F BE /r + (0x0FBE, 2, RexFlags::clear_w()) + } + ExtMode::BQ => { + // MOVSBQ is (REX.W==1) 0F BE /r + (0x0FBE, 2, RexFlags::set_w()) + } + ExtMode::WL => { + // MOVSWL is (REX.W==0) 0F BF /r + (0x0FBF, 2, RexFlags::clear_w()) + } + ExtMode::WQ => { + // MOVSWQ is (REX.W==1) 0F BF /r + (0x0FBF, 2, RexFlags::set_w()) + } + ExtMode::LQ => { + // MOVSLQ is (REX.W==1) 63 /r + (0x63, 1, RexFlags::set_w()) + } + }; + + match src { + RegMem::Reg { reg: src } => { + match ext_mode { + ExtMode::BL | ExtMode::BQ => { + // A redundant REX prefix must be emitted for certain register inputs. 
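+                            // Without any REX prefix, byte-register encodings 4..=7 name
+                            // AH/CH/DH/BH rather than SPL/BPL/SIL/DIL, so the prefix must be
+                            // kept even when all of its payload bits are zero.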
+ let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + } + _ => {} + } + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ) + } + + RegMem::Mem { addr: src } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + src, + rex_flags, + ) + } + } + } + + Inst::MovRM { size, src, dst } => { + let dst = &dst.finalize(state); + + match size { + 1 => { + // This is one of the few places where the presence of a + // redundant REX prefix changes the meaning of the + // instruction. + let mut rex = RexFlags::clear_w(); + + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex.always_emit(); + }; + + // MOV r8, r/m8 is (REX.W==0) 88 /r + emit_std_reg_mem(sink, state, LegacyPrefixes::None, 0x88, 1, *src, dst, rex) + } + + 2 => { + // MOV r16, r/m16 is 66 (REX.W==0) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::_66, + 0x89, + 1, + *src, + dst, + RexFlags::clear_w(), + ) + } + + 4 => { + // MOV r32, r/m32 is (REX.W==0) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x89, + 1, + *src, + dst, + RexFlags::clear_w(), + ) + } + + 8 => { + // MOV r64, r/m64 is (REX.W==1) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x89, + 1, + *src, + dst, + RexFlags::set_w(), + ) + } + + _ => panic!("x64::Inst::Mov_R_M::emit: unreachable"), + } + } + + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } => { + let enc_dst = int_reg_enc(dst.to_reg()); + let subopcode = match kind { + ShiftKind::RotateLeft => 0, + ShiftKind::RotateRight => 1, + ShiftKind::ShiftLeft => 4, + ShiftKind::ShiftRightLogical => 5, + ShiftKind::ShiftRightArithmetic => 7, + }; + + match num_bits { + None => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode + // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode + // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode + // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags); + } + + Some(num_bits) => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode + // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode + // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib + // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib + // When the shift amount is 1, there's an even shorter encoding, but we don't + // bother with that nicety here. 
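+                    // (That shorter form is D0/D1 /subopcode, with the shift count of 1 implied.)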
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags); + sink.put1(*num_bits); + } + } + } + + Inst::XmmRmiReg { opcode, src, dst } => { + let rex = RexFlags::clear_w(); + let prefix = LegacyPrefixes::_66; + if let RegMemImm::Imm { simm32 } = src { + let (opcode_bytes, reg_digit) = match opcode { + SseOpcode::Psllw => (0x0F71, 6), + SseOpcode::Pslld => (0x0F72, 6), + SseOpcode::Psllq => (0x0F73, 6), + SseOpcode::Psraw => (0x0F71, 4), + SseOpcode::Psrad => (0x0F72, 4), + SseOpcode::Psrlw => (0x0F71, 2), + SseOpcode::Psrld => (0x0F72, 2), + SseOpcode::Psrlq => (0x0F73, 2), + _ => panic!("invalid opcode: {}", opcode), + }; + let dst_enc = reg_enc(dst.to_reg()); + emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex); + let imm = (*simm32) + .try_into() + .expect("the immediate must be convertible to a u8"); + sink.put1(imm); + } else { + let opcode_bytes = match opcode { + SseOpcode::Psllw => 0x0FF1, + SseOpcode::Pslld => 0x0FF2, + SseOpcode::Psllq => 0x0FF3, + SseOpcode::Psraw => 0x0FE1, + SseOpcode::Psrad => 0x0FE2, + SseOpcode::Psrlw => 0x0FD1, + SseOpcode::Psrld => 0x0FD2, + SseOpcode::Psrlq => 0x0FD3, + _ => panic!("invalid opcode: {}", opcode), + }; + + match src { + RegMemImm::Reg { reg } => { + emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst.to_reg(), *reg, rex); + } + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode_bytes, + 2, + dst.to_reg(), + addr, + rex, + ); + } + RegMemImm::Imm { .. } => unreachable!(), + } + }; + } + + Inst::CmpRmiR { + size, + src: src_e, + dst: reg_g, + } => { + let mut prefix = LegacyPrefixes::None; + if *size == 2 { + prefix = LegacyPrefixes::_66; + } + + let mut rex = match size { + 8 => RexFlags::set_w(), + 4 | 2 => RexFlags::clear_w(), + 1 => { + let mut rex = RexFlags::clear_w(); + // Here, a redundant REX prefix changes the meaning of the instruction. + let enc_g = int_reg_enc(*reg_g); + if enc_g >= 4 && enc_g <= 7 { + rex.always_emit(); + } + rex + } + _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"), + }; + + match src_e { + RegMemImm::Reg { reg: reg_e } => { + if *size == 1 { + // Check whether the E register forces the use of a redundant REX. + let enc_e = int_reg_enc(*reg_e); + if enc_e >= 4 && enc_e <= 7 { + rex.always_emit(); + } + } + + // Use the swapped operands encoding, to stay consistent with the output of + // gcc/llvm. + let opcode = if *size == 1 { 0x38 } else { 0x39 }; + emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex); + } + + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + // Whereas here we revert to the "normal" G-E ordering. + let opcode = if *size == 1 { 0x3A } else { 0x3B }; + emit_std_reg_mem(sink, state, prefix, opcode, 1, *reg_g, addr, rex); + } + + RegMemImm::Imm { simm32 } => { + // FIXME JRS 2020Feb11: there are shorter encodings for + // cmp $imm, rax/eax/ax/al. + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + + // And also here we use the "normal" G-E ordering. 
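+                    // CMP r/m8, imm8 is 80 /7 ib; CMP r/m{16,32,64}, imm8 is 83 /7 ib;
+                    // CMP r/m{16,32,64}, imm{16,32} is (66) 81 /7 iw/id.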
+ let opcode = if *size == 1 { + 0x80 + } else if use_imm8 { + 0x83 + } else { + 0x81 + }; + + let enc_g = int_reg_enc(*reg_g); + emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex); + emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32); + } + } + } + + Inst::Setcc { cc, dst } => { + let opcode = 0x0f90 + cc.get_enc() as u32; + let mut rex_flags = RexFlags::clear_w(); + rex_flags.always_emit(); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + opcode, + 2, + 0, + reg_enc(dst.to_reg()), + rex_flags, + ); + } + + Inst::Cmove { + size, + cc, + src, + dst: reg_g, + } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("invalid size spec for cmove"), + }; + let opcode = 0x0F40 + cc.get_enc() as u32; + match src { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex_flags); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + 2, + reg_g.to_reg(), + addr, + rex_flags, + ); + } + } + } + + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } => { + // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that + // this doesn't clobber flags. Make sure to not do so here. + let next = sink.get_label(); + + // Jump if cc is *not* set. + one_way_jmp(sink, cc.invert(), next); + + let op = if *is_64 { + SseOpcode::Movsd + } else { + SseOpcode::Movss + }; + let inst = Inst::xmm_unary_rm_r(op, src.clone(), *dst); + inst.emit(sink, info, state); + + sink.bind_label(next); + } + + Inst::Push64 { src } => { + match src { + RegMemImm::Reg { reg } => { + let enc_reg = int_reg_enc(*reg); + let rex = 0x40 | ((enc_reg >> 3) & 1); + if rex != 0x40 { + sink.put1(rex); + } + sink.put1(0x50 | (enc_reg & 7)); + } + + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 6, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + + RegMemImm::Imm { simm32 } => { + if low8_will_sign_extend_to_64(*simm32) { + sink.put1(0x6A); + sink.put1(*simm32 as u8); + } else { + sink.put1(0x68); + sink.put4(*simm32); + } + } + } + } + + Inst::Pop64 { dst } => { + let enc_dst = int_reg_enc(dst.to_reg()); + if enc_dst >= 8 { + // 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant here. + sink.put1(0x41); + } + sink.put1(0x58 + (enc_dst & 7)); + } + + Inst::CallKnown { dest, opcode, .. } => { + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s); + } + sink.put1(0xE8); + // The addend adjusts for the difference between the end of the instruction and the + // beginning of the immediate field. + emit_reloc(sink, state, Reloc::X86CallPCRel4, &dest, -4); + sink.put4(0); + if opcode.is_call() { + let loc = state.cur_srcloc(); + sink.add_call_site(loc, *opcode); + } + } + + Inst::CallUnknown { dest, opcode, .. 
} => { + let start_offset = sink.cur_offset(); + match dest { + RegMem::Reg { reg } => { + let reg_enc = int_reg_enc(*reg); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xFF, + 1, + 2, /*subopcode*/ + reg_enc, + RexFlags::clear_w(), + ); + } + + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 2, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + } + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s); + } + if opcode.is_call() { + let loc = state.cur_srcloc(); + sink.add_call_site(loc, *opcode); + } + } + + Inst::Ret {} => sink.put1(0xC3), + + Inst::JmpKnown { dst } => { + let br_start = sink.cur_offset(); + let br_disp_off = br_start + 1; + let br_end = br_start + 5; + + sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32); + sink.add_uncond_branch(br_start, br_end, *dst); + + sink.put1(0xE9); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpIf { cc, taken } => { + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + + sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32); + // Since this is not a terminator, don't enroll in the branch inversion mechanism. + + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpCond { + cc, + taken, + not_taken, + } => { + // If taken. + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + let cond_end = cond_start + 6; + + sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32); + let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00]; + sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]); + + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + // Placeholder for the label value. + sink.put4(0x0); + + // If not taken. + let uncond_start = sink.cur_offset(); + let uncond_disp_off = uncond_start + 1; + let uncond_end = uncond_start + 5; + + sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32); + sink.add_uncond_branch(uncond_start, uncond_end, *not_taken); + + sink.put1(0xE9); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpUnknown { target } => { + match target { + RegMem::Reg { reg } => { + let reg_enc = int_reg_enc(*reg); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xFF, + 1, + 4, /*subopcode*/ + reg_enc, + RexFlags::clear_w(), + ); + } + + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 4, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + } + } + + Inst::JmpTableSeq { + idx, + tmp1, + tmp2, + ref targets, + default_target, + .. + } => { + // This sequence is *one* instruction in the vcode, and is expanded only here at + // emission time, because we cannot allow the regalloc to insert spills/reloads in + // the middle; we depend on hardcoded PC-rel addressing below. + // + // We don't have to worry about emitting islands, because the only label-use type has a + // maximum range of 2 GB. If we later consider using shorter-range label references, + // this will need to be revisited. + + // Save index in a tmp (the live range of ridx only goes to start of this + // sequence; rtmp1 or rtmp2 may overwrite it). 
+ + // We generate the following sequence: + // ;; generated by lowering: cmp #jmp_table_size, %idx + // jnb $default_target + // movl %idx, %tmp2 + // lea start_of_jump_table_offset(%rip), %tmp1 + // movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4 + // addq %tmp2, %tmp1 + // j *%tmp1 + // $start_of_jump_table: + // -- jump table entries + one_way_jmp(sink, CC::NB, *default_target); // idx unsigned >= jmp table size + + // Copy the index (and make sure to clear the high 32-bits lane of tmp2). + let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(*idx), *tmp2); + inst.emit(sink, info, state); + + // Load base address of jump table. + let start_of_jumptable = sink.get_label(); + let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), *tmp1); + inst.emit(sink, info, state); + + // Load value out of the jump table. It's a relative offset to the target block, so it + // might be negative; use a sign-extension. + let inst = Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg_reg_shift(0, tmp1.to_reg(), tmp2.to_reg(), 2)), + *tmp2, + ); + inst.emit(sink, info, state); + + // Add base of jump table to jump-table-sourced block offset. + let inst = Inst::alu_rmi_r( + true, /* is_64 */ + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + *tmp1, + ); + inst.emit(sink, info, state); + + // Branch to computed address. + let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg())); + inst.emit(sink, info, state); + + // Emit jump table (table of 32-bit offsets). + sink.bind_label(start_of_jumptable); + let jt_off = sink.cur_offset(); + for &target in targets.iter() { + let word_off = sink.cur_offset(); + // off_into_table is an addend here embedded in the label to be later patched at + // the end of codegen. The offset is initially relative to this jump table entry; + // with the extra addend, it'll be relative to the jump table's start, after + // patching. + let off_into_table = word_off - jt_off; + sink.use_label_at_offset(word_off, target, LabelUse::PCRel32); + sink.put4(off_into_table); + } + } + + Inst::TrapIf { cc, trap_code } => { + let else_label = sink.get_label(); + + // Jump over if the invert of CC is set (i.e. CC is not set). + one_way_jmp(sink, cc.invert(), else_label); + + // Trap! 
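+                // `Inst::trap` lowers to `ud2` (0F 0B); its emission also records the trap
+                // code with the sink (see the `Inst::Ud2` arm below).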
+ let inst = Inst::trap(*trap_code); + inst.emit(sink, info, state); + + sink.bind_label(else_label); + } + + Inst::XmmUnaryRmR { + op, + src: src_e, + dst: reg_g, + } => { + let rex = RexFlags::clear_w(); + + let (prefix, opcode, num_opcodes) = match op { + SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2), + SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2), + SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3), + SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3), + SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3), + SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2), + SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2), + SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2), + SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg( + sink, + prefix, + opcode, + num_opcodes, + reg_g.to_reg(), + *reg_e, + rex, + ); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + num_opcodes, + reg_g.to_reg(), + addr, + rex, + ); + } + }; + } + + Inst::XmmRmR { + op, + src: src_e, + dst: reg_g, + } => { + let rex = RexFlags::clear_w(); + let (prefix, opcode, length) = match op { + SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2), + SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2), + SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2), + SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2), + SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2), + SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), + SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), + SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2), + SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2), + SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), + SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), + SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2), + SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2), + SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2), + SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2), + SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2), + SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2), + SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), + SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2), + SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2), + SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2), + SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2), + SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), + SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), + SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2), + 
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), + SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), + SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), + SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2), + SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2), + SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2), + SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2), + SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), + SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2), + SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), + SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), + SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2), + SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2), + SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2), + SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2), + SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3), + SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2), + SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2), + SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2), + SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3), + SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3), + SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2), + SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3), + SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2), + SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3), + SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3), + SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3), + SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2), + SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3), + SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2), + SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3), + SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3), + SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3), + SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2), + SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2), + SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2), + SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3), + SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2), + SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2), + SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2), + SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2), + SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2), + SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2), + SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2), + SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2), + SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), + SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), + SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), + SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2), + SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2), + SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2), + SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + length, + reg_g.to_reg(), + addr, + rex, + ); + } + } + } + + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } => { + // Generates the following sequence: + // cmpss/cmpsd %lhs, %rhs_dst + // jnz do_min_max + // jp propagate_nan + // + // ;; 
ordered and equal: propagate the sign bit (for -0 vs 0): + // {and,or}{ss,sd} %lhs, %rhs_dst + // j done + // + // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the + // ;; NaN value is returned), we add both inputs. + // propagate_nan: + // add{ss,sd} %lhs, %rhs_dst + // j done + // + // do_min_max: + // {min,max}{ss,sd} %lhs, %rhs_dst + // + // done: + let done = sink.get_label(); + let propagate_nan = sink.get_label(); + let do_min_max = sink.get_label(); + + let (add_op, cmp_op, and_op, or_op, min_max_op) = match size { + OperandSize::Size32 => ( + SseOpcode::Addss, + SseOpcode::Ucomiss, + SseOpcode::Andps, + SseOpcode::Orps, + if *is_min { + SseOpcode::Minss + } else { + SseOpcode::Maxss + }, + ), + OperandSize::Size64 => ( + SseOpcode::Addsd, + SseOpcode::Ucomisd, + SseOpcode::Andpd, + SseOpcode::Orpd, + if *is_min { + SseOpcode::Minsd + } else { + SseOpcode::Maxsd + }, + ), + }; + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(*lhs), rhs_dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NZ, do_min_max); + one_way_jmp(sink, CC::P, propagate_nan); + + // Ordered and equal. The operands are bit-identical unless they are zero + // and negative zero. These instructions merge the sign bits in that + // case, and are no-ops otherwise. + let op = if *is_min { or_op } else { and_op }; + let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + // x86's min/max are not symmetric; if either operand is a NaN, they return the + // read-only operand: perform an addition between the two operands, which has the + // desired NaN propagation effects. + sink.bind_label(propagate_nan); + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::P, done); + + sink.bind_label(do_min_max); + let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + sink.bind_label(done); + } + + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64, + } => { + let (prefix, opcode, len) = match op { + SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), + SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2), + SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2), + SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2), + SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3), + SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), + SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), + SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), + SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3), + SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2), + SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3), + SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + let rex = if *is64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + let regs_swapped = match *op { + // These opcodes (and not the SSE2 version of PEXTRW) flip the operand + // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field. + SseOpcode::Pextrb | SseOpcode::Pextrd => true, + // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg, + // `src` in ModRM's r/m field. 
+ _ => false, + }; + match src { + RegMem::Reg { reg } => { + if regs_swapped { + emit_std_reg_reg(sink, prefix, opcode, len, *reg, dst.to_reg(), rex); + } else { + emit_std_reg_reg(sink, prefix, opcode, len, dst.to_reg(), *reg, rex); + } + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + assert!( + !regs_swapped, + "No existing way to encode a mem argument in the ModRM r/m field." + ); + emit_std_reg_mem(sink, state, prefix, opcode, len, dst.to_reg(), addr, rex); + } + } + sink.put1(*imm); + } + + Inst::XmmLoadConst { src, dst, ty } => { + let load_offset = Amode::rip_relative(sink.get_label_for_constant(*src)); + let load = Inst::load(*ty, load_offset, *dst, ExtKind::None); + load.emit(sink, info, state); + } + + Inst::XmmUninitializedValue { .. } => { + // This instruction format only exists to declare a register as a `def`; no code is + // emitted. + } + + Inst::XmmMovRM { op, src, dst } => { + let (prefix, opcode) = match op { + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + let dst = &dst.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + 2, + *src, + dst, + RexFlags::clear_w(), + ); + } + + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } => { + let (prefix, opcode, dst_first) = match op { + SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), + SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), + // Movd and movq use the same opcode; the presence of the REX prefix (set below) + // actually determines which is used. + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), + SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true), + SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true), + SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = match dst_size { + OperandSize::Size32 => RexFlags::clear_w(), + OperandSize::Size64 => RexFlags::set_w(), + }; + + let (src, dst) = if dst_first { + (dst.to_reg(), *src) + } else { + (*src, dst.to_reg()) + }; + + emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex); + } + + Inst::GprToXmm { + op, + src: src_e, + dst: reg_g, + src_size, + } => { + let (prefix, opcode) = match op { + // Movd and movq use the same opcode; the presence of the REX prefix (set below) + // actually determines which is used. 
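+                // MOVD (32-bit GPR to XMM) is 66 0F 6E /r; MOVQ (64-bit GPR to XMM) is the
+                // same encoding with REX.W set.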
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E), + SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), + SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = match *src_size { + OperandSize::Size32 => RexFlags::clear_w(), + OperandSize::Size64 => RexFlags::set_w(), + }; + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcode, 2, reg_g.to_reg(), addr, rex); + } + } + } + + Inst::XmmCmpRmR { op, src, dst } => { + let rex = RexFlags::clear_w(); + let (prefix, opcode, len) = match op { + SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3), + SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2), + SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2), + _ => unimplemented!("Emit xmm cmp rm r"), + }; + + match src { + RegMem::Reg { reg } => { + emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcode, len, *dst, addr, rex); + } + } + } + + Inst::CvtUint64ToFloatSeq { + to_f64, + src, + dst, + tmp_gpr1, + tmp_gpr2, + } => { + // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a + // different sequence. + // + // Emit the following sequence: + // + // cmp 0, %src + // jl handle_negative + // + // ;; handle positive, which can't overflow + // cvtsi2sd/cvtsi2ss %src, %dst + // j done + // + // ;; handle negative: see below for an explanation of what it's doing. + // handle_negative: + // mov %src, %tmp_gpr1 + // shr $1, %tmp_gpr1 + // mov %src, %tmp_gpr2 + // and $1, %tmp_gpr2 + // or %tmp_gpr1, %tmp_gpr2 + // cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst + // addsd/addss %dst, %dst + // + // done: + + assert_ne!(src, tmp_gpr1); + assert_ne!(src, tmp_gpr2); + assert_ne!(tmp_gpr1, tmp_gpr2); + + let handle_negative = sink.get_label(); + let done = sink.get_label(); + + // If x seen as a signed int64 is not negative, a signed-conversion will do the right + // thing. + // TODO use tst src, src here. + let inst = Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::L, handle_negative); + + // Handle a positive int64, which is the "easy" case: a signed conversion will do the + // right thing. + emit_signed_cvt(sink, info, state, src.to_reg(), *dst, *to_f64); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + sink.bind_label(handle_negative); + + // Divide x by two to get it in range for the signed conversion, keep the LSB, and + // scale it back up on the FP side. 
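+            // Concretely: convert ((src >> 1) | (src & 1)), a "round to odd" halving that
+            // preserves correct rounding, then double the result with an FP add.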
+ let inst = Inst::gen_move(*tmp_gpr1, src.to_reg(), types::I64); + inst.emit(sink, info, state); + + // tmp_gpr1 := src >> 1 + let inst = Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(1), *tmp_gpr1); + inst.emit(sink, info, state); + + let inst = Inst::gen_move(*tmp_gpr2, src.to_reg(), types::I64); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, /* 64bits */ + AluRmiROpcode::And, + RegMemImm::imm(1), + *tmp_gpr2, + ); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, /* 64bits */ + AluRmiROpcode::Or, + RegMemImm::reg(tmp_gpr1.to_reg()), + *tmp_gpr2, + ); + inst.emit(sink, info, state); + + emit_signed_cvt(sink, info, state, tmp_gpr2.to_reg(), *dst, *to_f64); + + let add_op = if *to_f64 { + SseOpcode::Addsd + } else { + SseOpcode::Addss + }; + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst); + inst.emit(sink, info, state); + + sink.bind_label(done); + } + + Inst::CvtFloatToSintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } => { + // Emits the following common sequence: + // + // cvttss2si/cvttsd2si %src, %dst + // cmp %dst, 1 + // jno done + // + // Then, for saturating conversions: + // + // ;; check for NaN + // cmpss/cmpsd %src, %src + // jnp not_nan + // xor %dst, %dst + // + // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is + // ;; already in %dst. + // xorpd %tmp_xmm, %tmp_xmm + // cmpss/cmpsd %src, %tmp_xmm + // jnb done + // mov/movaps $INT_MAX, %dst + // + // done: + // + // Then, for non-saturating conversions: + // + // ;; check for NaN + // cmpss/cmpsd %src, %src + // jnp not_nan + // ud2 trap BadConversionToInteger + // + // ;; check if INT_MIN was the correct result, against a magic constant: + // not_nan: + // movaps/mov $magic, %tmp_gpr + // movq/movd %tmp_gpr, %tmp_xmm + // cmpss/cmpsd %tmp_xmm, %src + // jnb/jnbe $check_positive + // ud2 trap IntegerOverflow + // + // ;; if positive, it was a real overflow + // check_positive: + // xorpd %tmp_xmm, %tmp_xmm + // cmpss/cmpsd %src, %tmp_xmm + // jnb done + // ud2 trap IntegerOverflow + // + // done: + + let src = src.to_reg(); + + let (cast_op, cmp_op, trunc_op) = match src_size { + OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si), + OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si), + }; + + let done = sink.get_label(); + let not_nan = sink.get_label(); + + // The truncation. + let inst = Inst::xmm_to_gpr(trunc_op, src, *dst, *dst_size); + inst.emit(sink, info, state); + + // Compare against 1, in case of overflow the dst operand was INT_MIN. + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(1), dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NO, done); // no overflow => done + + // Check for NaN. + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN + + if *is_saturating { + // For NaN, emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + sink.bind_label(not_nan); + + // If the input was positive, saturate to INT_MAX. + + // Zero out tmp_xmm. 
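+                // xorpd %tmp_xmm, %tmp_xmm yields +0.0, which the comparison below uses to
+                // decide whether the (non-NaN) input was on the negative or positive side.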
+ let inst = + Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, info, state); + + // Jump if >= to done. + one_way_jmp(sink, CC::NB, done); + + // Otherwise, put INT_MAX. + if *dst_size == OperandSize::Size64 { + let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, *dst); + inst.emit(sink, info, state); + } else { + let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, *dst); + inst.emit(sink, info, state); + } + } else { + let check_positive = sink.get_label(); + + let inst = Inst::trap(TrapCode::BadConversionToInteger); + inst.emit(sink, info, state); + + // Check if INT_MIN was the correct result: determine the smallest floating point + // number that would convert to INT_MIN, put it in a temporary register, and compare + // against the src register. + // If the src register is less (or in some cases, less-or-equal) than the threshold, + // trap! + + sink.bind_label(not_nan); + + let mut no_overflow_cc = CC::NB; // >= + let output_bits = dst_size.to_bits(); + match *src_size { + OperandSize::Size32 => { + let cst = Ieee32::pow2(output_bits - 1).neg().bits(); + let inst = Inst::imm(OperandSize::Size32, cst as u64, *tmp_gpr); + inst.emit(sink, info, state); + } + OperandSize::Size64 => { + // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, + // so there are values less than -2^(N-1) that convert correctly to INT_MIN. + let cst = if output_bits < 64 { + no_overflow_cc = CC::NBE; // > + Ieee64::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee64::pow2(output_bits - 1).neg() + }; + let inst = Inst::imm(OperandSize::Size64, cst.bits(), *tmp_gpr); + inst.emit(sink, info, state); + } + } + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src); + inst.emit(sink, info, state); + + // jump over trap if src >= or > threshold + one_way_jmp(sink, no_overflow_cc, check_positive); + + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + + // If positive, it was a real overflow. + + sink.bind_label(check_positive); + + // Zero out the tmp_xmm register. + let inst = + Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src + + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + sink.bind_label(done); + } + + Inst::CvtFloatToUintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } => { + // The only difference in behavior between saturating and non-saturating is how we + // handle errors. 
Emits the following sequence: + // + // movaps/mov 2**(int_width - 1), %tmp_gpr + // movq/movd %tmp_gpr, %tmp_xmm + // cmpss/cmpsd %tmp_xmm, %src + // jnb is_large + // + // ;; check for NaN inputs + // jnp not_nan + // -- non-saturating: ud2 trap BadConversionToInteger + // -- saturating: xor %dst, %dst; j done + // + // not_nan: + // cvttss2si/cvttsd2si %src, %dst + // cmp 0, %dst + // jnl done + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: xor %dst, %dst; j done + // + // is_large: + // subss/subsd %tmp_xmm, %src ; <-- we clobber %src here + // cvttss2si/cvttss2sd %tmp_x, %dst + // cmp 0, %dst + // jnl next_is_large + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: movaps $UINT_MAX, %dst; j done + // + // next_is_large: + // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers + // + // done: + + assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!"); + + let (sub_op, cast_op, cmp_op, trunc_op) = if *src_size == OperandSize::Size64 { + ( + SseOpcode::Subsd, + SseOpcode::Movq, + SseOpcode::Ucomisd, + SseOpcode::Cvttsd2si, + ) + } else { + ( + SseOpcode::Subss, + SseOpcode::Movd, + SseOpcode::Ucomiss, + SseOpcode::Cvttss2si, + ) + }; + + let done = sink.get_label(); + + let cst = if *src_size == OperandSize::Size64 { + Ieee64::pow2(dst_size.to_bits() - 1).bits() + } else { + Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64 + }; + + let inst = Inst::imm(*src_size, cst, *tmp_gpr); + inst.emit(sink, info, state); + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src.to_reg()); + inst.emit(sink, info, state); + + let handle_large = sink.get_label(); + one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold + + let not_nan = sink.get_label(); + one_way_jmp(sink, CC::NP, not_nan); // jump over trap if not NaN + + if *is_saturating { + // Emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + // Trap. + let inst = Inst::trap(TrapCode::BadConversionToInteger); + inst.emit(sink, info, state); + } + + sink.bind_label(not_nan); + + // Actual truncation for small inputs: if the result is not positive, then we had an + // overflow. + + let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done + + if *is_saturating { + // The input was "small" (< 2**(width -1)), so the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + // Trap. + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + // Now handle large inputs. 
+ + sink.bind_label(handle_large); + + let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src); + inst.emit(sink, info, state); + + let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg()); + inst.emit(sink, info, state); + + let next_is_large = sink.get_label(); + one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large + + if *is_saturating { + // The input was "large" (>= 2**(width -1)), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + let inst = Inst::imm( + OperandSize::Size64, + if *dst_size == OperandSize::Size64 { + u64::max_value() + } else { + u32::max_value() as u64 + }, + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + sink.bind_label(next_is_large); + + if *dst_size == OperandSize::Size64 { + let inst = Inst::imm(OperandSize::Size64, 1 << 63, *tmp_gpr); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp_gpr.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + } else { + let inst = + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(1 << 31), *dst); + inst.emit(sink, info, state); + } + + sink.bind_label(done); + } + + Inst::LoadExtName { dst, name, offset } => { + // The full address can be encoded in the register, with a relocation. + // Generates: movabsq $name, %dst + let enc_dst = int_reg_enc(dst.to_reg()); + sink.put1(0x48 | ((enc_dst >> 3) & 1)); + sink.put1(0xB8 | (enc_dst & 7)); + emit_reloc(sink, state, Reloc::Abs8, name, *offset); + if info.flags().emit_all_ones_funcaddrs() { + sink.put8(u64::max_value()); + } else { + sink.put8(0); + } + } + + Inst::LockCmpxchg { ty, src, dst } => { + // lock cmpxchg{b,w,l,q} %src, (dst) + // Note that 0xF0 is the Lock prefix. + let (prefix, rex, opcodes) = match *ty { + types::I8 => { + let mut rex_flags = RexFlags::clear_w(); + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + (LegacyPrefixes::_F0, rex_flags, 0x0FB0) + } + types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), + types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1), + types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1), + _ => unreachable!(), + }; + let amode = dst.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcodes, 2, *src, &amode, rex); + } + + Inst::AtomicRmwSeq { ty, op } => { + // Emit this: + // + // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value + // again: + // movq %rax, %r11 // rax = old value, r11 = old value + // `op`q %r10, %r11 // rax = old value, r11 = new value + // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value + // jnz again // If this is taken, rax will have a "revised" old value + // + // Operand conventions: + // IN: %r9 (addr), %r10 (2nd arg for `op`) + // OUT: %rax (old value), %r11 (trashed), %rflags (trashed) + // + // In the case where the operation is 'xchg', the "`op`q" instruction is instead + // movq %r10, %r11 + // so that we simply write in the destination, the "2nd arg for `op`". 
+ let rax = regs::rax(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let rax_w = Writable::from_reg(rax); + let r11_w = Writable::from_reg(r11); + let amode = Amode::imm_reg(0, r9); + let again_label = sink.get_label(); + + // mov{zbq,zwq,zlq,q} (%r9), %rax + // No need to call `add_trap` here, since the `i1` emit will do that. + let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend); + i1.emit(sink, info, state); + + // again: + sink.bind_label(again_label); + + // movq %rax, %r11 + let i2 = Inst::mov_r_r(true, rax, r11_w); + i2.emit(sink, info, state); + + // opq %r10, %r11 + let r10_rmi = RegMemImm::reg(r10); + let i3 = if *op == inst_common::AtomicRmwOp::Xchg { + Inst::mov_r_r(true, r10, r11_w) + } else { + let alu_op = match op { + inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add, + inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub, + inst_common::AtomicRmwOp::And => AluRmiROpcode::And, + inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or, + inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor, + inst_common::AtomicRmwOp::Xchg => unreachable!(), + }; + Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w) + }; + i3.emit(sink, info, state); + + // lock cmpxchg{b,w,l,q} %r11, (%r9) + // No need to call `add_trap` here, since the `i4` emit will do that. + let i4 = Inst::LockCmpxchg { + ty: *ty, + src: r11, + dst: amode.into(), + }; + i4.emit(sink, info, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + + Inst::Fence { kind } => { + sink.put1(0x0F); + sink.put1(0xAE); + match kind { + FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0 + FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8 + FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8 + } + } + + Inst::Hlt => { + sink.put1(0xcc); + } + + Inst::Ud2 { trap_code } => { + let cur_srcloc = state.cur_srcloc(); + sink.add_trap(cur_srcloc, *trap_code); + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s); + } + sink.put1(0x0f); + sink.put1(0x0b); + } + + Inst::VirtualSPOffsetAdj { offset } => { + debug!( + "virtual sp offset adjusted by {} -> {}", + offset, + state.virtual_sp_offset + offset + ); + state.virtual_sp_offset += offset; + } + + Inst::Nop { len } => { + // These encodings can all be found in Intel's architecture manual, at the NOP + // instruction description. + let mut len = *len; + while len != 0 { + let emitted = u8::min(len, 9); + match emitted { + 0 => {} + 1 => sink.put1(0x90), // NOP + 2 => { + // 66 NOP + sink.put1(0x66); + sink.put1(0x90); + } + 3 => { + // NOP [EAX] + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x00); + } + 4 => { + // NOP 0(EAX), with 0 a 1-byte immediate. + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x40); + sink.put1(0x00); + } + 5 => { + // NOP [EAX, EAX, 1] + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x44); + sink.put1(0x00); + sink.put1(0x00); + } + 6 => { + // 66 NOP [EAX, EAX, 1] + sink.put1(0x66); + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x44); + sink.put1(0x00); + sink.put1(0x00); + } + 7 => { + // NOP 0[EAX], but 0 is a 4 bytes immediate. + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x80); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + 8 => { + // NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate. 
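+                        // Encoded as 0F 1F 84 00 00 00 00 00.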
+ sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x84); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + 9 => { + // 66 NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate. + sink.put1(0x66); + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x84); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + _ => unreachable!(), + } + len -= emitted; + } + } + + Inst::EpiloguePlaceholder => { + // Generate no code. + } + } + + state.clear_post_insn(); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs new file mode 100644 index 0000000000..06092d498a --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs @@ -0,0 +1,3593 @@ +//! Tests for the emitter +//! +//! See comments at the top of `fn x64_emit` for advice on how to create reliable test cases. +//! +//! to see stdout: cargo test -- --nocapture +//! +//! for this specific case, as of 24 Aug 2020: +//! +//! cd to the top of your wasmtime tree, then: +//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \ +//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \ +//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \ +//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit + +use super::*; +use crate::isa::test_utils; +use crate::isa::x64; +use alloc::vec::Vec; + +#[test] +fn test_x64_emit() { + let rax = regs::rax(); + let rbx = regs::rbx(); + let rcx = regs::rcx(); + let rdx = regs::rdx(); + let rsi = regs::rsi(); + let rdi = regs::rdi(); + let rsp = regs::rsp(); + let rbp = regs::rbp(); + let r8 = regs::r8(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let r12 = regs::r12(); + let r13 = regs::r13(); + let r14 = regs::r14(); + let r15 = regs::r15(); + + let xmm0 = regs::xmm0(); + let xmm1 = regs::xmm1(); + let xmm2 = regs::xmm2(); + let xmm3 = regs::xmm3(); + let xmm4 = regs::xmm4(); + let xmm5 = regs::xmm5(); + let xmm6 = regs::xmm6(); + let xmm7 = regs::xmm7(); + let xmm8 = regs::xmm8(); + let xmm9 = regs::xmm9(); + let xmm10 = regs::xmm10(); + let xmm11 = regs::xmm11(); + let xmm12 = regs::xmm12(); + let xmm13 = regs::xmm13(); + let xmm14 = regs::xmm14(); + let xmm15 = regs::xmm15(); + + // And Writable<> versions of the same: + let w_rax = Writable::<Reg>::from_reg(rax); + let w_rbx = Writable::<Reg>::from_reg(rbx); + let w_rcx = Writable::<Reg>::from_reg(rcx); + let w_rdx = Writable::<Reg>::from_reg(rdx); + let w_rsi = Writable::<Reg>::from_reg(rsi); + let w_rdi = Writable::<Reg>::from_reg(rdi); + let _w_rsp = Writable::<Reg>::from_reg(rsp); + let _w_rbp = Writable::<Reg>::from_reg(rbp); + let w_r8 = Writable::<Reg>::from_reg(r8); + let w_r9 = Writable::<Reg>::from_reg(r9); + let _w_r10 = Writable::<Reg>::from_reg(r10); + let w_r11 = Writable::<Reg>::from_reg(r11); + let w_r12 = Writable::<Reg>::from_reg(r12); + let w_r13 = Writable::<Reg>::from_reg(r13); + let w_r14 = Writable::<Reg>::from_reg(r14); + let w_r15 = Writable::<Reg>::from_reg(r15); + + let w_xmm0 = Writable::<Reg>::from_reg(xmm0); + let w_xmm1 = Writable::<Reg>::from_reg(xmm1); + let w_xmm2 = Writable::<Reg>::from_reg(xmm2); + let w_xmm3 = Writable::<Reg>::from_reg(xmm3); + let w_xmm4 = Writable::<Reg>::from_reg(xmm4); + let w_xmm5 = Writable::<Reg>::from_reg(xmm5); + let w_xmm6 = Writable::<Reg>::from_reg(xmm6); + let w_xmm7 = 
Writable::<Reg>::from_reg(xmm7); + let w_xmm8 = Writable::<Reg>::from_reg(xmm8); + let w_xmm9 = Writable::<Reg>::from_reg(xmm9); + let w_xmm10 = Writable::<Reg>::from_reg(xmm10); + let w_xmm11 = Writable::<Reg>::from_reg(xmm11); + let w_xmm12 = Writable::<Reg>::from_reg(xmm12); + let w_xmm13 = Writable::<Reg>::from_reg(xmm13); + let w_xmm14 = Writable::<Reg>::from_reg(xmm14); + let w_xmm15 = Writable::<Reg>::from_reg(xmm15); + + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // ======================================================== + // Cases aimed at checking Addr-esses: IR (Imm + Reg) + // + // These are just a bunch of loads with all supported (by the emitter) + // permutations of address formats. + // + // Addr_IR, offset zero + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rax), w_rdi), + "488B38", + "movq 0(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rbx), w_rdi), + "488B3B", + "movq 0(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rcx), w_rdi), + "488B39", + "movq 0(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rdx), w_rdi), + "488B3A", + "movq 0(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rbp), w_rdi), + "488B7D00", + "movq 0(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rsp), w_rdi), + "488B3C24", + "movq 0(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rsi), w_rdi), + "488B3E", + "movq 0(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rdi), w_rdi), + "488B3F", + "movq 0(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r8), w_rdi), + "498B38", + "movq 0(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r9), w_rdi), + "498B39", + "movq 0(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r10), w_rdi), + "498B3A", + "movq 0(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r11), w_rdi), + "498B3B", + "movq 0(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r12), w_rdi), + "498B3C24", + "movq 0(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r13), w_rdi), + "498B7D00", + "movq 0(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r14), w_rdi), + "498B3E", + "movq 0(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r15), w_rdi), + "498B3F", + "movq 0(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset max simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rax), w_rdi), + "488B787F", + "movq 127(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rbx), w_rdi), + "488B7B7F", + "movq 127(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rcx), w_rdi), + "488B797F", + "movq 127(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rdx), w_rdi), + "488B7A7F", + "movq 127(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rbp), w_rdi), + "488B7D7F", + "movq 127(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rsp), w_rdi), + "488B7C247F", + "movq 127(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rsi), w_rdi), + "488B7E7F", + "movq 127(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rdi), w_rdi), + "488B7F7F", + "movq 127(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r8), w_rdi), + "498B787F", + "movq 
127(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r9), w_rdi), + "498B797F", + "movq 127(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r10), w_rdi), + "498B7A7F", + "movq 127(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r11), w_rdi), + "498B7B7F", + "movq 127(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r12), w_rdi), + "498B7C247F", + "movq 127(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r13), w_rdi), + "498B7D7F", + "movq 127(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r14), w_rdi), + "498B7E7F", + "movq 127(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r15), w_rdi), + "498B7F7F", + "movq 127(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset min simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rax), w_rdi), + "488B7880", + "movq -128(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbx), w_rdi), + "488B7B80", + "movq -128(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rcx), w_rdi), + "488B7980", + "movq -128(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdx), w_rdi), + "488B7A80", + "movq -128(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbp), w_rdi), + "488B7D80", + "movq -128(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsp), w_rdi), + "488B7C2480", + "movq -128(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsi), w_rdi), + "488B7E80", + "movq -128(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdi), w_rdi), + "488B7F80", + "movq -128(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r8), w_rdi), + "498B7880", + "movq -128(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r9), w_rdi), + "498B7980", + "movq -128(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r10), w_rdi), + "498B7A80", + "movq -128(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r11), w_rdi), + "498B7B80", + "movq -128(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r12), w_rdi), + "498B7C2480", + "movq -128(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r13), w_rdi), + "498B7D80", + "movq -128(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r14), w_rdi), + "498B7E80", + "movq -128(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r15), w_rdi), + "498B7F80", + "movq -128(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset smallest positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rax), w_rdi), + "488BB880000000", + "movq 128(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rbx), w_rdi), + "488BBB80000000", + "movq 128(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rcx), w_rdi), + "488BB980000000", + "movq 128(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rdx), w_rdi), + "488BBA80000000", + "movq 128(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rbp), 
w_rdi), + "488BBD80000000", + "movq 128(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rsp), w_rdi), + "488BBC2480000000", + "movq 128(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rsi), w_rdi), + "488BBE80000000", + "movq 128(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rdi), w_rdi), + "488BBF80000000", + "movq 128(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r8), w_rdi), + "498BB880000000", + "movq 128(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r9), w_rdi), + "498BB980000000", + "movq 128(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r10), w_rdi), + "498BBA80000000", + "movq 128(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r11), w_rdi), + "498BBB80000000", + "movq 128(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r12), w_rdi), + "498BBC2480000000", + "movq 128(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r13), w_rdi), + "498BBD80000000", + "movq 128(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r14), w_rdi), + "498BBE80000000", + "movq 128(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r15), w_rdi), + "498BBF80000000", + "movq 128(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset smallest negative simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rax), w_rdi), + "488BB87FFFFFFF", + "movq -129(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbx), w_rdi), + "488BBB7FFFFFFF", + "movq -129(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rcx), w_rdi), + "488BB97FFFFFFF", + "movq -129(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdx), w_rdi), + "488BBA7FFFFFFF", + "movq -129(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbp), w_rdi), + "488BBD7FFFFFFF", + "movq -129(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsp), w_rdi), + "488BBC247FFFFFFF", + "movq -129(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsi), w_rdi), + "488BBE7FFFFFFF", + "movq -129(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdi), w_rdi), + "488BBF7FFFFFFF", + "movq -129(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r8), w_rdi), + "498BB87FFFFFFF", + "movq -129(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r9), w_rdi), + "498BB97FFFFFFF", + "movq -129(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r10), w_rdi), + "498BBA7FFFFFFF", + "movq -129(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r11), w_rdi), + "498BBB7FFFFFFF", + "movq -129(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r12), w_rdi), + "498BBC247FFFFFFF", + "movq -129(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r13), w_rdi), + "498BBD7FFFFFFF", + "movq -129(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r14), w_rdi), + "498BBE7FFFFFFF", + "movq -129(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r15), w_rdi), + "498BBF7FFFFFFF", + "movq 
-129(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset large positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rax), w_rdi), + "488BB877207317", + "movq 393420919(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbx), w_rdi), + "488BBB77207317", + "movq 393420919(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rcx), w_rdi), + "488BB977207317", + "movq 393420919(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdx), w_rdi), + "488BBA77207317", + "movq 393420919(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbp), w_rdi), + "488BBD77207317", + "movq 393420919(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsp), w_rdi), + "488BBC2477207317", + "movq 393420919(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsi), w_rdi), + "488BBE77207317", + "movq 393420919(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdi), w_rdi), + "488BBF77207317", + "movq 393420919(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r8), w_rdi), + "498BB877207317", + "movq 393420919(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r9), w_rdi), + "498BB977207317", + "movq 393420919(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r10), w_rdi), + "498BBA77207317", + "movq 393420919(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r11), w_rdi), + "498BBB77207317", + "movq 393420919(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r12), w_rdi), + "498BBC2477207317", + "movq 393420919(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r13), w_rdi), + "498BBD77207317", + "movq 393420919(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r14), w_rdi), + "498BBE77207317", + "movq 393420919(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r15), w_rdi), + "498BBF77207317", + "movq 393420919(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset large negative simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rax), w_rdi), + "488BB8D9A6BECE", + "movq -826366247(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbx), w_rdi), + "488BBBD9A6BECE", + "movq -826366247(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rcx), w_rdi), + "488BB9D9A6BECE", + "movq -826366247(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdx), w_rdi), + "488BBAD9A6BECE", + "movq -826366247(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbp), w_rdi), + "488BBDD9A6BECE", + "movq -826366247(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsp), w_rdi), + "488BBC24D9A6BECE", + "movq -826366247(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsi), w_rdi), + "488BBED9A6BECE", + "movq -826366247(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdi), w_rdi), + "488BBFD9A6BECE", + "movq -826366247(%rdi), %rdi", + )); + insns.push(( + 
Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r8), w_rdi), + "498BB8D9A6BECE", + "movq -826366247(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r9), w_rdi), + "498BB9D9A6BECE", + "movq -826366247(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r10), w_rdi), + "498BBAD9A6BECE", + "movq -826366247(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r11), w_rdi), + "498BBBD9A6BECE", + "movq -826366247(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r12), w_rdi), + "498BBC24D9A6BECE", + "movq -826366247(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r13), w_rdi), + "498BBDD9A6BECE", + "movq -826366247(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r14), w_rdi), + "498BBED9A6BECE", + "movq -826366247(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r15), w_rdi), + "498BBFD9A6BECE", + "movq -826366247(%r15), %rdi", + )); + + // ======================================================== + // Cases aimed at checking Addr-esses: IRRS (Imm + Reg + (Reg << Shift)) + // Note these don't check the case where the index reg is RSP, since we + // don't encode any of those. + // + // Addr_IRRS, offset max simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rax, 0), w_r11), + "4C8B5C007F", + "movq 127(%rax,%rax,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rax, 1), w_r11), + "4C8B5C477F", + "movq 127(%rdi,%rax,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rax, 2), w_r11), + "4D8B5C807F", + "movq 127(%r8,%rax,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rax, 3), w_r11), + "4D8B5CC77F", + "movq 127(%r15,%rax,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rdi, 3), w_r11), + "4C8B5CF87F", + "movq 127(%rax,%rdi,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rdi, 2), w_r11), + "4C8B5CBF7F", + "movq 127(%rdi,%rdi,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rdi, 1), w_r11), + "4D8B5C787F", + "movq 127(%r8,%rdi,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rdi, 0), w_r11), + "4D8B5C3F7F", + "movq 127(%r15,%rdi,1), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset min simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r8, 2), w_r11), + "4E8B5C8080", + "movq -128(%rax,%r8,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r8, 3), w_r11), + "4E8B5CC780", + "movq -128(%rdi,%r8,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r8, 0), w_r11), + "4F8B5C0080", + "movq -128(%r8,%r8,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r8, 1), w_r11), + "4F8B5C4780", + "movq -128(%r15,%r8,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r15, 1), w_r11), + "4E8B5C7880", + "movq -128(%rax,%r15,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r15, 0), w_r11), + "4E8B5C3F80", + "movq -128(%rdi,%r15,1), %r11", + )); + insns.push(( + 
Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r15, 3), w_r11), + "4F8B5CF880", + "movq -128(%r8,%r15,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r15, 2), w_r11), + "4F8B5CBF80", + "movq -128(%r15,%r15,4), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset large positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rax, 0), w_r11), + "4C8B9C00BE25664F", + "movq 1332094398(%rax,%rax,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rax, 1), w_r11), + "4C8B9C47BE25664F", + "movq 1332094398(%rdi,%rax,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rax, 2), w_r11), + "4D8B9C80BE25664F", + "movq 1332094398(%r8,%rax,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rax, 3), w_r11), + "4D8B9CC7BE25664F", + "movq 1332094398(%r15,%rax,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rdi, 3), w_r11), + "4C8B9CF8BE25664F", + "movq 1332094398(%rax,%rdi,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rdi, 2), w_r11), + "4C8B9CBFBE25664F", + "movq 1332094398(%rdi,%rdi,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rdi, 1), w_r11), + "4D8B9C78BE25664F", + "movq 1332094398(%r8,%rdi,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rdi, 0), w_r11), + "4D8B9C3FBE25664F", + "movq 1332094398(%r15,%rdi,1), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset large negative simm32 + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r8, 2), + w_r11, + ), + "4E8B9C8070E9B2D9", + "movq -642586256(%rax,%r8,4), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r8, 3), + w_r11, + ), + "4E8B9CC770E9B2D9", + "movq -642586256(%rdi,%r8,8), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r8, 0), + w_r11, + ), + "4F8B9C0070E9B2D9", + "movq -642586256(%r8,%r8,1), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r8, 1), + w_r11, + ), + "4F8B9C4770E9B2D9", + "movq -642586256(%r15,%r8,2), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r15, 1), + w_r11, + ), + "4E8B9C7870E9B2D9", + "movq -642586256(%rax,%r15,2), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r15, 0), + w_r11, + ), + "4E8B9C3F70E9B2D9", + "movq -642586256(%rdi,%r15,1), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r15, 3), + w_r11, + ), + "4F8B9CF870E9B2D9", + "movq -642586256(%r8,%r15,8), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r15, 2), + w_r11, + ), + "4F8B9CBF70E9B2D9", + "movq -642586256(%r15,%r15,4), %r11", + )); + + // End of test cases for Addr + // ======================================================== + + // ======================================================== + // General tests for each insn. Don't forget to follow the + // guidelines commented just prior to `fn x64_emit`. 
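+ //
+ // Each tuple pairs an instruction with its expected encoding (a hex
+ // string) and its expected pretty-printed form; the Addr cases above use
+ // the same shape. Note for those Addr cases: %rsp and %r12 as base
+ // registers always require a SIB byte (the extra 0x24 byte visible in
+ // their encodings), so they exercise a different encoding path than the
+ // other base registers.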
+ // + // Alu_RMI_R + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::reg(r15), w_rdx), + "4C01FA", + "addq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_r8), + "4101C8", + "addl %ecx, %r8d", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_rsi), + "01CE", + "addl %ecx, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rdx, + ), + "48035763", + "addq 99(%rdi), %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_r8, + ), + "44034763", + "addl 99(%rdi), %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rsi, + ), + "037763", + "addl 99(%rdi), %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_rdx, + ), + "4883C281", + "addq $-127, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_rdx, + ), + "4881C27FFFFFFF", + "addq $-129, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rdx), + "4881C2EAF48F04", + "addq $76543210, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_r8, + ), + "4183C081", + "addl $-127, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_r8, + ), + "4181C07FFFFFFF", + "addl $-129, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-76543210i32 as u32), + w_r8, + ), + "4181C0160B70FB", + "addl $-76543210, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_rsi, + ), + "83C681", + "addl $-127, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_rsi, + ), + "81C67FFFFFFF", + "addl $-129, %esi", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rsi), + "81C6EAF48F04", + "addl $76543210, %esi", + )); + // This is pretty feeble + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Sub, RegMemImm::reg(r15), w_rdx), + "4C29FA", + "subq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::And, RegMemImm::reg(r15), w_rdx), + "4C21FA", + "andq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Or, RegMemImm::reg(r15), w_rdx), + "4C09FA", + "orq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx), + "4C31FA", + "xorq %r15, %rdx", + )); + // Test all mul cases, though + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::reg(r15), w_rdx), + "490FAFD7", + "imulq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_r8), + "440FAFC1", + "imull %ecx, %r8d", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_rsi), + "0FAFF1", + "imull %ecx, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rdx, + ), + "480FAF5763", + "imulq 99(%rdi), %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_r8, + ), + 
"440FAF4763", + "imull 99(%rdi), %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rsi, + ), + "0FAF7763", + "imull 99(%rdi), %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_rdx, + ), + "486BD281", + "imulq $-127, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_rdx, + ), + "4869D27FFFFFFF", + "imulq $-129, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rdx), + "4869D2EAF48F04", + "imulq $76543210, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_r8, + ), + "456BC081", + "imull $-127, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_r8, + ), + "4569C07FFFFFFF", + "imull $-129, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-76543210i32 as u32), + w_r8, + ), + "4569C0160B70FB", + "imull $-76543210, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_rsi, + ), + "6BF681", + "imull $-127, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_rsi, + ), + "69F67FFFFFFF", + "imull $-129, %esi", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rsi), + "69F6EAF48F04", + "imull $76543210, %esi", + )); + + // ======================================================== + // UnaryRmR + + insns.push(( + Inst::unary_rm_r(4, UnaryRmROpcode::Bsr, RegMem::reg(rsi), w_rdi), + "0FBDFE", + "bsrl %esi, %edi", + )); + insns.push(( + Inst::unary_rm_r(8, UnaryRmROpcode::Bsr, RegMem::reg(r15), w_rax), + "490FBDC7", + "bsrq %r15, %rax", + )); + + // ======================================================== + // Not + insns.push(( + Inst::not(4, Writable::from_reg(regs::rsi())), + "F7D6", + "notl %esi", + )); + insns.push(( + Inst::not(8, Writable::from_reg(regs::r15())), + "49F7D7", + "notq %r15", + )); + insns.push(( + Inst::not(4, Writable::from_reg(regs::r14())), + "41F7D6", + "notl %r14d", + )); + insns.push(( + Inst::not(2, Writable::from_reg(regs::rdi())), + "66F7D7", + "notw %di", + )); + + // ======================================================== + // Neg + insns.push(( + Inst::neg(4, Writable::from_reg(regs::rsi())), + "F7DE", + "negl %esi", + )); + insns.push(( + Inst::neg(8, Writable::from_reg(regs::r15())), + "49F7DF", + "negq %r15", + )); + insns.push(( + Inst::neg(4, Writable::from_reg(regs::r14())), + "41F7DE", + "negl %r14d", + )); + insns.push(( + Inst::neg(2, Writable::from_reg(regs::rdi())), + "66F7DF", + "negw %di", + )); + + // ======================================================== + // Div + insns.push(( + Inst::div(4, true /*signed*/, RegMem::reg(regs::rsi())), + "F7FE", + "idiv %esi", + )); + insns.push(( + Inst::div(8, true /*signed*/, RegMem::reg(regs::r15())), + "49F7FF", + "idiv %r15", + )); + insns.push(( + Inst::div(4, false /*signed*/, RegMem::reg(regs::r14())), + "41F7F6", + "div %r14d", + )); + insns.push(( + Inst::div(8, false /*signed*/, RegMem::reg(regs::rdi())), + "48F7F7", + "div %rdi", + )); + + // ======================================================== + // MulHi + insns.push(( + Inst::mul_hi(4, true /*signed*/, RegMem::reg(regs::rsi())), + "F7EE", + "imul %esi", + )); + 
insns.push(( + Inst::mul_hi(8, true /*signed*/, RegMem::reg(regs::r15())), + "49F7EF", + "imul %r15", + )); + insns.push(( + Inst::mul_hi(4, false /*signed*/, RegMem::reg(regs::r14())), + "41F7E6", + "mul %r14d", + )); + insns.push(( + Inst::mul_hi(8, false /*signed*/, RegMem::reg(regs::rdi())), + "48F7E7", + "mul %rdi", + )); + + // ======================================================== + // cbw + insns.push((Inst::sign_extend_data(1), "6698", "cbw")); + + // ======================================================== + // cdq family: SignExtendRaxRdx + insns.push((Inst::sign_extend_data(2), "6699", "cwd")); + insns.push((Inst::sign_extend_data(4), "99", "cdq")); + insns.push((Inst::sign_extend_data(8), "4899", "cqo")); + + // ======================================================== + // Imm_R + // + insns.push(( + Inst::imm(OperandSize::Size32, 1234567, w_r14), + "41BE87D61200", + "movl $1234567, %r14d", + )); + insns.push(( + Inst::imm(OperandSize::Size32, -126i64 as u64, w_r14), + "41BE82FFFFFF", + "movl $-126, %r14d", + )); + insns.push(( + Inst::imm(OperandSize::Size64, 1234567898765, w_r14), + "49BE8D26FB711F010000", + "movabsq $1234567898765, %r14", + )); + insns.push(( + Inst::imm(OperandSize::Size64, -126i64 as u64, w_r14), + "49C7C682FFFFFF", + "movabsq $-126, %r14", + )); + insns.push(( + Inst::imm(OperandSize::Size32, 1234567, w_rcx), + "B987D61200", + "movl $1234567, %ecx", + )); + insns.push(( + Inst::imm(OperandSize::Size32, -126i64 as u64, w_rcx), + "B982FFFFFF", + "movl $-126, %ecx", + )); + insns.push(( + Inst::imm(OperandSize::Size64, 1234567898765, w_rsi), + "48BE8D26FB711F010000", + "movabsq $1234567898765, %rsi", + )); + insns.push(( + Inst::imm(OperandSize::Size64, -126i64 as u64, w_rbx), + "48C7C382FFFFFF", + "movabsq $-126, %rbx", + )); + + // ======================================================== + // Mov_R_R + insns.push(( + Inst::mov_r_r(false, rbx, w_rsi), + "89DE", + "movl %ebx, %esi", + )); + insns.push(( + Inst::mov_r_r(false, rbx, w_r9), + "4189D9", + "movl %ebx, %r9d", + )); + insns.push(( + Inst::mov_r_r(false, r11, w_rsi), + "4489DE", + "movl %r11d, %esi", + )); + insns.push(( + Inst::mov_r_r(false, r12, w_r9), + "4589E1", + "movl %r12d, %r9d", + )); + insns.push(( + Inst::mov_r_r(true, rbx, w_rsi), + "4889DE", + "movq %rbx, %rsi", + )); + insns.push(( + Inst::mov_r_r(true, rbx, w_r9), + "4989D9", + "movq %rbx, %r9", + )); + insns.push(( + Inst::mov_r_r(true, r11, w_rsi), + "4C89DE", + "movq %r11, %rsi", + )); + insns.push(( + Inst::mov_r_r(true, r12, w_r9), + "4D89E1", + "movq %r12, %r9", + )); + + // ======================================================== + // MovZX_RM_R + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi), + "400FB6FF", + "movzbl %dil, %edi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rax), w_rsi), + "0FB6F0", + "movzbl %al, %esi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(r15), w_rsi), + "410FB6F7", + "movzbl %r15b, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FB671F9", + "movzbl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FB658F9", + "movzbl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FB64AF9", + "movzbl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + 
RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FB653F9", + "movzbl -7(%r11), %edx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(rax), w_rsi), + "480FB6F0", + "movzbq %al, %rsi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(r10), w_rsi), + "490FB6F2", + "movzbq %r10b, %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FB671F9", + "movzbq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FB658F9", + "movzbq -7(%r8), %rbx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FB64AF9", + "movzbq -7(%r10), %r9", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FB653F9", + "movzbq -7(%r11), %rdx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi), + "0FB7F1", + "movzwl %cx, %esi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(r10), w_rsi), + "410FB7F2", + "movzwl %r10w, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FB771F9", + "movzwl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FB758F9", + "movzwl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FB74AF9", + "movzwl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FB753F9", + "movzwl -7(%r11), %edx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi), + "480FB7F1", + "movzwq %cx, %rsi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(r11), w_rsi), + "490FB7F3", + "movzwq %r11w, %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FB771F9", + "movzwq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FB758F9", + "movzwq -7(%r8), %rbx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FB74AF9", + "movzwq -7(%r10), %r9", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FB753F9", + "movzwq -7(%r11), %rdx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi), + "8BF1", + "movl %ecx, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "8B71F9", + "movl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "418B58F9", + "movl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "458B4AF9", + "movl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "418B53F9", + "movl -7(%r11), %edx", + )); + + // ======================================================== + 
// Mov64_M_R + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_rcx), + "488B8C18B3000000", + "movq 179(%rax,%rbx,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_r8), + "4C8B8418B3000000", + "movq 179(%rax,%rbx,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_rcx), + "4A8B8C08B3000000", + "movq 179(%rax,%r9,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_r8), + "4E8B8408B3000000", + "movq 179(%rax,%r9,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_rcx), + "498B8C1AB3000000", + "movq 179(%r10,%rbx,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_r8), + "4D8B841AB3000000", + "movq 179(%r10,%rbx,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_rcx), + "4B8B8C0AB3000000", + "movq 179(%r10,%r9,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8), + "4F8B840AB3000000", + "movq 179(%r10,%r9,1), %r8", + )); + + // ======================================================== + // LoadEffectiveAddress + insns.push(( + Inst::lea(Amode::imm_reg(42, r10), w_r8), + "4D8D422A", + "lea 42(%r10), %r8", + )); + insns.push(( + Inst::lea(Amode::imm_reg(42, r10), w_r15), + "4D8D7A2A", + "lea 42(%r10), %r15", + )); + insns.push(( + Inst::lea(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8), + "4F8D840AB3000000", + "lea 179(%r10,%r9,1), %r8", + )); + insns.push(( + Inst::lea(Amode::rip_relative(MachLabel::from_block(0)), w_rdi), + "488D3D00000000", + "lea label0(%rip), %rdi", + )); + + // ======================================================== + // MovSX_RM_R + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi), + "400FBEFF", + "movsbl %dil, %edi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rcx), w_rsi), + "0FBEF1", + "movsbl %cl, %esi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(r14), w_rsi), + "410FBEF6", + "movsbl %r14b, %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FBE71F9", + "movsbl -7(%rcx), %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FBE58F9", + "movsbl -7(%r8), %ebx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FBE4AF9", + "movsbl -7(%r10), %r9d", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FBE53F9", + "movsbl -7(%r11), %edx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(rcx), w_rsi), + "480FBEF1", + "movsbq %cl, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(r15), w_rsi), + "490FBEF7", + "movsbq %r15b, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FBE71F9", + "movsbq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FBE58F9", + "movsbq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FBE4AF9", + "movsbq -7(%r10), %r9", + )); + insns.push(( + 
Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FBE53F9", + "movsbq -7(%r11), %rdx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi), + "0FBFF1", + "movswl %cx, %esi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(r14), w_rsi), + "410FBFF6", + "movswl %r14w, %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FBF71F9", + "movswl -7(%rcx), %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FBF58F9", + "movswl -7(%r8), %ebx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FBF4AF9", + "movswl -7(%r10), %r9d", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FBF53F9", + "movswl -7(%r11), %edx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi), + "480FBFF1", + "movswq %cx, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(r13), w_rsi), + "490FBFF5", + "movswq %r13w, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FBF71F9", + "movswq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FBF58F9", + "movswq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FBF4AF9", + "movswq -7(%r10), %r9", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FBF53F9", + "movswq -7(%r11), %rdx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi), + "4863F1", + "movslq %ecx, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(r15), w_rsi), + "4963F7", + "movslq %r15d, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "486371F9", + "movslq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "496358F9", + "movslq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D634AF9", + "movslq -7(%r10), %r9", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "496353F9", + "movslq -7(%r11), %rdx", + )); + + // ======================================================== + // Mov_R_M. Byte stores are tricky. Check everything carefully. 
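+ //
+ // (Why byte stores are tricky: without a REX prefix, byte-register
+ // encodings 4..7 name %ah/%ch/%dh/%bh, so storing %spl/%bpl/%sil/%dil
+ // needs a REX prefix even when no REX bit is set, e.g. the leading 40
+ // in the "movb %sil, 99(%rax)" case below.)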
+ insns.push(( + Inst::mov_r_m(8, rax, Amode::imm_reg(99, rdi)), + "48894763", + "movq %rax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(8, rbx, Amode::imm_reg(99, r8)), + "49895863", + "movq %rbx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(8, rcx, Amode::imm_reg(99, rsi)), + "48894E63", + "movq %rcx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(8, rdx, Amode::imm_reg(99, r9)), + "49895163", + "movq %rdx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(8, rsi, Amode::imm_reg(99, rax)), + "48897063", + "movq %rsi, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(8, rdi, Amode::imm_reg(99, r15)), + "49897F63", + "movq %rdi, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(8, rsp, Amode::imm_reg(99, rcx)), + "48896163", + "movq %rsp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(8, rbp, Amode::imm_reg(99, r14)), + "49896E63", + "movq %rbp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(8, r8, Amode::imm_reg(99, rdi)), + "4C894763", + "movq %r8, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(8, r9, Amode::imm_reg(99, r8)), + "4D894863", + "movq %r9, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(8, r10, Amode::imm_reg(99, rsi)), + "4C895663", + "movq %r10, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(8, r11, Amode::imm_reg(99, r9)), + "4D895963", + "movq %r11, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(8, r12, Amode::imm_reg(99, rax)), + "4C896063", + "movq %r12, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(8, r13, Amode::imm_reg(99, r15)), + "4D896F63", + "movq %r13, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(8, r14, Amode::imm_reg(99, rcx)), + "4C897163", + "movq %r14, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(8, r15, Amode::imm_reg(99, r14)), + "4D897E63", + "movq %r15, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(4, rax, Amode::imm_reg(99, rdi)), + "894763", + "movl %eax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(4, rbx, Amode::imm_reg(99, r8)), + "41895863", + "movl %ebx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(4, rcx, Amode::imm_reg(99, rsi)), + "894E63", + "movl %ecx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(4, rdx, Amode::imm_reg(99, r9)), + "41895163", + "movl %edx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(4, rsi, Amode::imm_reg(99, rax)), + "897063", + "movl %esi, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(4, rdi, Amode::imm_reg(99, r15)), + "41897F63", + "movl %edi, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(4, rsp, Amode::imm_reg(99, rcx)), + "896163", + "movl %esp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(4, rbp, Amode::imm_reg(99, r14)), + "41896E63", + "movl %ebp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(4, r8, Amode::imm_reg(99, rdi)), + "44894763", + "movl %r8d, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(4, r9, Amode::imm_reg(99, r8)), + "45894863", + "movl %r9d, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(4, r10, Amode::imm_reg(99, rsi)), + "44895663", + "movl %r10d, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(4, r11, Amode::imm_reg(99, r9)), + "45895963", + "movl %r11d, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(4, r12, Amode::imm_reg(99, rax)), + "44896063", + "movl %r12d, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(4, r13, Amode::imm_reg(99, r15)), + "45896F63", + "movl %r13d, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(4, r14, Amode::imm_reg(99, rcx)), + "44897163", + "movl %r14d, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(4, r15, Amode::imm_reg(99, r14)), + "45897E63", + "movl %r15d, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(2, rax, 
Amode::imm_reg(99, rdi)), + "66894763", + "movw %ax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(2, rbx, Amode::imm_reg(99, r8)), + "6641895863", + "movw %bx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(2, rcx, Amode::imm_reg(99, rsi)), + "66894E63", + "movw %cx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(2, rdx, Amode::imm_reg(99, r9)), + "6641895163", + "movw %dx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(2, rsi, Amode::imm_reg(99, rax)), + "66897063", + "movw %si, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(2, rdi, Amode::imm_reg(99, r15)), + "6641897F63", + "movw %di, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(2, rsp, Amode::imm_reg(99, rcx)), + "66896163", + "movw %sp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(2, rbp, Amode::imm_reg(99, r14)), + "6641896E63", + "movw %bp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(2, r8, Amode::imm_reg(99, rdi)), + "6644894763", + "movw %r8w, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(2, r9, Amode::imm_reg(99, r8)), + "6645894863", + "movw %r9w, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(2, r10, Amode::imm_reg(99, rsi)), + "6644895663", + "movw %r10w, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(2, r11, Amode::imm_reg(99, r9)), + "6645895963", + "movw %r11w, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(2, r12, Amode::imm_reg(99, rax)), + "6644896063", + "movw %r12w, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(2, r13, Amode::imm_reg(99, r15)), + "6645896F63", + "movw %r13w, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(2, r14, Amode::imm_reg(99, rcx)), + "6644897163", + "movw %r14w, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(2, r15, Amode::imm_reg(99, r14)), + "6645897E63", + "movw %r15w, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(1, rax, Amode::imm_reg(99, rdi)), + "884763", + "movb %al, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(1, rbx, Amode::imm_reg(99, r8)), + "41885863", + "movb %bl, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(1, rcx, Amode::imm_reg(99, rsi)), + "884E63", + "movb %cl, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(1, rdx, Amode::imm_reg(99, r9)), + "41885163", + "movb %dl, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(1, rsi, Amode::imm_reg(99, rax)), + "40887063", + "movb %sil, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(1, rdi, Amode::imm_reg(99, r15)), + "41887F63", + "movb %dil, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(1, rsp, Amode::imm_reg(99, rcx)), + "40886163", + "movb %spl, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(1, rbp, Amode::imm_reg(99, r14)), + "41886E63", + "movb %bpl, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(1, r8, Amode::imm_reg(99, rdi)), + "44884763", + "movb %r8b, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(1, r9, Amode::imm_reg(99, r8)), + "45884863", + "movb %r9b, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(1, r10, Amode::imm_reg(99, rsi)), + "44885663", + "movb %r10b, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(1, r11, Amode::imm_reg(99, r9)), + "45885963", + "movb %r11b, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(1, r12, Amode::imm_reg(99, rax)), + "44886063", + "movb %r12b, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(1, r13, Amode::imm_reg(99, r15)), + "45886F63", + "movb %r13b, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(1, r14, Amode::imm_reg(99, rcx)), + "44887163", + "movb %r14b, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(1, r15, Amode::imm_reg(99, r14)), + "45887E63", + "movb %r15b, 99(%r14)", + )); + + // ======================================================== + // 
Shift_R + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_rdi), + "D3E7", + "shll %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_r12), + "41D3E4", + "shll %cl, %r12d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, Some(2), w_r8), + "41C1E002", + "shll $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, Some(31), w_r13), + "41C1E51F", + "shll $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_r13), + "49D3E5", + "shlq %cl, %r13", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_rdi), + "48D3E7", + "shlq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), w_r8), + "49C1E002", + "shlq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(3), w_rbx), + "48C1E303", + "shlq $3, %rbx", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(63), w_r13), + "49C1E53F", + "shlq $63, %r13", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, None, w_rdi), + "D3EF", + "shrl %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(2), w_r8), + "41C1E802", + "shrl $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(31), w_r13), + "41C1ED1F", + "shrl $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, w_rdi), + "48D3EF", + "shrq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(2), w_r8), + "49C1E802", + "shrq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(63), w_r13), + "49C1ED3F", + "shrq $63, %r13", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, None, w_rdi), + "D3FF", + "sarl %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(2), w_r8), + "41C1F802", + "sarl $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(31), w_r13), + "41C1FD1F", + "sarl $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, None, w_rdi), + "48D3FF", + "sarq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(2), w_r8), + "49C1F802", + "sarq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(63), w_r13), + "49C1FD3F", + "sarq $63, %r13", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::RotateLeft, None, w_r8), + "49D3C0", + "rolq %cl, %r8", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::RotateLeft, Some(3), w_r9), + "41C1C103", + "roll $3, %r9d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::RotateRight, None, w_rsi), + "D3CE", + "rorl %cl, %esi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::RotateRight, Some(5), w_r15), + "49C1CF05", + "rorq $5, %r15", + )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi), + "D2CE", + "rorb %cl, %sil", + )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15), + "41C0CF05", + "rorb $5, %r15b", + )); + insns.push(( + Inst::shift_r(2, ShiftKind::RotateRight, None, w_rsi), + "66D3CE", + "rorw %cl, %si", + )); + insns.push(( + Inst::shift_r(2, ShiftKind::RotateRight, Some(5), w_r15), + "6641C1CF05", + "rorw $5, %r15w", + )); + + // ======================================================== + // CmpRMIR + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::reg(r15), rdx), + "4C39FA", + "cmpq %r15, %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, 
RegMemImm::reg(rcx), r8), + "4939C8", + "cmpq %rcx, %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), rsi), + "4839CE", + "cmpq %rcx, %rsi", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "483B5763", + "cmpq 99(%rdi), %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "4C3B4763", + "cmpq 99(%rdi), %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "483B7763", + "cmpq 99(%rdi), %rsi", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rdx), + "4881FAEAF48F04", + "cmpq $76543210, %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(-76543210i32 as u32), r8), + "4981F8160B70FB", + "cmpq $-76543210, %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rsi), + "4881FEEAF48F04", + "cmpq $76543210, %rsi", + )); + // + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(r15), rdx), + "4439FA", + "cmpl %r15d, %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), r8), + "4139C8", + "cmpl %ecx, %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), rsi), + "39CE", + "cmpl %ecx, %esi", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "3B5763", + "cmpl 99(%rdi), %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "443B4763", + "cmpl 99(%rdi), %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "3B7763", + "cmpl 99(%rdi), %esi", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rdx), + "81FAEAF48F04", + "cmpl $76543210, %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(-76543210i32 as u32), r8), + "4181F8160B70FB", + "cmpl $-76543210, %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rsi), + "81FEEAF48F04", + "cmpl $76543210, %esi", + )); + // + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(r15), rdx), + "664439FA", + "cmpw %r15w, %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), r8), + "664139C8", + "cmpw %cx, %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), rsi), + "6639CE", + "cmpw %cx, %si", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "663B5763", + "cmpw 99(%rdi), %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "66443B4763", + "cmpw 99(%rdi), %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "663B7763", + "cmpw 99(%rdi), %si", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(23210), rdx), + "6681FAAA5A", + "cmpw $23210, %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(-7654i32 as u32), r8), + "664181F81AE2", + "cmpw $-7654, %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(7654), rsi), + "6681FEE61D", + "cmpw $7654, %si", + )); + // + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r15), rdx), + "4438FA", + "cmpb %r15b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r8), + "4138C8", + "cmpb %cl, %r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi), + "4038CE", + "cmpb %cl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "3A5763", + "cmpb 99(%rdi), %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "443A4763", + "cmpb 99(%rdi), 
%r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "403A7763", + "cmpb 99(%rdi), %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(70), rdx), + "80FA46", + "cmpb $70, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(-76i32 as u32), r8), + "4180F8B4", + "cmpb $-76, %r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(76), rsi), + "4080FE4C", + "cmpb $76, %sil", + )); + // Extra byte-cases (paranoia!) for cmp_rmi_r for first operand = R + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rax), rbx), + "38C3", + "cmpb %al, %bl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbx), rax), + "38D8", + "cmpb %bl, %al", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rdx), + "38CA", + "cmpb %cl, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi), + "4038CE", + "cmpb %cl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r10), + "4138CA", + "cmpb %cl, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r14), + "4138CE", + "cmpb %cl, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rdx), + "4038EA", + "cmpb %bpl, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rsi), + "4038EE", + "cmpb %bpl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r10), + "4138EA", + "cmpb %bpl, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r14), + "4138EE", + "cmpb %bpl, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rdx), + "4438CA", + "cmpb %r9b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rsi), + "4438CE", + "cmpb %r9b, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r10), + "4538CA", + "cmpb %r9b, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r14), + "4538CE", + "cmpb %r9b, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rdx), + "4438EA", + "cmpb %r13b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rsi), + "4438EE", + "cmpb %r13b, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r10), + "4538EA", + "cmpb %r13b, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r14), + "4538EE", + "cmpb %r13b, %r14b", + )); + + // ======================================================== + // SetCC + insns.push((Inst::setcc(CC::O, w_rsi), "400F90C6", "seto %sil")); + insns.push((Inst::setcc(CC::NLE, w_rsi), "400F9FC6", "setnle %sil")); + insns.push((Inst::setcc(CC::Z, w_r14), "410F94C6", "setz %r14b")); + insns.push((Inst::setcc(CC::LE, w_r14), "410F9EC6", "setle %r14b")); + insns.push((Inst::setcc(CC::P, w_r9), "410F9AC1", "setp %r9b")); + insns.push((Inst::setcc(CC::NP, w_r8), "410F9BC0", "setnp %r8b")); + // ======================================================== + // Cmove + insns.push(( + Inst::cmove(2, CC::O, RegMem::reg(rdi), w_rsi), + "660F40F7", + "cmovow %di, %si", + )); + insns.push(( + Inst::cmove( + 2, + CC::NO, + RegMem::mem(Amode::imm_reg_reg_shift(37, rdi, rsi, 2)), + w_r15, + ), + "66440F417CB725", + "cmovnow 37(%rdi,%rsi,4), %r15w", + )); + insns.push(( + Inst::cmove(4, CC::LE, RegMem::reg(rdi), w_rsi), + "0F4EF7", + "cmovlel %edi, %esi", + )); + insns.push(( + Inst::cmove(4, CC::NLE, RegMem::mem(Amode::imm_reg(0, r15)), w_rsi), + "410F4F37", + "cmovnlel 0(%r15), %esi", + )); + insns.push(( + Inst::cmove(8, CC::Z, RegMem::reg(rdi), w_r14), + "4C0F44F7", + "cmovzq %rdi, %r14", + )); + 
insns.push(( + Inst::cmove(8, CC::NZ, RegMem::mem(Amode::imm_reg(13, rdi)), w_r14), + "4C0F45770D", + "cmovnzq 13(%rdi), %r14", + )); + + // ======================================================== + // Push64 + insns.push((Inst::push64(RegMemImm::reg(rdi)), "57", "pushq %rdi")); + insns.push((Inst::push64(RegMemImm::reg(r8)), "4150", "pushq %r8")); + insns.push(( + Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FFB4CE41010000", + "pushq 321(%rsi,%rcx,8)", + )); + insns.push(( + Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, r9, rbx, 2))), + "41FFB49941010000", + "pushq 321(%r9,%rbx,4)", + )); + insns.push((Inst::push64(RegMemImm::imm(0)), "6A00", "pushq $0")); + insns.push((Inst::push64(RegMemImm::imm(127)), "6A7F", "pushq $127")); + insns.push(( + Inst::push64(RegMemImm::imm(128)), + "6880000000", + "pushq $128", + )); + insns.push(( + Inst::push64(RegMemImm::imm(0x31415927)), + "6827594131", + "pushq $826366247", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-128i32 as u32)), + "6A80", + "pushq $-128", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-129i32 as u32)), + "687FFFFFFF", + "pushq $-129", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-0x75c4e8a1i32 as u32)), + "685F173B8A", + "pushq $-1975838881", + )); + + // ======================================================== + // Pop64 + insns.push((Inst::pop64(w_rax), "58", "popq %rax")); + insns.push((Inst::pop64(w_rdi), "5F", "popq %rdi")); + insns.push((Inst::pop64(w_r8), "4158", "popq %r8")); + insns.push((Inst::pop64(w_r15), "415F", "popq %r15")); + + // ======================================================== + // CallKnown + insns.push(( + Inst::call_known( + ExternalName::User { + namespace: 0, + index: 0, + }, + Vec::new(), + Vec::new(), + Opcode::Call, + ), + "E800000000", + "call User { namespace: 0, index: 0 }", + )); + + // ======================================================== + // CallUnknown + fn call_unknown(rm: RegMem) -> Inst { + Inst::call_unknown(rm, Vec::new(), Vec::new(), Opcode::CallIndirect) + } + + insns.push((call_unknown(RegMem::reg(rbp)), "FFD5", "call *%rbp")); + insns.push((call_unknown(RegMem::reg(r11)), "41FFD3", "call *%r11")); + insns.push(( + call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FF94CE41010000", + "call *321(%rsi,%rcx,8)", + )); + insns.push(( + call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))), + "41FF949241010000", + "call *321(%r10,%rdx,4)", + )); + + // ======================================================== + // Ret + insns.push((Inst::ret(), "C3", "ret")); + + // ======================================================== + // JmpKnown skipped for now + + // ======================================================== + // JmpCondSymm isn't a real instruction + + // ======================================================== + // JmpCond skipped for now + + // ======================================================== + // JmpCondCompound isn't a real instruction + + // ======================================================== + // JmpUnknown + insns.push((Inst::jmp_unknown(RegMem::reg(rbp)), "FFE5", "jmp *%rbp")); + insns.push(( + Inst::jmp_unknown(RegMem::reg(r11)), + "41FFE3", + "jmp *%r11", + )); + insns.push(( + Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FFA4CE41010000", + "jmp *321(%rsi,%rcx,8)", + )); + insns.push(( + Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))), + "41FFA49241010000", + "jmp *321(%r10,%rdx,4)", + )); + + 
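+ // Note on the XMM cases below: the mandatory prefix selects the form
+ // (F3 = scalar single, F2 = scalar double, 66 = packed integer, no
+ // prefix = packed single), and a REX prefix extends the register fields
+ // to reach %xmm8..%xmm15, e.g. "F3450F58EB" for addss %xmm11, %xmm13.
+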
// ======================================================== + // XMM_CMP_RM_R + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm1), xmm2), + "0F2ED1", + "ucomiss %xmm1, %xmm2", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm0), xmm9), + "440F2EC8", + "ucomiss %xmm0, %xmm9", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm13), xmm4), + "66410F2EE5", + "ucomisd %xmm13, %xmm4", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm11), xmm12), + "66450F2EE3", + "ucomisd %xmm11, %xmm12", + )); + + // ======================================================== + // XMM_RM_R: float binary ops + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0), + "F30F58C1", + "addss %xmm1, %xmm0", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13), + "F3450F58EB", + "addss %xmm11, %xmm13", + )); + insns.push(( + Inst::xmm_rm_r( + SseOpcode::Addss, + RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)), + w_xmm0, + ), + "F3410F5844927B", + "addss 123(%r10,%rdx,4), %xmm0", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4), + "F2410F58E7", + "addsd %xmm15, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1), + "F30F5CC8", + "subss %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1), + "F3410F5CCC", + "subss %xmm12, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r( + SseOpcode::Subss, + RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)), + w_xmm10, + ), + "F3450F5C94C241010000", + "subss 321(%r10,%rax,8), %xmm10", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14), + "F2440F5CF5", + "subsd %xmm5, %xmm14", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4), + "F30F59E5", + "mulss %xmm5, %xmm4", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4), + "F20F59E5", + "mulsd %xmm5, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7), + "F3410F5EF8", + "divss %xmm8, %xmm7", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4), + "F20F5EE5", + "divsd %xmm5, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12), + "440F54E3", + "andps %xmm3, %xmm12", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11), + "440F55DC", + "andnps %xmm4, %xmm11", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15), + "440F56F9", + "orps %xmm1, %xmm15", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4), + "0F56E5", + "orps %xmm5, %xmm4", + )); + + // ======================================================== + // XMM_RM_R: Integer Packed + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5), + "66410FFCE9", + "paddb %xmm9, %xmm5", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6), + "660FFDF7", + "paddw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13), + "66450FFEEC", + "paddd %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8), + "66440FD4C1", + "paddq %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5), + "66410FECE9", + "paddsb %xmm9, %xmm5", + 
)); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6), + "660FEDF7", + "paddsw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13), + "66450FDCEC", + "paddusb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8), + "66440FDDC1", + "paddusw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + "66410FE8E9", + "psubsb %xmm9, %xmm5", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + "660FE9F7", + "psubsw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + "66450FD8EC", + "psubusb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + "66440FD9C1", + "psubusw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), + "66450FE0EC", + "pavgb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8), + "66440FE3C1", + "pavgw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9), + "66440FF8CD", + "psubb %xmm5, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7), + "660FF9FE", + "psubw %xmm6, %xmm7", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12), + "66450FFAE5", + "psubd %xmm13, %xmm12", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1), + "66410FFBC8", + "psubq %xmm8, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + "66410F3840F7", + "pmulld %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + "66410FD5CE", + "pmullw %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + "66450FF4C8", + "pmuludq %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), + "66410F383CF7", + "pmaxsb %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6), + "66410FEEF7", + "pmaxsw %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6), + "66410F383DF7", + "pmaxsd %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1), + "66410FDECE", + "pmaxub %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1), + "66410F383ECE", + "pmaxuw %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1), + "66410F383FCE", + "pmaxud %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9), + "66450F3838C8", + "pminsb %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9), + "66450FEAC8", + "pminsw %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9), + "66450F3839C8", + "pminsd %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2), + "660FDAD3", + "pminub %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2), + "660F383AD3", + "pminuw %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminud, 
RegMem::reg(xmm3), w_xmm2), + "660F383BD3", + "pminud %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2), + "66410FEFD3", + "pxor %xmm11, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2), + "66410F3800D3", + "pshufb %xmm11, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2), + "66410F63D3", + "packsswb %xmm11, %xmm2", + )); + + // ======================================================== + // XMM_RM_R: Integer Conversion + insns.push(( + Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::reg(xmm1), w_xmm8), + "440F5BC1", + "cvtdq2ps %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::reg(xmm9), w_xmm8), + "F3450F5BC1", + "cvttps2dq %xmm9, %xmm8", + )); + + // XMM_Mov_R_M: float stores + insns.push(( + Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)), + "F3450F11BC2480000000", + "movss %xmm15, 128(%r12)", + )); + insns.push(( + Inst::xmm_mov_r_m(SseOpcode::Movsd, xmm1, Amode::imm_reg(0, rsi)), + "F20F110E", + "movsd %xmm1, 0(%rsi)", + )); + + // XmmUnary: moves and unary float ops + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movss, RegMem::reg(xmm13), w_xmm2), + "F3410F10D5", + "movss %xmm13, %xmm2", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm0), w_xmm1), + "F20F10C8", + "movsd %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r( + SseOpcode::Movsd, + RegMem::mem(Amode::imm_reg(0, rsi)), + w_xmm2, + ), + "F20F1016", + "movsd 0(%rsi), %xmm2", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm14), w_xmm3), + "F2410F10DE", + "movsd %xmm14, %xmm3", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movaps, RegMem::reg(xmm5), w_xmm14), + "440F28F5", + "movaps %xmm5, %xmm14", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Sqrtss, RegMem::reg(xmm7), w_xmm8), + "F3440F51C7", + "sqrtss %xmm7, %xmm8", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Sqrtsd, RegMem::reg(xmm1), w_xmm2), + "F20F51D1", + "sqrtsd %xmm1, %xmm2", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, RegMem::reg(xmm0), w_xmm1), + "F30F5AC8", + "cvtss2sd %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, RegMem::reg(xmm1), w_xmm0), + "F20F5AC1", + "cvtsd2ss %xmm1, %xmm0", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsb, RegMem::reg(xmm2), w_xmm1), + "660F381CCA", + "pabsb %xmm2, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsw, RegMem::reg(xmm0), w_xmm0), + "660F381DC0", + "pabsw %xmm0, %xmm0", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsd, RegMem::reg(xmm10), w_xmm11), + "66450F381EDA", + "pabsd %xmm10, %xmm11", + )); + + // Xmm to int conversions, and conversely. 
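+ // A note on the expected encodings in this group: the OperandSize argument controls
+ // only the REX.W bit. The Size64 forms below carry a REX prefix with W set (48, or
+ // 4C when an extended register also needs REX.R, as for %r15 and %xmm15), while the
+ // Size32 forms either have no REX at all or carry one solely because an extended
+ // register is involved (the 41 in the pmovmskb %xmm10 case).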
+ + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movd, xmm0, w_rsi, OperandSize::Size32), + "660F7EC6", + "movd %xmm0, %esi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movq, xmm2, w_rdi, OperandSize::Size64), + "66480F7ED7", + "movq %xmm2, %rdi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rsi, OperandSize::Size32), + "F30F2CF0", + "cvttss2si %xmm0, %esi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rdi, OperandSize::Size64), + "F3480F2CF8", + "cvttss2si %xmm0, %rdi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_rax, OperandSize::Size32), + "F20F2CC0", + "cvttsd2si %xmm0, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_r15, OperandSize::Size64), + "F24C0F2CF8", + "cvttsd2si %xmm0, %r15", + )); + + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32), + "66410FD7C2", + "pmovmskb %xmm10, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32), + "0F50C2", + "movmskps %xmm2, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32), + "660F50C8", + "movmskpd %xmm0, %ecx", + )); + + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(rax), + OperandSize::Size32, + w_xmm15, + ), + "66440F6EF8", + "movd %eax, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::mem(Amode::imm_reg(2, r10)), + OperandSize::Size32, + w_xmm9, + ), + "66450F6E4A02", + "movd 2(%r10), %xmm9", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(rsi), + OperandSize::Size32, + w_xmm1, + ), + "660F6ECE", + "movd %esi, %xmm1", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(rdi), + OperandSize::Size64, + w_xmm15, + ), + "664C0F6EFF", + "movq %rdi, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Cvtsi2ss, + RegMem::reg(rdi), + OperandSize::Size32, + w_xmm15, + ), + "F3440F2AFF", + "cvtsi2ss %edi, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Cvtsi2sd, + RegMem::reg(rsi), + OperandSize::Size64, + w_xmm1, + ), + "F2480F2ACE", + "cvtsi2sd %rsi, %xmm1", + )); + + // ======================================================== + // XmmRmi + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Psraw, RegMemImm::reg(xmm10), w_xmm1), + "66410FE1CA", + "psraw %xmm10, %xmm1", + )); + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(31), w_xmm1), + "660F72F11F", + "pslld $31, %xmm1", + )); + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Psrlq, RegMemImm::imm(1), w_xmm3), + "660F73D301", + "psrlq $1, %xmm3", + )); + + // ======================================================== + // XmmRmRImm + insns.push(( + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), + "660FC2CD02", + "cmppd $2, %xmm5, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), + "410FC2FF00", + "cmpps $0, %xmm15, %xmm7", + )); + + // ======================================================== + // Pertaining to atomics. + let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into(); + // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing + // for retention of the apparently-redundant rex prefix in the 8-bit case. + let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into(); + + // A general 8-bit case. 
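+ // (Why the REX byte matters with an 8-bit operand: %spl, %bpl, %sil and %dil are
+ // encodable only when a REX prefix is present, even an otherwise empty one (0x40);
+ // without REX, register numbers 4-7 select %ah/%ch/%dh/%bh instead. Hence the
+ // otherwise-empty 40 byte expected for the %sil case below, while %dl (also
+ // encodable without REX) gets none.)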
+ insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rbx, + dst: am1, + }, + "F0410FB09C9241010000", + "lock cmpxchgb %bl, 321(%r10,%rdx,4)", + )); + // Check redundant rex retention in 8-bit cases. + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rdx, + dst: am2.clone(), + }, + "F00FB094F1C7CFFFFF", + "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rsi, + dst: am2.clone(), + }, + "F0400FB0B4F1C7CFFFFF", + "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r10, + dst: am2.clone(), + }, + "F0440FB094F1C7CFFFFF", + "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r15, + dst: am2.clone(), + }, + "F0440FB0BCF1C7CFFFFF", + "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)", + )); + // 16 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: rsi, + dst: am2.clone(), + }, + "66F00FB1B4F1C7CFFFFF", + "lock cmpxchgw %si, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: r10, + dst: am2.clone(), + }, + "66F0440FB194F1C7CFFFFF", + "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)", + )); + // 32 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: rsi, + dst: am2.clone(), + }, + "F00FB1B4F1C7CFFFFF", + "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: r10, + dst: am2.clone(), + }, + "F0440FB194F1C7CFFFFF", + "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)", + )); + // 64 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: rsi, + dst: am2.clone(), + }, + "F0480FB1B4F1C7CFFFFF", + "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: r10, + dst: am2.clone(), + }, + "F04C0FB194F1C7CFFFFF", + "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)", + )); + + // AtomicRmwSeq + insns.push(( + Inst::AtomicRmwSeq { ty: types::I8, op: inst_common::AtomicRmwOp::Or, }, + "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF", + "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, }, + "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF", + "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, }, + "418B014989C34D89D3F0450FB1190F85EFFFFFFF", + "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, }, + "498B014989C34D01D3F04D0FB1190F85EFFFFFFF", + "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + + // Fence + insns.push(( + Inst::Fence { + kind: FenceKind::MFence, + }, + "0FAEF0", + "mfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::LFence, + }, + "0FAEE8", + "lfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::SFence, + }, + "0FAEF8", + "sfence", + )); + + // ======================================================== + // Misc instructions. 
+ + insns.push((Inst::Hlt, "CC", "hlt")); + + let trap_code = TrapCode::UnreachableCodeReached; + insns.push((Inst::Ud2 { trap_code }, "0F0B", "ud2 unreachable")); + + // ======================================================== + // Actually run the tests! + let flags = settings::Flags::new(settings::builder()); + + use crate::settings::Configurable; + let mut isa_flag_builder = x64::settings::builder(); + isa_flag_builder.enable("has_ssse3").unwrap(); + isa_flag_builder.enable("has_sse41").unwrap(); + let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); + + let rru = regs::create_reg_universe_systemv(&flags); + let emit_info = EmitInfo::new(flags, isa_flags); + for (insn, expected_encoding, expected_printing) in insns { + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + let mut sink = test_utils::TestCodeSink::new(); + let mut buffer = MachBuffer::new(); + + insn.emit(&mut buffer, &emit_info, &mut Default::default()); + + // Allow one label just after the instruction (so the offset is 0). + let label = buffer.get_label(); + buffer.bind_label(label); + + let buffer = buffer.finish(); + buffer.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs new file mode 100644 index 0000000000..1172b22eff --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs @@ -0,0 +1,2733 @@ +//! This module defines x86_64-specific machine instruction types. + +use crate::binemit::{CodeOffset, StackMap}; +use crate::ir::{types, ExternalName, Opcode, SourceLoc, TrapCode, Type}; +use crate::isa::x64::settings as x64_settings; +use crate::machinst::*; +use crate::{settings, settings::Flags, CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use regalloc::{ + PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector, + RegUsageMapper, SpillSlot, VirtualReg, Writable, +}; +use smallvec::SmallVec; +use std::fmt; +use std::string::{String, ToString}; + +pub mod args; +mod emit; +#[cfg(test)] +mod emit_tests; +pub mod regs; +pub mod unwind; + +use args::*; +use regs::{create_reg_universe_systemv, show_ireg_sized}; + +//============================================================================= +// Instructions (top level): definition + +// Don't build these directly. Instead use the Inst:: functions to create them. + +/// Instructions. Destinations are on the RIGHT (a la AT&T syntax). +#[derive(Clone)] +pub enum Inst { + /// Nops of various sizes, including zero. + Nop { len: u8 }, + + // ===================================== + // Integer instructions. + /// Integer arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg + AluRmiR { + is_64: bool, + op: AluRmiROpcode, + src: RegMemImm, + dst: Writable<Reg>, + }, + + /// Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. 
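+ // ("Defines" is meant in the register-allocator sense: `x64_get_regs` below records
+ // `dst` of this instruction as a def (a fresh value) rather than as a mod
+ // (read-modify-write), in contrast with `AluRmiR` above.)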
+ UnaryRmR { + size: u8, // 2, 4 or 8 + op: UnaryRmROpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Bitwise not + Not { + size: u8, // 1, 2, 4 or 8 + src: Writable<Reg>, + }, + + /// Integer negation + Neg { + size: u8, // 1, 2, 4 or 8 + src: Writable<Reg>, + }, + + /// Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) + Div { + size: u8, // 1, 2, 4 or 8 + signed: bool, + divisor: RegMem, + }, + + /// The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. + MulHi { size: u8, signed: bool, rhs: RegMem }, + + /// A synthetic sequence to implement the right inline checks for remainder and division, + /// assuming the dividend is in %rax. + /// Puts the result back into %rax if is_div, %rdx if !is_div, to mimic what the div + /// instruction does. + /// The generated code sequence is described in the emit's function match arm for this + /// instruction. + /// + /// Note: %rdx is marked as modified by this instruction, to avoid an early clobber problem + /// with the temporary and divisor registers. Make sure to zero %rdx right before this + /// instruction, or you might run into regalloc failures where %rdx is live before its first + /// def! + CheckedDivOrRemSeq { + kind: DivOrRemKind, + size: u8, + /// The divisor operand. Note it's marked as modified so that it gets assigned a register + /// different from the temporary. + divisor: Writable<Reg>, + tmp: Option<Writable<Reg>>, + }, + + /// Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) + /// or al into ah: (cbw) + SignExtendData { + size: u8, // 1, 2, 4 or 8 + }, + + /// Constant materialization: (imm32 imm64) reg. + /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg32. + Imm { + dst_is_64: bool, + simm64: u64, + dst: Writable<Reg>, + }, + + /// GPR to GPR move: mov (64 32) reg reg. + MovRR { + is_64: bool, + src: Reg, + dst: Writable<Reg>, + }, + + /// Zero-extended loads, except for 64 bits: movz (bl bq wl wq lq) addr reg. + /// Note that the lq variant doesn't really exist since the default zero-extend rule makes it + /// unnecessary. For that case we emit the equivalent "movl AM, reg32". + MovzxRmR { + ext_mode: ExtMode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// A plain 64-bit integer load, since MovZX_RM_R can't represent that. + Mov64MR { + src: SyntheticAmode, + dst: Writable<Reg>, + }, + + /// Loads the memory address of addr into dst. + LoadEffectiveAddress { + addr: SyntheticAmode, + dst: Writable<Reg>, + }, + + /// Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. + MovsxRmR { + ext_mode: ExtMode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Integer stores: mov (b w l q) reg addr. + MovRM { + size: u8, // 1, 2, 4 or 8. + src: Reg, + dst: SyntheticAmode, + }, + + /// Arithmetic shifts: (shl shr sar) (b w l q) imm reg. + ShiftR { + size: u8, // 1, 2, 4 or 8 + kind: ShiftKind, + /// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl". + num_bits: Option<u8>, + dst: Writable<Reg>, + }, + + /// Arithmetic SIMD shifts. + XmmRmiReg { + opcode: SseOpcode, + src: RegMemImm, + dst: Writable<Reg>, + }, + + /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg. + CmpRmiR { + size: u8, // 1, 2, 4 or 8 + src: RegMemImm, + dst: Reg, + }, + + /// Materializes the requested condition code in the destination reg. + Setcc { cc: CC, dst: Writable<Reg> }, + + /// Integer conditional move. + /// Overwrites the destination register. + Cmove { + /// Possible values are 2, 4 or 8. Checked in the related factory. 
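+ // (1 is excluded because x86 has no 8-bit cmovcc encoding.)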
+ size: u8, + cc: CC, + src: RegMem, + dst: Writable<Reg>, + }, + + // ===================================== + // Stack manipulation. + /// pushq (reg addr imm) + Push64 { src: RegMemImm }, + + /// popq reg + Pop64 { dst: Writable<Reg> }, + + // ===================================== + // Floating-point operations. + /// XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg + XmmRmR { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, + /// etc. + /// + /// This differs from XMM_RM_R in that the dst register of XmmUnaryRmR is not used in the + /// computation of the instruction dst value and so does not have to be a previously valid + /// value. This is characteristic of mov instructions. + XmmUnaryRmR { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq + XmmMovRM { + op: SseOpcode, + src: Reg, + dst: SyntheticAmode, + }, + + /// XMM (vector) unary op (to move a constant value into an xmm register): movups + XmmLoadConst { + src: VCodeConstant, + dst: Writable<Reg>, + ty: Type, + }, + + /// XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si + XmmToGpr { + op: SseOpcode, + src: Reg, + dst: Writable<Reg>, + dst_size: OperandSize, + }, + + /// XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} + GprToXmm { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + src_size: OperandSize, + }, + + /// Converts an unsigned int64 to a float32/float64. + CvtUint64ToFloatSeq { + /// Is the target a 64-bits or 32-bits register? + to_f64: bool, + /// A copy of the source register, fed by lowering. It is marked as modified during + /// register allocation to make sure that the temporary registers differ from the src + /// register, since both registers are live at the same time in the generated code + /// sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr1: Writable<Reg>, + tmp_gpr2: Writable<Reg>, + }, + + /// Converts a scalar xmm to a signed int32/int64. + CvtFloatToSintSeq { + dst_size: OperandSize, + src_size: OperandSize, + is_saturating: bool, + /// A copy of the source register, fed by lowering. It is marked as modified during + /// register allocation to make sure that the temporary xmm register differs from the src + /// register, since both registers are live at the same time in the generated code + /// sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + }, + + /// Converts a scalar xmm to an unsigned int32/int64. + CvtFloatToUintSeq { + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + /// A copy of the source register, fed by lowering, reused as a temporary. It is marked as + /// modified during register allocation to make sure that the temporary xmm register + /// differs from the src register, since both registers are live at the same time in the + /// generated code sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + }, + + /// A sequence to compute min/max with the proper NaN semantics for xmm registers. + XmmMinMaxSeq { + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable<Reg>, + }, + + /// XMM (scalar) conditional move. + /// Overwrites the destination register if cc is set. + XmmCmove { + /// Whether the cmove is moving either 32 or 64 bits. 
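+ // (See the `XmmCmove` arm of `show_rru` below for the shape of the sequence: a
+ // conditional jump over a movss/movsd.)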
+ is_64: bool, + cc: CC, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. + XmmCmpRmR { + op: SseOpcode, + src: RegMem, + dst: Reg, + }, + + /// A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg + XmmRmRImm { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + imm: u8, + is64: bool, + }, + + // ===================================== + // Control flow instructions. + /// Direct call: call simm32. + CallKnown { + dest: ExternalName, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + }, + + /// Indirect call: callq (reg mem). + CallUnknown { + dest: RegMem, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + }, + + /// Return. + Ret, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder, + + /// Jump to a known target: jmp simm32. + JmpKnown { dst: MachLabel }, + + /// One-way conditional branch: jcond cond target. + /// + /// This instruction is useful when we have conditional jumps depending on more than two + /// conditions, see for instance the lowering of Brz/brnz with Fcmp inputs. + /// + /// A note of caution: in contexts where the branch target is another block, this has to be the + /// same successor as the one specified in the terminator branch of the current block. + /// Otherwise, this might confuse register allocation by creating new invisible edges. + JmpIf { cc: CC, taken: MachLabel }, + + /// Two-way conditional branch: jcond cond target target. + /// Emitted as a compound sequence; the MachBuffer will shrink it as appropriate. + JmpCond { + cc: CC, + taken: MachLabel, + not_taken: MachLabel, + }, + + /// Jump-table sequence, as one compound instruction (see note in lower.rs for rationale). + /// The generated code sequence is described in the emit's function match arm for this + /// instruction. + /// See comment in lowering about the temporaries signedness. + JmpTableSeq { + idx: Reg, + tmp1: Writable<Reg>, + tmp2: Writable<Reg>, + default_target: MachLabel, + targets: Vec<MachLabel>, + targets_for_term: Vec<MachLabel>, + }, + + /// Indirect jump: jmpq (reg mem). + JmpUnknown { target: RegMem }, + + /// Traps if the condition code is set. + TrapIf { cc: CC, trap_code: TrapCode }, + + /// A debug trap. + Hlt, + + /// An instruction that will always trigger the illegal instruction exception. + Ud2 { trap_code: TrapCode }, + + /// Loads an external symbol in a register, with a relocation: movabsq $name, dst + LoadExtName { + dst: Writable<Reg>, + name: Box<ExternalName>, + offset: i64, + }, + + // ===================================== + // Instructions pertaining to atomic memory accesses. + /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions: + /// + /// `dst` (read) address + /// `src` (read) replacement value + /// %rax (modified) in: expected value, out: value that was actually at `dst` + /// %rflags is written. Do not assume anything about it after the instruction. + /// + /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as + /// they were before. + LockCmpxchg { + ty: Type, // I8, I16, I32 or I64 + src: Reg, + dst: SyntheticAmode, + }, + + /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction. + /// This atomically modifies a value in memory and returns the old value. 
The sequence + /// consists of an initial "normal" load from `dst`, followed by a loop which computes the + /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native + /// instruction `lock cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful. + /// If there is no contention, there will be only one pass through the loop body. The + /// sequence does *not* perform any explicit memory fence instructions + /// (mfence/sfence/lfence). + /// + /// Note that the transaction is atomic in the sense that, as observed by some other thread, + /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense + /// of guaranteeing that no other thread writes to `dst` in between the initial load and the + /// CAS -- but that would cause the CAS to fail unless the other thread's last write before + /// the CAS wrote the same value that was already there. In other words, this + /// implementation suffers (unavoidably) from the A-B-A problem. + /// + /// This instruction sequence has fixed register uses as follows: + /// + /// %r9 (read) address + /// %r10 (read) second operand for `op` + /// %r11 (written) scratch reg; value afterwards has no meaning + /// %rax (written) the old value at %r9 + /// %rflags is written. Do not assume anything about it after the instruction. + AtomicRmwSeq { + ty: Type, // I8, I16, I32 or I64 + op: inst_common::AtomicRmwOp, + }, + + /// A memory fence (mfence, lfence or sfence). + Fence { kind: FenceKind }, + + // ===================================== + // Meta-instructions generating no code. + /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This + /// controls how MemArg::NominalSPOffset args are lowered. + VirtualSPOffsetAdj { offset: i64 }, + + /// Provides a way to tell the register allocator that the upcoming sequence of instructions + /// will overwrite `dst` so it should be considered as a `def`; use this with care. + /// + /// This is useful when we have a sequence of instructions whose register usages are nominally + /// `mod`s, but such that the combination of operations creates a result that is independent of + /// the initial register value. It's thus semantically a `def`, not a `mod`, when all the + /// instructions are taken together, so we want to ensure the register is defined (its + /// live-range starts) prior to the sequence to keep analyses happy. + /// + /// One alternative would be a compound instruction that somehow encapsulates the others and + /// reports its own `def`s/`use`s/`mod`s; this adds complexity (the instruction list is no + /// longer flat) and requires knowledge about semantics and initial-value independence anyway. + XmmUninitializedValue { dst: Writable<Reg> }, +} + +pub(crate) fn low32_will_sign_extend_to_64(x: u64) -> bool { + let xs = x as i64; + xs == ((xs << 32) >> 32) +} + +impl Inst { + fn isa_requirement(&self) -> Option<InstructionSet> { + match self { + // These instructions are part of SSE2, which is a basic requirement in Cranelift, and + // don't have to be checked. + Inst::AluRmiR { .. } + | Inst::AtomicRmwSeq { .. } + | Inst::CallKnown { .. } + | Inst::CallUnknown { .. } + | Inst::CheckedDivOrRemSeq { .. } + | Inst::Cmove { .. } + | Inst::CmpRmiR { .. } + | Inst::CvtFloatToSintSeq { .. } + | Inst::CvtFloatToUintSeq { .. } + | Inst::CvtUint64ToFloatSeq { .. } + | Inst::Div { .. } + | Inst::EpiloguePlaceholder + | Inst::Fence { .. } + | Inst::Hlt + | Inst::Imm { .. } + | Inst::JmpCond { .. } + | Inst::JmpIf { .. } + | Inst::JmpKnown { .. 
} + | Inst::JmpTableSeq { .. } + | Inst::JmpUnknown { .. } + | Inst::LoadEffectiveAddress { .. } + | Inst::LoadExtName { .. } + | Inst::LockCmpxchg { .. } + | Inst::Mov64MR { .. } + | Inst::MovRM { .. } + | Inst::MovRR { .. } + | Inst::MovsxRmR { .. } + | Inst::MovzxRmR { .. } + | Inst::MulHi { .. } + | Inst::Neg { .. } + | Inst::Not { .. } + | Inst::Nop { .. } + | Inst::Pop64 { .. } + | Inst::Push64 { .. } + | Inst::Ret + | Inst::Setcc { .. } + | Inst::ShiftR { .. } + | Inst::SignExtendData { .. } + | Inst::TrapIf { .. } + | Inst::Ud2 { .. } + | Inst::UnaryRmR { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::XmmCmove { .. } + | Inst::XmmCmpRmR { .. } + | Inst::XmmLoadConst { .. } + | Inst::XmmMinMaxSeq { .. } + | Inst::XmmUninitializedValue { .. } => None, + + // These use dynamic SSE opcodes. + Inst::GprToXmm { op, .. } + | Inst::XmmMovRM { op, .. } + | Inst::XmmRmiReg { opcode: op, .. } + | Inst::XmmRmR { op, .. } + | Inst::XmmRmRImm { op, .. } + | Inst::XmmToGpr { op, .. } + | Inst::XmmUnaryRmR { op, .. } => Some(op.available_from()), + } + } +} + +// Handy constructors for Insts. + +impl Inst { + pub(crate) fn nop(len: u8) -> Self { + debug_assert!(len <= 16); + Self::Nop { len } + } + + pub(crate) fn alu_rmi_r( + is_64: bool, + op: AluRmiROpcode, + src: RegMemImm, + dst: Writable<Reg>, + ) -> Self { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Self::AluRmiR { + is_64, + op, + src, + dst, + } + } + + pub(crate) fn unary_rm_r( + size: u8, + op: UnaryRmROpcode, + src: RegMem, + dst: Writable<Reg>, + ) -> Self { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2); + Self::UnaryRmR { size, op, src, dst } + } + + pub(crate) fn not(size: u8, src: Writable<Reg>) -> Inst { + debug_assert_eq!(src.to_reg().get_class(), RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Not { size, src } + } + + pub(crate) fn neg(size: u8, src: Writable<Reg>) -> Inst { + debug_assert_eq!(src.to_reg().get_class(), RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Neg { size, src } + } + + pub(crate) fn div(size: u8, signed: bool, divisor: RegMem) -> Inst { + divisor.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Div { + size, + signed, + divisor, + } + } + + pub(crate) fn mul_hi(size: u8, signed: bool, rhs: RegMem) -> Inst { + rhs.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::MulHi { size, signed, rhs } + } + + pub(crate) fn checked_div_or_rem_seq( + kind: DivOrRemKind, + size: u8, + divisor: Writable<Reg>, + tmp: Option<Writable<Reg>>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(divisor.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp + .map(|tmp| tmp.to_reg().get_class() == RegClass::I64) + .unwrap_or(true)); + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + tmp, + } + } + + pub(crate) fn sign_extend_data(size: u8) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::SignExtendData { size } + } + + pub(crate) fn imm(size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + // Try to generate a 32-bit immediate when the upper high bits are zeroed (which matches + // the semantics 
of movl). + let dst_is_64 = size == OperandSize::Size64 && simm64 > u32::max_value() as u64; + Inst::Imm { + dst_is_64, + simm64, + dst, + } + } + + pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst { + debug_assert!(src.get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovRR { is_64, src, dst } + } + + // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level) + pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUnaryRmR { op, src, dst } + } + + pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + debug_assert!(ty.is_vector() && ty.bits() == 128); + Inst::XmmLoadConst { src, dst, ty } + } + + /// Convenient helper for unary float operations. + pub(crate) fn xmm_unary_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUnaryRmR { op, src, dst } + } + + pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmRmR { op, src, dst } + } + + pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self { + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUninitializedValue { dst } + } + + pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst { + debug_assert!(src.get_class() == RegClass::V128); + Inst::XmmMovRM { + op, + src, + dst: dst.into(), + } + } + + pub(crate) fn xmm_to_gpr( + op: SseOpcode, + src: Reg, + dst: Writable<Reg>, + dst_size: OperandSize, + ) -> Inst { + debug_assert!(src.get_class() == RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } + } + + pub(crate) fn gpr_to_xmm( + op: SseOpcode, + src: RegMem, + src_size: OperandSize, + dst: Writable<Reg>, + ) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::GprToXmm { + op, + src, + dst, + src_size, + } + } + + pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.get_class() == RegClass::V128); + Inst::XmmCmpRmR { op, src, dst } + } + + pub(crate) fn cvt_u64_to_float_seq( + to_f64: bool, + src: Writable<Reg>, + tmp_gpr1: Writable<Reg>, + tmp_gpr2: Writable<Reg>, + dst: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp_gpr1.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp_gpr2.to_reg().get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::CvtUint64ToFloatSeq { + src, + dst, + tmp_gpr1, + tmp_gpr2, + to_f64, + } + } + + pub(crate) fn cvt_float_to_sint_seq( + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); + 
debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::CvtFloatToSintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } + } + + pub(crate) fn cvt_float_to_uint_seq( + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::CvtFloatToUintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } + } + + pub(crate) fn xmm_min_max_seq( + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable<Reg>, + ) -> Inst { + debug_assert_eq!(lhs.get_class(), RegClass::V128); + debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128); + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } + } + + pub(crate) fn xmm_rm_r_imm( + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + imm: u8, + is64: bool, + ) -> Inst { + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64, + } + } + + pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovzxRmR { ext_mode, src, dst } + } + + pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmRmiReg { opcode, src, dst } + } + + pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovsxRmR { ext_mode, src, dst } + } + + pub(crate) fn mov64_m_r(src: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Mov64MR { + src: src.into(), + dst, + } + } + + /// A convenience function to be able to use a RegMem as the source of a move. + pub(crate) fn mov64_rm_r(src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + match src { + RegMem::Reg { reg } => Self::mov_r_r(true, reg, dst), + RegMem::Mem { addr } => Self::mov64_m_r(addr, dst), + } + } + + pub(crate) fn mov_r_m( + size: u8, // 1, 2, 4 or 8 + src: Reg, + dst: impl Into<SyntheticAmode>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(src.get_class() == RegClass::I64); + Inst::MovRM { + size, + src, + dst: dst.into(), + } + } + + pub(crate) fn lea(addr: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::LoadEffectiveAddress { + addr: addr.into(), + dst, + } + } + + pub(crate) fn shift_r( + size: u8, + kind: ShiftKind, + num_bits: Option<u8>, + dst: Writable<Reg>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(if let Some(num_bits) = num_bits { + num_bits < size * 8 + } else { + true + }); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } + } + + /// Does a comparison of dst - src for operands of size `size`, as stated by the machine + /// instruction semantics. 
Be careful with the order of parameters! + pub(crate) fn cmp_rmi_r( + size: u8, // 1, 2, 4 or 8 + src: RegMemImm, + dst: Reg, + ) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(dst.get_class() == RegClass::I64); + Inst::CmpRmiR { size, src, dst } + } + + pub(crate) fn trap(trap_code: TrapCode) -> Inst { + Inst::Ud2 { + trap_code: trap_code, + } + } + + pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Setcc { cc, dst } + } + + pub(crate) fn cmove(size: u8, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Cmove { size, cc, src, dst } + } + + pub(crate) fn xmm_cmove(is_64: bool, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } + } + + pub(crate) fn push64(src: RegMemImm) -> Inst { + src.assert_regclass_is(RegClass::I64); + Inst::Push64 { src } + } + + pub(crate) fn pop64(dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Pop64 { dst } + } + + pub(crate) fn call_known( + dest: ExternalName, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + ) -> Inst { + Inst::CallKnown { + dest, + uses, + defs, + opcode, + } + } + + pub(crate) fn call_unknown( + dest: RegMem, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + ) -> Inst { + dest.assert_regclass_is(RegClass::I64); + Inst::CallUnknown { + dest, + uses, + defs, + opcode, + } + } + + pub(crate) fn ret() -> Inst { + Inst::Ret + } + + pub(crate) fn epilogue_placeholder() -> Inst { + Inst::EpiloguePlaceholder + } + + pub(crate) fn jmp_known(dst: MachLabel) -> Inst { + Inst::JmpKnown { dst } + } + + pub(crate) fn jmp_if(cc: CC, taken: MachLabel) -> Inst { + Inst::JmpIf { cc, taken } + } + + pub(crate) fn jmp_cond(cc: CC, taken: MachLabel, not_taken: MachLabel) -> Inst { + Inst::JmpCond { + cc, + taken, + not_taken, + } + } + + pub(crate) fn jmp_unknown(target: RegMem) -> Inst { + target.assert_regclass_is(RegClass::I64); + Inst::JmpUnknown { target } + } + + pub(crate) fn trap_if(cc: CC, trap_code: TrapCode) -> Inst { + Inst::TrapIf { cc, trap_code } + } + + /// Choose which instruction to use for loading a register value from memory. For loads smaller + /// than 64 bits, this method expects a way to extend the value (i.e. [ExtKind::SignExtend], + /// [ExtKind::ZeroExtend]); loads with no extension necessary will ignore this. + pub(crate) fn load( + ty: Type, + from_addr: impl Into<SyntheticAmode>, + to_reg: Writable<Reg>, + ext_kind: ExtKind, + ) -> Inst { + let rc = to_reg.to_reg().get_class(); + match rc { + RegClass::I64 => { + let ext_mode = match ty.bytes() { + 1 => Some(ExtMode::BQ), + 2 => Some(ExtMode::WQ), + 4 => Some(ExtMode::LQ), + 8 => None, + _ => unreachable!("the type should never use a scalar load: {}", ty), + }; + if let Some(ext_mode) = ext_mode { + // Values smaller than 64 bits must be extended in some way. 
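+ // For example, a 2-byte zero-extending load selects ExtMode::WQ and a movzx
+ // form, while a 4-byte zero-extending load (ExtMode::LQ) is handled as the plain
+ // `movl` described in the `MovzxRmR` comment, since writing a 32-bit register
+ // already clears the upper half.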
+ match ext_kind { + ExtKind::SignExtend => { + Inst::movsx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg) + } + ExtKind::ZeroExtend => { + Inst::movzx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg) + } + ExtKind::None => panic!( + "expected an extension kind for extension mode: {:?}", + ext_mode + ), + } + } else { + // 64-bit values can be moved directly. + Inst::mov64_m_r(from_addr, to_reg) + } + } + RegClass::V128 => { + let opcode = match ty { + types::F32 => SseOpcode::Movss, + types::F64 => SseOpcode::Movsd, + types::F32X4 => SseOpcode::Movups, + types::F64X2 => SseOpcode::Movupd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ => unimplemented!("unable to load type: {}", ty), + }; + Inst::xmm_unary_rm_r(opcode, RegMem::mem(from_addr), to_reg) + } + _ => panic!("unable to generate load for register class: {:?}", rc), + } + } + + /// Choose which instruction to use for storing a register value to memory. + pub(crate) fn store(ty: Type, from_reg: Reg, to_addr: impl Into<SyntheticAmode>) -> Inst { + let rc = from_reg.get_class(); + match rc { + RegClass::I64 => { + // Always store the full register, to ensure that the high bits are properly set + // when doing a full reload. + Inst::mov_r_m(8 /* bytes */, from_reg, to_addr) + } + RegClass::V128 => { + let opcode = match ty { + types::F32 => SseOpcode::Movss, + types::F64 => SseOpcode::Movsd, + types::F32X4 => SseOpcode::Movups, + types::F64X2 => SseOpcode::Movupd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ => unimplemented!("unable to store type: {}", ty), + }; + Inst::xmm_mov_r_m(opcode, from_reg, to_addr) + } + _ => panic!("unable to generate store for register class: {:?}", rc), + } + } +} + +// Inst helpers. + +impl Inst { + /// In certain cases, instructions of this format can act as a definition of an XMM register, + /// producing a value that is independent of its initial value. + /// + /// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to + /// itself will generate all ones as a result, regardless of its value. From the register + /// allocator's point of view, we should (i) record the first register, which is normally a + /// mod, as a def instead; and (ii) not record the second register as a use, because it is the + /// same as the first register (already handled). + fn produces_const(&self) -> bool { + match self { + Self::AluRmiR { op, src, dst, .. } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub) + } + + Self::XmmRmR { op, src, dst, .. } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == SseOpcode::Xorps + || *op == SseOpcode::Xorpd + || *op == SseOpcode::Pxor + || *op == SseOpcode::Pcmpeqb + || *op == SseOpcode::Pcmpeqw + || *op == SseOpcode::Pcmpeqd + || *op == SseOpcode::Pcmpeqq) + } + + Self::XmmRmRImm { + op, src, dst, imm, .. + } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps) + && *imm == FcmpImm::Equal.encode() + } + + _ => false, + } + } + + /// Choose which instruction to use for comparing two values for equality. 
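+ // (When `from` is the same register as `to`, these comparisons produce all ones
+ // regardless of the register's prior contents, which is exactly the pattern
+ // `produces_const` above recognizes.)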
+ pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to), + types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to), + types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to), + types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to), + types::F32X4 => { + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false) + } + types::F64X2 => { + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false) + } + _ => unimplemented!("unimplemented type for Inst::equals: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise AND on two values. + pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to), + _ => unimplemented!("unimplemented type for Inst::and: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise AND NOT on two values. + pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to), + _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise OR on two values. + pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to), + _ => unimplemented!("unimplemented type for Inst::or: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise XOR on two values. 
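+ // (XORing a register with itself always yields zero, the usual zeroing idiom, which
+ // is why `produces_const` above treats a self-XOR as a plain def of `dst`.)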
+ pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to), + _ => unimplemented!("unimplemented type for Inst::xor: {}", ty), + } + } +} + +//============================================================================= +// Instructions: printing + +impl PrettyPrint for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + fn ljustify(s: String) -> String { + let w = 7; + if s.len() >= w { + s + } else { + let need = usize::min(w, w - s.len()); + s + &format!("{nil: <width$}", nil = "", width = need) + } + } + + fn ljustify2(s1: String, s2: String) -> String { + ljustify(s1 + &s2) + } + + fn suffix_lq(is_64: bool) -> String { + (if is_64 { "q" } else { "l" }).to_string() + } + + fn size_lq(is_64: bool) -> u8 { + if is_64 { + 8 + } else { + 4 + } + } + + fn suffix_bwlq(size: u8) -> String { + match size { + 1 => "b".to_string(), + 2 => "w".to_string(), + 4 => "l".to_string(), + 8 => "q".to_string(), + _ => panic!("Inst(x64).show.suffixBWLQ: size={}", size), + } + } + + match self { + Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len), + + Inst::AluRmiR { + is_64, + op, + src, + dst, + } => format!( + "{} {}, {}", + ljustify2(op.to_string(), suffix_lq(*is_64)), + src.show_rru_sized(mb_rru, size_lq(*is_64)), + show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)), + ), + + Inst::UnaryRmR { src, dst, op, size } => format!( + "{} {}, {}", + ljustify2(op.to_string(), suffix_bwlq(*size)), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(dst.to_reg(), mb_rru, *size), + ), + + Inst::Not { size, src } => format!( + "{} {}", + ljustify2("not".to_string(), suffix_bwlq(*size)), + show_ireg_sized(src.to_reg(), mb_rru, *size) + ), + + Inst::Neg { size, src } => format!( + "{} {}", + ljustify2("neg".to_string(), suffix_bwlq(*size)), + show_ireg_sized(src.to_reg(), mb_rru, *size) + ), + + Inst::Div { + size, + signed, + divisor, + .. + } => format!( + "{} {}", + ljustify(if *signed { + "idiv".to_string() + } else { + "div".into() + }), + divisor.show_rru_sized(mb_rru, *size) + ), + + Inst::MulHi { + size, signed, rhs, .. + } => format!( + "{} {}", + ljustify(if *signed { + "imul".to_string() + } else { + "mul".to_string() + }), + rhs.show_rru_sized(mb_rru, *size) + ), + + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + .. + } => format!( + "{} $rax:$rdx, {}", + match kind { + DivOrRemKind::SignedDiv => "sdiv", + DivOrRemKind::UnsignedDiv => "udiv", + DivOrRemKind::SignedRem => "srem", + DivOrRemKind::UnsignedRem => "urem", + }, + show_ireg_sized(divisor.to_reg(), mb_rru, *size), + ), + + Inst::SignExtendData { size } => match size { + 1 => "cbw", + 2 => "cwd", + 4 => "cdq", + 8 => "cqo", + _ => unreachable!(), + } + .into(), + + Inst::XmmUnaryRmR { op, src, dst, .. } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, op.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmMovRM { op, src, dst, .. } => format!( + "{} {}, {}", + ljustify(op.to_string()), + show_ireg_sized(*src, mb_rru, 8), + dst.show_rru(mb_rru), + ), + + Inst::XmmRmR { op, src, dst, .. 
} => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmMinMaxSeq { + lhs, + rhs_dst, + is_min, + size, + } => format!( + "{} {}, {}", + ljustify2( + if *is_min { + "xmm min seq ".to_string() + } else { + "xmm max seq ".to_string() + }, + match size { + OperandSize::Size32 => "f32", + OperandSize::Size64 => "f64", + } + .into() + ), + show_ireg_sized(*lhs, mb_rru, 8), + show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!( + "{} ${}, {}, {}", + ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })), + imm, + src.show_rru(mb_rru), + dst.show_rru(mb_rru), + ), + + Inst::XmmUninitializedValue { dst } => format!( + "{} {}", + ljustify("uninit".into()), + dst.show_rru(mb_rru), + ), + + Inst::XmmLoadConst { src, dst, .. } => { + format!("load_const {:?}, {}", src, dst.show_rru(mb_rru),) + } + + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } => { + let dst_size = match dst_size { + OperandSize::Size32 => 4, + OperandSize::Size64 => 8, + }; + format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru(mb_rru), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size), + ) + } + + Inst::GprToXmm { + op, + src, + src_size, + dst, + } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, src_size.to_bytes()), + dst.show_rru(mb_rru) + ), + + Inst::XmmCmpRmR { op, src, dst } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, 8), + show_ireg_sized(*dst, mb_rru, 8), + ), + + Inst::CvtUint64ToFloatSeq { + src, dst, to_f64, .. + } => format!( + "{} {}, {}", + ljustify(format!( + "u64_to_{}_seq", + if *to_f64 { "f64" } else { "f32" } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + dst.show_rru(mb_rru), + ), + + Inst::CvtFloatToSintSeq { + src, + dst, + src_size, + dst_size, + .. + } => format!( + "{} {}, {}", + ljustify(format!( + "cvt_float{}_to_sint{}_seq", + if *src_size == OperandSize::Size64 { + "64" + } else { + "32" + }, + if *dst_size == OperandSize::Size64 { + "64" + } else { + "32" + } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()), + ), + + Inst::CvtFloatToUintSeq { + src, + dst, + src_size, + dst_size, + .. + } => format!( + "{} {}, {}", + ljustify(format!( + "cvt_float{}_to_uint{}_seq", + if *src_size == OperandSize::Size64 { + "64" + } else { + "32" + }, + if *dst_size == OperandSize::Size64 { + "64" + } else { + "32" + } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()), + ), + + Inst::Imm { + dst_is_64, + simm64, + dst, + } => { + if *dst_is_64 { + format!( + "{} ${}, {}", + ljustify("movabsq".to_string()), + *simm64 as i64, + show_ireg_sized(dst.to_reg(), mb_rru, 8) + ) + } else { + format!( + "{} ${}, {}", + ljustify("movl".to_string()), + (*simm64 as u32) as i32, + show_ireg_sized(dst.to_reg(), mb_rru, 4) + ) + } + } + + Inst::MovRR { is_64, src, dst } => format!( + "{} {}, {}", + ljustify2("mov".to_string(), suffix_lq(*is_64)), + show_ireg_sized(*src, mb_rru, size_lq(*is_64)), + show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)) + ), + + Inst::MovzxRmR { + ext_mode, src, dst, .. 
+ } => { + if *ext_mode == ExtMode::LQ { + format!( + "{} {}, {}", + ljustify("movl".to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, 4) + ) + } else { + format!( + "{} {}, {}", + ljustify2("movz".to_string(), ext_mode.to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size()) + ) + } + } + + Inst::Mov64MR { src, dst, .. } => format!( + "{} {}, {}", + ljustify("movq".to_string()), + src.show_rru(mb_rru), + dst.show_rru(mb_rru) + ), + + Inst::LoadEffectiveAddress { addr, dst } => format!( + "{} {}, {}", + ljustify("lea".to_string()), + addr.show_rru(mb_rru), + dst.show_rru(mb_rru) + ), + + Inst::MovsxRmR { + ext_mode, src, dst, .. + } => format!( + "{} {}, {}", + ljustify2("movs".to_string(), ext_mode.to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size()) + ), + + Inst::MovRM { size, src, dst, .. } => format!( + "{} {}, {}", + ljustify2("mov".to_string(), suffix_bwlq(*size)), + show_ireg_sized(*src, mb_rru, *size), + dst.show_rru(mb_rru) + ), + + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } => match num_bits { + None => format!( + "{} %cl, {}", + ljustify2(kind.to_string(), suffix_bwlq(*size)), + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + + Some(num_bits) => format!( + "{} ${}, {}", + ljustify2(kind.to_string(), suffix_bwlq(*size)), + num_bits, + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + }, + + Inst::XmmRmiReg { opcode, src, dst } => format!( + "{} {}, {}", + ljustify(opcode.to_string()), + src.show_rru(mb_rru), + dst.to_reg().show_rru(mb_rru) + ), + + Inst::CmpRmiR { size, src, dst } => format!( + "{} {}, {}", + ljustify2("cmp".to_string(), suffix_bwlq(*size)), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(*dst, mb_rru, *size) + ), + + Inst::Setcc { cc, dst } => format!( + "{} {}", + ljustify2("set".to_string(), cc.to_string()), + show_ireg_sized(dst.to_reg(), mb_rru, 1) + ), + + Inst::Cmove { size, cc, src, dst } => format!( + "{} {}, {}", + ljustify(format!("cmov{}{}", cc.to_string(), suffix_bwlq(*size))), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } => { + let size = if *is_64 { 8 } else { 4 }; + format!( + "j{} $next; mov{} {}, {}; $next: ", + cc.invert().to_string(), + if *is_64 { "sd" } else { "ss" }, + src.show_rru_sized(mb_rru, size), + show_ireg_sized(dst.to_reg(), mb_rru, size) + ) + } + + Inst::Push64 { src } => { + format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru)) + } + + Inst::Pop64 { dst } => { + format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru)) + } + + Inst::CallKnown { dest, .. } => format!("{} {:?}", ljustify("call".to_string()), dest), + + Inst::CallUnknown { dest, .. } => format!( + "{} *{}", + ljustify("call".to_string()), + dest.show_rru(mb_rru) + ), + + Inst::Ret => "ret".to_string(), + + Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), + + Inst::JmpKnown { dst } => { + format!("{} {}", ljustify("jmp".to_string()), dst.to_string()) + } + + Inst::JmpIf { cc, taken } => format!( + "{} {}", + ljustify2("j".to_string(), cc.to_string()), + taken.to_string(), + ), + + Inst::JmpCond { + cc, + taken, + not_taken, + } => format!( + "{} {}; j {}", + ljustify2("j".to_string(), cc.to_string()), + taken.to_string(), + not_taken.to_string() + ), + + Inst::JmpTableSeq { idx, .. 
} => { + format!("{} {}", ljustify("br_table".into()), idx.show_rru(mb_rru)) + } + + Inst::JmpUnknown { target } => format!( + "{} *{}", + ljustify("jmp".to_string()), + target.show_rru(mb_rru) + ), + + Inst::TrapIf { cc, trap_code, .. } => { + format!("j{} ; ud2 {} ;", cc.invert().to_string(), trap_code) + } + + Inst::LoadExtName { + dst, name, offset, .. + } => format!( + "{} {}+{}, {}", + ljustify("movaps".into()), + name, + offset, + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::LockCmpxchg { ty, src, dst, .. } => { + let size = ty.bytes() as u8; + format!("lock cmpxchg{} {}, {}", + suffix_bwlq(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru)) + } + + Inst::AtomicRmwSeq { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}", + ty.bits(), op) + }, + + Inst::Fence { kind } => { + match kind { + FenceKind::MFence => "mfence".to_string(), + FenceKind::LFence => "lfence".to_string(), + FenceKind::SFence => "sfence".to_string(), + } + } + + Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), + + Inst::Hlt => "hlt".into(), + + Inst::Ud2 { trap_code } => format!("ud2 {}", trap_code), + } + } +} + +// Temp hook for legacy printing machinery +impl fmt::Debug for Inst { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + // Print the insn without a Universe :-( + write!(fmt, "{}", self.show_rru(None)) + } +} + +fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + // This is a bit subtle. If some register is in the modified set, then it may not be in either + // the use or def sets. However, enforcing that directly is somewhat difficult. Instead, + // regalloc.rs will "fix" this for us by removing the the modified set from the use and def + // sets. + match inst { + Inst::AluRmiR { src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::Not { src, .. } => { + collector.add_mod(*src); + } + Inst::Neg { src, .. } => { + collector.add_mod(*src); + } + Inst::Div { size, divisor, .. } => { + collector.add_mod(Writable::from_reg(regs::rax())); + if *size == 1 { + collector.add_def(Writable::from_reg(regs::rdx())); + } else { + collector.add_mod(Writable::from_reg(regs::rdx())); + } + divisor.get_regs_as_uses(collector); + } + Inst::MulHi { rhs, .. } => { + collector.add_mod(Writable::from_reg(regs::rax())); + collector.add_def(Writable::from_reg(regs::rdx())); + rhs.get_regs_as_uses(collector); + } + Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => { + // Mark both fixed registers as mods, to avoid an early clobber problem in codegen + // (i.e. the temporary is allocated one of the fixed registers). This requires writing + // the rdx register *before* the instruction, which is not too bad. + collector.add_mod(Writable::from_reg(regs::rax())); + collector.add_mod(Writable::from_reg(regs::rdx())); + collector.add_mod(*divisor); + if let Some(tmp) = tmp { + collector.add_def(*tmp); + } + } + Inst::SignExtendData { size } => match size { + 1 => collector.add_mod(Writable::from_reg(regs::rax())), + 2 | 4 | 8 => { + collector.add_use(regs::rax()); + collector.add_def(Writable::from_reg(regs::rdx())); + } + _ => unreachable!(), + }, + Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. 
} => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::XmmRmR { src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::XmmRmRImm { op, src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::XmmUninitializedValue { dst } => collector.add_def(*dst), + Inst::XmmLoadConst { dst, .. } => collector.add_def(*dst), + Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => { + collector.add_use(*lhs); + collector.add_mod(*rhs_dst); + } + Inst::XmmRmiReg { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + Inst::XmmMovRM { src, dst, .. } => { + collector.add_use(*src); + dst.get_regs_as_uses(collector); + } + Inst::XmmCmpRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_use(*dst); + } + Inst::Imm { dst, .. } => { + collector.add_def(*dst); + } + Inst::MovRR { src, dst, .. } | Inst::XmmToGpr { src, dst, .. } => { + collector.add_use(*src); + collector.add_def(*dst); + } + Inst::GprToXmm { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::CvtUint64ToFloatSeq { + src, + dst, + tmp_gpr1, + tmp_gpr2, + .. + } => { + collector.add_mod(*src); + collector.add_def(*dst); + collector.add_def(*tmp_gpr1); + collector.add_def(*tmp_gpr2); + } + Inst::CvtFloatToSintSeq { + src, + dst, + tmp_xmm, + tmp_gpr, + .. + } + | Inst::CvtFloatToUintSeq { + src, + dst, + tmp_gpr, + tmp_xmm, + .. + } => { + collector.add_mod(*src); + collector.add_def(*dst); + collector.add_def(*tmp_gpr); + collector.add_def(*tmp_xmm); + } + Inst::MovzxRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst) + } + Inst::MovsxRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::MovRM { src, dst, .. } => { + collector.add_use(*src); + dst.get_regs_as_uses(collector); + } + Inst::ShiftR { num_bits, dst, .. } => { + if num_bits.is_none() { + collector.add_use(regs::rcx()); + } + collector.add_mod(*dst); + } + Inst::CmpRmiR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_use(*dst); // yes, really `add_use` + } + Inst::Setcc { dst, .. } => { + collector.add_def(*dst); + } + Inst::Cmove { src, dst, .. } | Inst::XmmCmove { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + Inst::Push64 { src } => { + src.get_regs_as_uses(collector); + collector.add_mod(Writable::from_reg(regs::rsp())); + } + Inst::Pop64 { dst } => { + collector.add_def(*dst); + } + + Inst::CallKnown { + ref uses, ref defs, .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + } + + Inst::CallUnknown { + ref uses, + ref defs, + dest, + .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + dest.get_regs_as_uses(collector); + } + + Inst::JmpTableSeq { + ref idx, + ref tmp1, + ref tmp2, + .. 
+ } => { + collector.add_use(*idx); + collector.add_def(*tmp1); + collector.add_def(*tmp2); + } + + Inst::JmpUnknown { target } => { + target.get_regs_as_uses(collector); + } + + Inst::LoadExtName { dst, .. } => { + collector.add_def(*dst); + } + + Inst::LockCmpxchg { src, dst, .. } => { + dst.get_regs_as_uses(collector); + collector.add_use(*src); + collector.add_mod(Writable::from_reg(regs::rax())); + } + + Inst::AtomicRmwSeq { .. } => { + collector.add_use(regs::r9()); + collector.add_use(regs::r10()); + collector.add_def(Writable::from_reg(regs::r11())); + collector.add_def(Writable::from_reg(regs::rax())); + } + + Inst::Ret + | Inst::EpiloguePlaceholder + | Inst::JmpKnown { .. } + | Inst::JmpIf { .. } + | Inst::JmpCond { .. } + | Inst::Nop { .. } + | Inst::TrapIf { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::Hlt + | Inst::Ud2 { .. } + | Inst::Fence { .. } => { + // No registers are used. + } + } +} + +//============================================================================= +// Instructions and subcomponents: map_regs + +fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) { + if let Some(reg) = r.as_virtual_reg() { + let new = m.get_use(reg).unwrap().to_reg(); + *r = new; + } +} + +fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if let Some(reg) = r.to_reg().as_virtual_reg() { + let new = m.get_def(reg).unwrap().to_reg(); + *r = Writable::from_reg(new); + } +} + +fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if let Some(reg) = r.to_reg().as_virtual_reg() { + let new = m.get_mod(reg).unwrap().to_reg(); + *r = Writable::from_reg(new); + } +} + +impl Amode { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + Amode::ImmReg { ref mut base, .. } => map_use(map, base), + Amode::ImmRegRegShift { + ref mut base, + ref mut index, + .. + } => { + map_use(map, base); + map_use(map, index); + } + Amode::RipRelative { .. } => { + // RIP isn't involved in regalloc. + } + } + } +} + +impl RegMemImm { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + RegMemImm::Reg { ref mut reg } => map_use(map, reg), + RegMemImm::Mem { ref mut addr } => addr.map_uses(map), + RegMemImm::Imm { .. } => {} + } + } + + fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + match self { + Self::Reg { reg } => { + let mut writable_src = Writable::from_reg(*reg); + map_def(mapper, &mut writable_src); + *self = Self::reg(writable_src.to_reg()); + } + _ => panic!("unexpected RegMemImm kind in map_src_reg_as_def"), + } + } +} + +impl RegMem { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + RegMem::Reg { ref mut reg } => map_use(map, reg), + RegMem::Mem { ref mut addr, .. } => addr.map_uses(map), + } + } + + fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + match self { + Self::Reg { reg } => { + let mut writable_src = Writable::from_reg(*reg); + map_def(mapper, &mut writable_src); + *self = Self::reg(writable_src.to_reg()); + } + _ => panic!("unexpected RegMem kind in map_src_reg_as_def"), + } + } +} + +fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) { + // Note this must be carefully synchronized with x64_get_regs. + let produces_const = inst.produces_const(); + + match inst { + // ** Nop + Inst::AluRmiR { + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::Not { src, .. } | Inst::Neg { src, .. 
} => map_mod(mapper, src), + Inst::Div { divisor, .. } => divisor.map_uses(mapper), + Inst::MulHi { rhs, .. } => rhs.map_uses(mapper), + Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => { + map_mod(mapper, divisor); + if let Some(tmp) = tmp { + map_def(mapper, tmp) + } + } + Inst::SignExtendData { .. } => {} + Inst::XmmUnaryRmR { + ref mut src, + ref mut dst, + .. + } + | Inst::UnaryRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::XmmRmRImm { + ref op, + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.map_uses(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::XmmRmR { + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::XmmRmiReg { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_mod(mapper, dst); + } + Inst::XmmUninitializedValue { ref mut dst, .. } => { + map_def(mapper, dst); + } + Inst::XmmLoadConst { ref mut dst, .. } => { + map_def(mapper, dst); + } + Inst::XmmMinMaxSeq { + ref mut lhs, + ref mut rhs_dst, + .. + } => { + map_use(mapper, lhs); + map_mod(mapper, rhs_dst); + } + Inst::XmmMovRM { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + Inst::XmmCmpRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_use(mapper, dst); + } + Inst::Imm { ref mut dst, .. } => map_def(mapper, dst), + Inst::MovRR { + ref mut src, + ref mut dst, + .. + } + | Inst::XmmToGpr { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + map_def(mapper, dst); + } + Inst::GprToXmm { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::CvtUint64ToFloatSeq { + ref mut src, + ref mut dst, + ref mut tmp_gpr1, + ref mut tmp_gpr2, + .. + } => { + map_mod(mapper, src); + map_def(mapper, dst); + map_def(mapper, tmp_gpr1); + map_def(mapper, tmp_gpr2); + } + Inst::CvtFloatToSintSeq { + ref mut src, + ref mut dst, + ref mut tmp_xmm, + ref mut tmp_gpr, + .. + } + | Inst::CvtFloatToUintSeq { + ref mut src, + ref mut dst, + ref mut tmp_gpr, + ref mut tmp_xmm, + .. + } => { + map_mod(mapper, src); + map_def(mapper, dst); + map_def(mapper, tmp_gpr); + map_def(mapper, tmp_xmm); + } + Inst::MovzxRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::MovsxRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::MovRM { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + Inst::ShiftR { ref mut dst, .. } => { + map_mod(mapper, dst); + } + Inst::CmpRmiR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_use(mapper, dst); + } + Inst::Setcc { ref mut dst, .. } => map_def(mapper, dst), + Inst::Cmove { + ref mut src, + ref mut dst, + .. + } + | Inst::XmmCmove { + ref mut src, + ref mut dst, + .. 
+ } => { + src.map_uses(mapper); + map_mod(mapper, dst) + } + Inst::Push64 { ref mut src } => src.map_uses(mapper), + Inst::Pop64 { ref mut dst } => { + map_def(mapper, dst); + } + + Inst::CallKnown { + ref mut uses, + ref mut defs, + .. + } => { + for r in uses.iter_mut() { + map_use(mapper, r); + } + for r in defs.iter_mut() { + map_def(mapper, r); + } + } + + Inst::CallUnknown { + ref mut uses, + ref mut defs, + ref mut dest, + .. + } => { + for r in uses.iter_mut() { + map_use(mapper, r); + } + for r in defs.iter_mut() { + map_def(mapper, r); + } + dest.map_uses(mapper); + } + + Inst::JmpTableSeq { + ref mut idx, + ref mut tmp1, + ref mut tmp2, + .. + } => { + map_use(mapper, idx); + map_def(mapper, tmp1); + map_def(mapper, tmp2); + } + + Inst::JmpUnknown { ref mut target } => target.map_uses(mapper), + + Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst), + + Inst::LockCmpxchg { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + + Inst::Ret + | Inst::EpiloguePlaceholder + | Inst::JmpKnown { .. } + | Inst::JmpCond { .. } + | Inst::JmpIf { .. } + | Inst::Nop { .. } + | Inst::TrapIf { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::Ud2 { .. } + | Inst::Hlt + | Inst::AtomicRmwSeq { .. } + | Inst::Fence { .. } => { + // Instruction doesn't explicitly mention any regs, so it can't have any virtual + // regs that we'd need to remap. Hence no action required. + } + } +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + fn get_regs(&self, collector: &mut RegUsageCollector) { + x64_get_regs(&self, collector) + } + + fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + x64_map_regs(self, mapper); + } + + fn is_move(&self) -> Option<(Writable<Reg>, Reg)> { + match self { + // Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes + // out the upper 32 bits of the destination. For example, we could + // conceivably use `movl %reg, %reg` to zero out the top 32 bits of + // %reg. + Self::MovRR { + is_64, src, dst, .. + } if *is_64 => Some((*dst, *src)), + // Note as well that MOVS[S|D] when used in the `XmmUnaryRmR` context are pure moves of + // scalar floating-point values (and annotate `dst` as `def`s to the register allocator) + // whereas the same operation in a packed context, e.g. `XMM_RM_R`, is used to merge a + // value into the lowest lane of a vector (not a move). + Self::XmmUnaryRmR { op, src, dst, .. } + if *op == SseOpcode::Movss + || *op == SseOpcode::Movsd + || *op == SseOpcode::Movaps + || *op == SseOpcode::Movapd + || *op == SseOpcode::Movups + || *op == SseOpcode::Movupd + || *op == SseOpcode::Movdqa + || *op == SseOpcode::Movdqu => + { + if let RegMem::Reg { reg } = src { + Some((*dst, *reg)) + } else { + None + } + } + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Self::EpiloguePlaceholder = self { + true + } else { + false + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + // Interesting cases. + &Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret, + &Self::JmpKnown { dst } => MachTerminator::Uncond(dst), + &Self::JmpCond { + taken, not_taken, .. + } => MachTerminator::Cond(taken, not_taken), + &Self::JmpTableSeq { + ref targets_for_term, + .. + } => MachTerminator::Indirect(&targets_for_term[..]), + // All other cases are boring. 
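+            // (Editorial note, illustrative only, not part of the upstream patch:
+            // calls, traps and fences simply fall through to the next instruction, and
+            // even the one-way `JmpIf` is classified as a non-terminator here; only the
+            // two-target `JmpCond` form terminates a block with a conditional branch.)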
+ _ => MachTerminator::None, + } + } + + fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst { + let rc_dst = dst_reg.to_reg().get_class(); + let rc_src = src_reg.get_class(); + // If this isn't true, we have gone way off the rails. + debug_assert!(rc_dst == rc_src); + match rc_dst { + RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg), + RegClass::V128 => { + // The Intel optimization manual, in "3.5.1.13 Zero-Latency MOV Instructions", + // doesn't include MOVSS/MOVSD as instructions with zero-latency. Use movaps for + // those, which may write more lanes that we need, but are specified to have + // zero-latency. + let opcode = match ty { + types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps, + types::F64X2 => SseOpcode::Movapd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa, + _ => unimplemented!("unable to move type: {}", ty), + }; + Inst::xmm_unary_rm_r(opcode, RegMem::reg(src_reg), dst_reg) + } + _ => panic!("gen_move(x64): unhandled regclass {:?}", rc_dst), + } + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop { len: 0 } + } + + fn gen_nop(preferred_size: usize) -> Inst { + Inst::nop((preferred_size % 16) as u8) + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> { + None + } + + fn rc_for_type(ty: Type) -> CodegenResult<RegClass> { + match ty { + types::I8 + | types::I16 + | types::I32 + | types::I64 + | types::B1 + | types::B8 + | types::B16 + | types::B32 + | types::B64 + | types::R32 + | types::R64 => Ok(RegClass::I64), + types::F32 | types::F64 => Ok(RegClass::V128), + _ if ty.bits() == 128 => Ok(RegClass::V128), + types::IFLAGS | types::FFLAGS => Ok(RegClass::I64), + _ => Err(CodegenError::Unsupported(format!( + "Unexpected SSA-value type: {}", + ty + ))), + } + } + + fn gen_jump(label: MachLabel) -> Inst { + Inst::jmp_known(label) + } + + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + mut alloc_tmp: F, + ) -> SmallVec<[Self; 4]> { + let mut ret = SmallVec::new(); + if ty == types::F32 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorps, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(RegClass::I64, types::I32); + ret.push(Inst::imm(OperandSize::Size32, value, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(tmp.to_reg()), + OperandSize::Size32, + to_reg, + )); + } + } else if ty == types::F64 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(RegClass::I64, types::I64); + ret.push(Inst::imm(OperandSize::Size64, value, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(tmp.to_reg()), + OperandSize::Size64, + to_reg, + )); + } + } else { + // Must be an integer type. 
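+            // (Editorial note, not part of the upstream patch: zero is materialized
+            // below with `xor reg, reg`, which has a shorter encoding than a move of
+            // the immediate 0 and is recognized by the CPU as a dependency-breaking
+            // zeroing idiom; any other value falls through to the sized `Inst::imm`
+            // form.)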
+ debug_assert!( + ty == types::B1 + || ty == types::I8 + || ty == types::B8 + || ty == types::I16 + || ty == types::B16 + || ty == types::I32 + || ty == types::B32 + || ty == types::I64 + || ty == types::B64 + || ty == types::R32 + || ty == types::R64 + ); + if value == 0 { + ret.push(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Xor, + RegMemImm::reg(to_reg.to_reg()), + to_reg, + )); + } else { + ret.push(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + value.into(), + to_reg, + )); + } + } + ret + } + + fn reg_universe(flags: &Flags) -> RealRegUniverse { + create_reg_universe_systemv(flags) + } + + fn worst_case_size() -> CodeOffset { + 15 + } + + fn ref_type_regclass(_: &settings::Flags) -> RegClass { + RegClass::I64 + } + + type LabelUse = LabelUse; +} + +/// State carried between emissions of a sequence of instructions. +#[derive(Default, Clone, Debug)] +pub struct EmitState { + /// Addend to convert nominal-SP offsets to real-SP offsets at the current + /// program point. + pub(crate) virtual_sp_offset: i64, + /// Offset of FP from nominal-SP. + pub(crate) nominal_sp_to_fp: i64, + /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`. + stack_map: Option<StackMap>, + /// Current source location. + cur_srcloc: SourceLoc, +} + +/// Constant state used during emissions of a sequence of instructions. +pub struct EmitInfo { + flags: settings::Flags, + isa_flags: x64_settings::Flags, +} + +impl EmitInfo { + pub(crate) fn new(flags: settings::Flags, isa_flags: x64_settings::Flags) -> Self { + Self { flags, isa_flags } + } +} + +impl MachInstEmitInfo for EmitInfo { + fn flags(&self) -> &Flags { + &self.flags + } +} + +impl MachInstEmit for Inst { + type State = EmitState; + type Info = EmitInfo; + type UnwindInfo = unwind::X64UnwindInfo; + + fn emit(&self, sink: &mut MachBuffer<Inst>, info: &Self::Info, state: &mut Self::State) { + emit::emit(self, sink, info, state); + } + + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, _: &mut Self::State) -> String { + self.show_rru(mb_rru) + } +} + +impl MachInstEmitState<Inst> for EmitState { + fn new(abi: &dyn ABICallee<I = Inst>) -> Self { + EmitState { + virtual_sp_offset: 0, + nominal_sp_to_fp: abi.frame_size() as i64, + stack_map: None, + cur_srcloc: SourceLoc::default(), + } + } + + fn pre_safepoint(&mut self, stack_map: StackMap) { + self.stack_map = Some(stack_map); + } + + fn pre_sourceloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } +} + +impl EmitState { + fn take_stack_map(&mut self) -> Option<StackMap> { + self.stack_map.take() + } + + fn clear_post_insn(&mut self) { + self.stack_map = None; + } + + fn cur_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } +} + +/// A label-use (internal relocation) in generated code. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// A 32-bit offset from location of relocation itself, added to the existing value at that + /// location. Used for control flow instructions which consider an offset from the start of the + /// next instruction (so the size of the payload -- 4 bytes -- is subtracted from the payload). + JmpRel32, + + /// A 32-bit offset from location of relocation itself, added to the existing value at that + /// location. 
+ PCRel32, +} + +impl MachInstLabelUse for LabelUse { + const ALIGN: CodeOffset = 1; + + fn max_pos_range(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x7fff_ffff, + } + } + + fn max_neg_range(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x8000_0000, + } + } + + fn patch_size(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 4, + } + } + + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let pc_rel = (label_offset as i64) - (use_offset as i64); + debug_assert!(pc_rel <= self.max_pos_range() as i64); + debug_assert!(pc_rel >= -(self.max_neg_range() as i64)); + let pc_rel = pc_rel as u32; + match self { + LabelUse::JmpRel32 => { + let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let value = pc_rel.wrapping_add(addend).wrapping_sub(4); + buffer.copy_from_slice(&value.to_le_bytes()[..]); + } + LabelUse::PCRel32 => { + let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let value = pc_rel.wrapping_add(addend); + buffer.copy_from_slice(&value.to_le_bytes()[..]); + } + } + } + + fn supports_veneer(self) -> bool { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => false, + } + } + + fn veneer_size(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0, + } + } + + fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => { + panic!("Veneer not supported for JumpRel32 label-use."); + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs new file mode 100644 index 0000000000..04bc1f09bf --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs @@ -0,0 +1,289 @@ +//! Registers, the Universe thereof, and printing. +//! +//! These are ordered by sequence number, as required in the Universe. The strange ordering is +//! intended to make callee-save registers available before caller-saved ones. This is a net win +//! provided that each function makes at least one onward call. It'll be a net loss for leaf +//! functions, and we should change the ordering in that case, so as to make caller-save regs +//! available first. +//! +//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions? +//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe +//! for each function we compile. + +use crate::settings; +use alloc::vec::Vec; +use regalloc::{ + PrettyPrint, RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES, +}; +use std::string::String; + +// Hardware encodings for a few registers. 
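+//
+// (Editorial note, illustrative only, not part of the upstream patch: the first
+// argument to `gpr`/`fpr` below is the hardware encoding written into instruction
+// bytes, while the second is the register's index in the allocation universe. For
+// example:
+//
+//     rax() == Reg::new_real(RegClass::I64, /* enc = */ 0, /* index = */ 22)
+//     rsp() == Reg::new_real(RegClass::I64, /* enc = */ 4, /* index = */ 30)
+//
+// i.e. %rax encodes as 0 in ModRM/REX bytes but sits at slot 22 of the universe,
+// because callee-saved registers are listed first.)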
+ +pub const ENC_RBX: u8 = 3; +pub const ENC_RSP: u8 = 4; +pub const ENC_RBP: u8 = 5; +pub const ENC_R12: u8 = 12; +pub const ENC_R13: u8 = 13; +pub const ENC_R14: u8 = 14; +pub const ENC_R15: u8 = 15; + +fn gpr(enc: u8, index: u8) -> Reg { + Reg::new_real(RegClass::I64, enc, index) +} + +pub(crate) fn r12() -> Reg { + gpr(ENC_R12, 16) +} +pub(crate) fn r13() -> Reg { + gpr(ENC_R13, 17) +} +pub(crate) fn r14() -> Reg { + gpr(ENC_R14, 18) +} +pub(crate) fn rbx() -> Reg { + gpr(ENC_RBX, 19) +} +pub(crate) fn rsi() -> Reg { + gpr(6, 20) +} +pub(crate) fn rdi() -> Reg { + gpr(7, 21) +} +pub(crate) fn rax() -> Reg { + gpr(0, 22) +} +pub(crate) fn rcx() -> Reg { + gpr(1, 23) +} +pub(crate) fn rdx() -> Reg { + gpr(2, 24) +} +pub(crate) fn r8() -> Reg { + gpr(8, 25) +} +pub(crate) fn r9() -> Reg { + gpr(9, 26) +} +pub(crate) fn r10() -> Reg { + gpr(10, 27) +} +pub(crate) fn r11() -> Reg { + gpr(11, 28) +} + +pub(crate) fn r15() -> Reg { + // r15 is put aside since this is the pinned register. + gpr(ENC_R15, 29) +} + +/// The pinned register on this architecture. +/// It must be the same as Spidermonkey's HeapReg, as found in this file. +/// https://searchfox.org/mozilla-central/source/js/src/jit/x64/Assembler-x64.h#99 +pub(crate) fn pinned_reg() -> Reg { + r15() +} + +fn fpr(enc: u8, index: u8) -> Reg { + Reg::new_real(RegClass::V128, enc, index) +} + +pub(crate) fn xmm0() -> Reg { + fpr(0, 0) +} +pub(crate) fn xmm1() -> Reg { + fpr(1, 1) +} +pub(crate) fn xmm2() -> Reg { + fpr(2, 2) +} +pub(crate) fn xmm3() -> Reg { + fpr(3, 3) +} +pub(crate) fn xmm4() -> Reg { + fpr(4, 4) +} +pub(crate) fn xmm5() -> Reg { + fpr(5, 5) +} +pub(crate) fn xmm6() -> Reg { + fpr(6, 6) +} +pub(crate) fn xmm7() -> Reg { + fpr(7, 7) +} +pub(crate) fn xmm8() -> Reg { + fpr(8, 8) +} +pub(crate) fn xmm9() -> Reg { + fpr(9, 9) +} +pub(crate) fn xmm10() -> Reg { + fpr(10, 10) +} +pub(crate) fn xmm11() -> Reg { + fpr(11, 11) +} +pub(crate) fn xmm12() -> Reg { + fpr(12, 12) +} +pub(crate) fn xmm13() -> Reg { + fpr(13, 13) +} +pub(crate) fn xmm14() -> Reg { + fpr(14, 14) +} +pub(crate) fn xmm15() -> Reg { + fpr(15, 15) +} + +pub(crate) fn rsp() -> Reg { + gpr(ENC_RSP, 30) +} +pub(crate) fn rbp() -> Reg { + gpr(ENC_RBP, 31) +} + +/// Create the register universe for X64. +/// +/// The ordering of registers matters, as commented in the file doc comment: assumes the +/// calling-convention is SystemV, at the moment. 
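+///
+/// (Editorial note, not part of the upstream patch: when `enable_pinned_reg` is set,
+/// %r15 is pushed just past the allocatable GPR range so the allocator never hands it
+/// out; `allocable` is recorded immediately before it, and %rsp/%rbp are likewise kept
+/// outside the allocatable range.)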
+pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse { + let mut regs = Vec::<(RealReg, String)>::new(); + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + let use_pinned_reg = flags.enable_pinned_reg(); + + // XMM registers + let first_fpr = regs.len(); + regs.push((xmm0().to_real_reg(), "%xmm0".into())); + regs.push((xmm1().to_real_reg(), "%xmm1".into())); + regs.push((xmm2().to_real_reg(), "%xmm2".into())); + regs.push((xmm3().to_real_reg(), "%xmm3".into())); + regs.push((xmm4().to_real_reg(), "%xmm4".into())); + regs.push((xmm5().to_real_reg(), "%xmm5".into())); + regs.push((xmm6().to_real_reg(), "%xmm6".into())); + regs.push((xmm7().to_real_reg(), "%xmm7".into())); + regs.push((xmm8().to_real_reg(), "%xmm8".into())); + regs.push((xmm9().to_real_reg(), "%xmm9".into())); + regs.push((xmm10().to_real_reg(), "%xmm10".into())); + regs.push((xmm11().to_real_reg(), "%xmm11".into())); + regs.push((xmm12().to_real_reg(), "%xmm12".into())); + regs.push((xmm13().to_real_reg(), "%xmm13".into())); + regs.push((xmm14().to_real_reg(), "%xmm14".into())); + regs.push((xmm15().to_real_reg(), "%xmm15".into())); + let last_fpr = regs.len() - 1; + + // Integer regs. + let first_gpr = regs.len(); + + // Callee-saved, in the SystemV x86_64 ABI. + regs.push((r12().to_real_reg(), "%r12".into())); + regs.push((r13().to_real_reg(), "%r13".into())); + regs.push((r14().to_real_reg(), "%r14".into())); + + regs.push((rbx().to_real_reg(), "%rbx".into())); + + // Caller-saved, in the SystemV x86_64 ABI. + regs.push((rsi().to_real_reg(), "%rsi".into())); + regs.push((rdi().to_real_reg(), "%rdi".into())); + regs.push((rax().to_real_reg(), "%rax".into())); + regs.push((rcx().to_real_reg(), "%rcx".into())); + regs.push((rdx().to_real_reg(), "%rdx".into())); + regs.push((r8().to_real_reg(), "%r8".into())); + regs.push((r9().to_real_reg(), "%r9".into())); + regs.push((r10().to_real_reg(), "%r10".into())); + regs.push((r11().to_real_reg(), "%r11".into())); + + // Other regs, not available to the allocator. + debug_assert_eq!(r15(), pinned_reg()); + let allocable = if use_pinned_reg { + // The pinned register is not allocatable in this case, so record the length before adding + // it. + let len = regs.len(); + regs.push((r15().to_real_reg(), "%r15/pinned".into())); + len + } else { + regs.push((r15().to_real_reg(), "%r15".into())); + regs.len() + }; + let last_gpr = allocable - 1; + + regs.push((rsp().to_real_reg(), "%rsp".into())); + regs.push((rbp().to_real_reg(), "%rbp".into())); + + allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo { + first: first_gpr, + last: last_gpr, + suggested_scratch: Some(r12().get_index()), + }); + allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo { + first: first_fpr, + last: last_fpr, + suggested_scratch: Some(xmm15().get_index()), + }); + + // Sanity-check: the index passed to the Reg ctor must match the order in the register list. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} + +/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show its name at some +/// smaller size (4, 2 or 1 bytes). +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + let mut s = reg.show_rru(mb_rru); + + if reg.get_class() != RegClass::I64 || size == 8 { + // We can't do any better. 
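+        // (Editorial note: XMM registers keep a single name at every width, and an
+        // 8-byte GPR already prints under its full name, so there is nothing to remap.)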
+ return s; + } + + if reg.is_real() { + // Change (eg) "rax" into "eax", "ax" or "al" as appropriate. This is something one could + // describe diplomatically as "a kludge", but it's only debug code. + let remapper = match s.as_str() { + "%rax" => Some(["%eax", "%ax", "%al"]), + "%rbx" => Some(["%ebx", "%bx", "%bl"]), + "%rcx" => Some(["%ecx", "%cx", "%cl"]), + "%rdx" => Some(["%edx", "%dx", "%dl"]), + "%rsi" => Some(["%esi", "%si", "%sil"]), + "%rdi" => Some(["%edi", "%di", "%dil"]), + "%rbp" => Some(["%ebp", "%bp", "%bpl"]), + "%rsp" => Some(["%esp", "%sp", "%spl"]), + "%r8" => Some(["%r8d", "%r8w", "%r8b"]), + "%r9" => Some(["%r9d", "%r9w", "%r9b"]), + "%r10" => Some(["%r10d", "%r10w", "%r10b"]), + "%r11" => Some(["%r11d", "%r11w", "%r11b"]), + "%r12" => Some(["%r12d", "%r12w", "%r12b"]), + "%r13" => Some(["%r13d", "%r13w", "%r13b"]), + "%r14" => Some(["%r14d", "%r14w", "%r14b"]), + "%r15" => Some(["%r15d", "%r15w", "%r15b"]), + _ => None, + }; + if let Some(smaller_names) = remapper { + match size { + 4 => s = smaller_names[0].into(), + 2 => s = smaller_names[1].into(), + 1 => s = smaller_names[2].into(), + _ => panic!("show_ireg_sized: real"), + } + } + } else { + // Add a "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths. + let suffix = match size { + 4 => "l", + 2 => "w", + 1 => "b", + _ => panic!("show_ireg_sized: virtual"), + }; + s = s + suffix; + } + + s +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs new file mode 100644 index 0000000000..ffe43930f0 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs @@ -0,0 +1,125 @@ +use crate::isa::unwind::input::UnwindInfo; +use crate::isa::x64::inst::{ + args::{AluRmiROpcode, Amode, RegMemImm, SyntheticAmode}, + regs, Inst, +}; +use crate::machinst::{UnwindInfoContext, UnwindInfoGenerator}; +use crate::result::CodegenResult; +use alloc::vec::Vec; +use regalloc::Reg; + +#[cfg(feature = "unwind")] +pub(crate) mod systemv; + +pub struct X64UnwindInfo; + +impl UnwindInfoGenerator<Inst> for X64UnwindInfo { + fn create_unwind_info( + context: UnwindInfoContext<Inst>, + ) -> CodegenResult<Option<UnwindInfo<Reg>>> { + use crate::isa::unwind::input::{self, UnwindCode}; + let mut codes = Vec::new(); + const WORD_SIZE: u8 = 8; + + for i in context.prologue.clone() { + let i = i as usize; + let inst = &context.insts[i]; + let offset = context.insts_layout[i]; + + match inst { + Inst::Push64 { + src: RegMemImm::Reg { reg }, + } => { + codes.push(( + offset, + UnwindCode::StackAlloc { + size: WORD_SIZE.into(), + }, + )); + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *reg, + stack_offset: 0, + }, + )); + } + Inst::MovRR { src, dst, .. } => { + if *src == regs::rsp() { + codes.push((offset, UnwindCode::SetFramePointer { reg: dst.to_reg() })); + } + } + Inst::AluRmiR { + is_64: true, + op: AluRmiROpcode::Sub, + src: RegMemImm::Imm { simm32 }, + dst, + .. + } if dst.to_reg() == regs::rsp() => { + let imm = *simm32; + codes.push((offset, UnwindCode::StackAlloc { size: imm })); + } + Inst::MovRM { + src, + dst: SyntheticAmode::Real(Amode::ImmReg { simm32, base, .. }), + .. + } if *base == regs::rsp() => { + // `mov reg, imm(rsp)` + let imm = *simm32; + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *src, + stack_offset: imm, + }, + )); + } + Inst::AluRmiR { + is_64: true, + op: AluRmiROpcode::Add, + src: RegMemImm::Imm { simm32 }, + dst, + .. 
+ } if dst.to_reg() == regs::rsp() => { + let imm = *simm32; + codes.push((offset, UnwindCode::StackDealloc { size: imm })); + } + _ => {} + } + } + + let last_epilogue_end = context.len; + let epilogues_unwind_codes = context + .epilogues + .iter() + .map(|epilogue| { + // TODO add logic to process epilogue instruction instead of + // returning empty array. + let end = epilogue.end as usize - 1; + let end_offset = context.insts_layout[end]; + if end_offset == last_epilogue_end { + // Do not remember/restore for very last epilogue. + return vec![]; + } + + let start = epilogue.start as usize; + let offset = context.insts_layout[start]; + vec![ + (offset, UnwindCode::RememberState), + // TODO epilogue instructions + (end_offset, UnwindCode::RestoreState), + ] + }) + .collect(); + + let prologue_size = context.insts_layout[context.prologue.end as usize]; + Ok(Some(input::UnwindInfo { + prologue_size, + prologue_unwind_codes: codes, + epilogues_unwind_codes, + function_size: context.len, + word_size: WORD_SIZE, + initial_sp_offset: WORD_SIZE, + })) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs new file mode 100644 index 0000000000..68473a8afb --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs @@ -0,0 +1,204 @@ +//! Unwind information for System V ABI (x86-64). + +use crate::isa::unwind::input; +use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo}; +use crate::result::CodegenResult; +use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64}; +use regalloc::{Reg, RegClass}; + +/// Creates a new x86-64 common information entry (CIE). +pub fn create_cie() -> CommonInformationEntry { + use gimli::write::CallFrameInstruction; + + let mut entry = CommonInformationEntry::new( + Encoding { + address_size: 8, + format: Format::Dwarf32, + version: 1, + }, + 1, // Code alignment factor + -8, // Data alignment factor + X86_64::RA, + ); + + // Every frame will start with the call frame address (CFA) at RSP+8 + // It is +8 to account for the push of the return address by the call instruction + entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8)); + + // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP) + entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8)); + + entry +} + +/// Map Cranelift registers to their corresponding Gimli registers. +pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> { + // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow + const X86_GP_REG_MAP: [gimli::Register; 16] = [ + X86_64::RAX, + X86_64::RCX, + X86_64::RDX, + X86_64::RBX, + X86_64::RSP, + X86_64::RBP, + X86_64::RSI, + X86_64::RDI, + X86_64::R8, + X86_64::R9, + X86_64::R10, + X86_64::R11, + X86_64::R12, + X86_64::R13, + X86_64::R14, + X86_64::R15, + ]; + const X86_XMM_REG_MAP: [gimli::Register; 16] = [ + X86_64::XMM0, + X86_64::XMM1, + X86_64::XMM2, + X86_64::XMM3, + X86_64::XMM4, + X86_64::XMM5, + X86_64::XMM6, + X86_64::XMM7, + X86_64::XMM8, + X86_64::XMM9, + X86_64::XMM10, + X86_64::XMM11, + X86_64::XMM12, + X86_64::XMM13, + X86_64::XMM14, + X86_64::XMM15, + ]; + + match reg.get_class() { + RegClass::I64 => { + // x86 GP registers have a weird mapping to DWARF registers, so we use a + // lookup table. 
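+                // (Editorial note, not part of the upstream patch: the table is indexed
+                // by the machine encoding (rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8..r15)
+                // but yields DWARF register numbers, which order the first eight
+                // differently (rax, rdx, rcx, rbx, rsi, rdi, rbp, rsp). For example %rdx
+                // has hardware encoding 2 but DWARF number 1, and %rsp has encoding 4
+                // but DWARF number 7.)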
+ Ok(X86_GP_REG_MAP[reg.get_hw_encoding() as usize]) + } + RegClass::V128 => Ok(X86_XMM_REG_MAP[reg.get_hw_encoding() as usize]), + _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")), + } +} + +pub(crate) fn create_unwind_info( + unwind: input::UnwindInfo<Reg>, +) -> CodegenResult<Option<UnwindInfo>> { + struct RegisterMapper; + impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper { + fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> { + Ok(map_reg(reg)?.0) + } + fn sp(&self) -> u16 { + X86_64::RSP.0 + } + } + let map = RegisterMapper; + + Ok(Some(UnwindInfo::build(unwind, &map)?)) +} + +#[cfg(test)] +mod tests { + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ + types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData, + StackSlotKind, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use gimli::write::Address; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + fn test_simple_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(1234)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 13, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6)))] }"); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } + + #[test] + fn test_multi_return_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(4321)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 23, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (16, RememberState), (18, RestoreState)] }"); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = func.dfg.make_block(); + let block2 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + 
pos.insert_block(block0); + pos.ins().brnz(v0, block2, &[]); + pos.ins().jump(block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs new file mode 100644 index 0000000000..0862154360 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs @@ -0,0 +1,3771 @@ +//! Lowering rules for X64. + +use crate::data_value::DataValue; +use crate::ir::{ + condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, + Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type, +}; +use crate::isa::x64::abi::*; +use crate::isa::x64::inst::args::*; +use crate::isa::x64::inst::*; +use crate::isa::{x64::X64Backend, CallConv}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::result::CodegenResult; +use crate::settings::Flags; +use alloc::boxed::Box; +use alloc::vec::Vec; +use cranelift_codegen_shared::condcodes::CondCode; +use log::trace; +use regalloc::{Reg, RegClass, Writable}; +use smallvec::SmallVec; +use std::convert::TryFrom; +use target_lexicon::Triple; + +/// Context passed to all lowering functions. +type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>; + +//============================================================================= +// Helpers for instruction lowering. + +fn is_int_or_ref_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +fn is_bool_ty(ty: Type) -> bool { + match ty { + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +/// This is target-word-size dependent. And it excludes booleans and reftypes. +fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + _ => false, + } +} + +/// Returns whether the given specified `input` is a result produced by an instruction with Opcode +/// `op`. +// TODO investigate failures with checking against the result index. +fn matches_input<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + op: Opcode, +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } + None + }) +} + +/// Returns whether the given specified `input` is a result produced by an instruction with any of +/// the opcodes specified in `ops`. +fn matches_input_any<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + ops: &[Opcode], +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + for &op in ops { + if data.opcode() == op { + return Some(src_inst); + } + } + None + }) +} + +fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg { + ctx.use_input_reg(input); + input.reg +} + +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg { + let input = ctx.get_input(spec.insn, spec.input); + + if let Some(c) = input.constant { + // Generate constants fresh at each use to minimize long-range register pressure. 
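+        // (Editorial note, not part of the upstream patch: the constant is first masked
+        // to the width of the value's type. For example, for an I8 value with
+        // c = 0xFFFF_FFFF_FFFF_FF80 the mask (1 << 8) - 1 = 0xFF leaves masked = 0x80,
+        // so only the low 8 bits are handed to `Inst::gen_constant`.)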
+ let ty = ctx.input_ty(spec.insn, spec.input); + let from_bits = ty_bits(ty); + let masked = if from_bits < 64 { + c & ((1u64 << from_bits) - 1) + } else { + c + }; + + let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + ctx.emit(inst); + } + cst_copy.to_reg() + } else { + lowerinput_to_reg(ctx, input) + } +} + +/// An extension specification for `extend_input_to_reg`. +#[derive(Clone, Copy)] +enum ExtSpec { + ZeroExtendTo32, + ZeroExtendTo64, + SignExtendTo32, + SignExtendTo64, +} + +/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if +/// required. (This obviously causes side-effects.) +fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg { + let requested_size = match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32, + ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64, + }; + let input_size = ctx.input_ty(spec.insn, spec.input).bits(); + + let requested_ty = if requested_size == 32 { + types::I32 + } else { + types::I64 + }; + + let ext_mode = match (input_size, requested_size) { + (a, b) if a == b => return put_input_in_reg(ctx, spec), + (1, 8) => return put_input_in_reg(ctx, spec), + (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)), + }; + + let src = input_to_reg_mem(ctx, spec); + let dst = ctx.alloc_tmp(RegClass::I64, requested_ty); + match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)) + } + ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)) + } + } + dst.to_reg() +} + +fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem { + // TODO handle memory. + RegMem::reg(lowerinput_to_reg(ctx, input)) +} + +/// Put the given input into a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. +fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem { + let input = ctx.get_input(spec.insn, spec.input); + lowerinput_to_reg_mem(ctx, input) +} + +/// Returns whether the given input is an immediate that can be properly sign-extended, without any +/// possible side-effect. +fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> { + input.constant.and_then(|x| { + // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend + // to 64 bits. For other sizes, it doesn't matter and we can just use the plain + // constant. + if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) { + Some(x as u32) + } else { + None + } + }) +} + +fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + lowerinput_to_sext_imm(input, input_ty) +} + +fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> { + ctx.get_input(spec.insn, spec.input).constant +} + +/// Put the given input into an immediate, a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. 
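+///
+/// (Editorial note, not part of the upstream patch: an operand only becomes an
+/// immediate if it can be encoded as a 32-bit value that sign-extends to the operand's
+/// full width. For example the constant 7 in `iadd x, 7` lowers to
+/// `RegMemImm::Imm { simm32: 7 }`, whereas a 64-bit 0x8000_0000 does not sign-extend
+/// correctly and is materialized into a register instead.)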
+fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + match lowerinput_to_sext_imm(input, input_ty) { + Some(x) => RegMemImm::imm(x), + None => match lowerinput_to_reg_mem(ctx, input) { + RegMem::Reg { reg } => RegMemImm::reg(reg), + RegMem::Mem { addr } => RegMemImm::mem(addr), + }, + } +} + +/// Emit an instruction to insert a value `src` into a lane of `dst`. +fn emit_insert_lane<C: LowerCtx<I = Inst>>( + ctx: &mut C, + src: RegMem, + dst: Writable<Reg>, + lane: u8, + ty: Type, +) { + if !ty.is_float() { + let (sse_op, is64) = match ty.lane_bits() { + 8 => (SseOpcode::Pinsrb, false), + 16 => (SseOpcode::Pinsrw, false), + 32 => (SseOpcode::Pinsrd, false), + 64 => (SseOpcode::Pinsrd, true), + _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), + }; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64)); + } else if ty == types::F32 { + let sse_op = SseOpcode::Insertps; + // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane + // shifted into bits 5:6). + let lane = 0b00_00_00_00 | lane << 4; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + } else if ty == types::F64 { + let sse_op = match lane { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + 0 => SseOpcode::Movsd, + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + 1 => SseOpcode::Movlhps, + _ => unreachable!(), + }; + // Here we use the `xmm_rm_r` encoding because it correctly tells the register + // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other + // encoding formats like `xmm_unary_rm_r` treat it as a `def`. + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + } else { + panic!("unable to emit insertlane for type: {}", ty) + } +} + +/// Emits an int comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_cmp(ctx: Ctx, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem_imm(ctx, inputs[1]); + + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs)); +} + +/// A specification for a fcmp emission. +enum FcmpSpec { + /// Normal flow. + Normal, + + /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that + /// happens with `InvertedEqualOrConditions`. + /// + /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or + /// sequence of instructions) that check for an "AND" combination of condition codes; see for + /// instance lowering of Select. + InvertEqual, +} + +/// This explains how to interpret the results of an fcmp instruction. +enum FcmpCondResult { + /// The given condition code must be set. + Condition(CC), + + /// Both condition codes must be set. + AndConditions(CC, CC), + + /// Either of the conditions codes must be set. + OrConditions(CC, CC), + + /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. 
Either + /// of the condition codes must be set, and the user must invert meaning of analyzing the + /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be + /// reached. + InvertedEqualOrConditions(CC, CC), +} + +/// Emits a float comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult { + let (flip_operands, inverted_equal) = match cond_code { + FloatCC::LessThan + | FloatCC::LessThanOrEqual + | FloatCC::UnorderedOrGreaterThan + | FloatCC::UnorderedOrGreaterThanOrEqual => { + cond_code = cond_code.reverse(); + (true, false) + } + FloatCC::Equal => { + let inverted_equal = match spec { + FcmpSpec::Normal => false, + FcmpSpec::InvertEqual => { + cond_code = FloatCC::NotEqual; // same as .inverse() + true + } + }; + (false, inverted_equal) + } + _ => (false, false), + }; + + // The only valid CC constructed with `from_floatcc` can be put in the flag + // register with a direct float comparison; do this here. + let op = match ctx.input_ty(insn, 0) { + types::F32 => SseOpcode::Ucomiss, + types::F64 => SseOpcode::Ucomisd, + _ => panic!("Bad input type to Fcmp"), + }; + + let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let (lhs_input, rhs_input) = if flip_operands { + (inputs[1], inputs[0]) + } else { + (inputs[0], inputs[1]) + }; + let lhs = put_input_in_reg(ctx, lhs_input); + let rhs = input_to_reg_mem(ctx, rhs_input); + ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs)); + + let cond_result = match cond_code { + FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z), + FloatCC::NotEqual if inverted_equal => { + FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ) + } + FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ), + _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)), + }; + + cond_result +} + +fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature { + let mut sig = Signature::new(call_conv); + for i in 0..ctx.num_inputs(insn) { + sig.params.push(AbiParam::new(ctx.input_ty(insn, i))); + } + for i in 0..ctx.num_outputs(insn) { + sig.returns.push(AbiParam::new(ctx.output_ty(insn, i))); + } + if call_conv.extends_baldrdash() { + // Adds the special VMContext parameter to the signature. + sig.params + .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext)); + } + sig +} + +fn emit_vm_call<C: LowerCtx<I = Inst>>( + ctx: &mut C, + flags: &Flags, + triple: &Triple, + libcall: LibCall, + insn: IRInst, + inputs: SmallVec<[InsnInput; 4]>, + outputs: SmallVec<[InsnOutput; 2]>, +) -> CodegenResult<()> { + let extname = ExternalName::LibCall(libcall); + + let dist = if flags.use_colocated_libcalls() { + RelocDistance::Near + } else { + RelocDistance::Far + }; + + // TODO avoid recreating signatures for every single Libcall function. 
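+    // (Editorial note, summarizing the sequence below: build the libcall signature,
+    // pre-adjust the stack, copy the inputs (plus the VMContext when the calling
+    // convention extends Baldrdash) into argument locations, emit the call, copy the
+    // return values back out, then undo the stack adjustment.)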
+ let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple)); + let sig = make_libcall_sig(ctx, insn, call_conv, types::I64); + let caller_conv = ctx.abi().call_conv(); + + let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv)?; + + abi.emit_stack_pre_adjust(ctx); + + let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 }; + assert_eq!(inputs.len() + vm_context, abi.num_args()); + + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + if call_conv.extends_baldrdash() { + let vm_context_vreg = ctx + .get_vm_context() + .expect("should have a VMContext to pass to libcall funcs"); + abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg); + } + + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + + Ok(()) +} + +/// Returns whether the given input is a shift by a constant value less or equal than 3. +/// The goal is to embed it within an address mode. +fn matches_small_constant_shift<C: LowerCtx<I = Inst>>( + ctx: &mut C, + spec: InsnInput, +) -> Option<(InsnInput, u8)> { + matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| { + match input_to_imm( + ctx, + InsnInput { + insn: shift, + input: 1, + }, + ) { + Some(shift_amt) if shift_amt <= 3 => Some(( + InsnInput { + insn: shift, + input: 0, + }, + shift_amt as u8, + )), + _ => None, + } + }) +} + +/// Lowers an instruction to one of the x86 addressing modes. +/// +/// Note: the 32-bit offset in Cranelift has to be sign-extended, which maps x86's behavior. +fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode { + let flags = ctx + .memflags(spec.insn) + .expect("Instruction with amode should have memflags"); + + // We now either have an add that we must materialize, or some other input; as well as the + // final offset. + if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) { + debug_assert_eq!(ctx.output_ty(add, 0), types::I64); + let add_inputs = &[ + InsnInput { + insn: add, + input: 0, + }, + InsnInput { + insn: add, + input: 1, + }, + ]; + + // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations + // aren't happening in the wasm case. We could do better, given some range analysis. + let (base, index, shift) = if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[0]) + { + ( + put_input_in_reg(ctx, add_inputs[1]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[1]) + { + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else { + for i in 0..=1 { + let input = ctx.get_input(add, i); + + // Try to pierce through uextend. + if let Some(uextend) = matches_input( + ctx, + InsnInput { + insn: add, + input: i, + }, + Opcode::Uextend, + ) { + if let Some(cst) = ctx.get_input(uextend, 0).constant { + // Zero the upper bits. 
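+                        // (Editorial note, not part of the upstream patch: for a
+                        // 32-bit constant, input_size = 32 and shift = 32, so
+                        // (cst << 32) >> 32 keeps only the low 32 bits, zero-extended;
+                        // e.g. cst = 0xFFFF_FFFF_8000_0000 becomes uext_cst = 0x8000_0000.)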
+ let input_size = ctx.input_ty(uextend, 0).bits() as u64; + let shift: u64 = 64 - input_size; + let uext_cst: u64 = (cst << shift) >> shift; + + let final_offset = (offset as i64).wrapping_add(uext_cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + // If it's a constant, add it directly! + if let Some(cst) = input.constant { + let final_offset = (offset as i64).wrapping_add(cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, add_inputs[1]), + 0, + ) + }; + + return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags); + } + + let input = put_input_in_reg(ctx, spec); + Amode::imm_reg(offset as u32, input).with_flags(flags) +} + +//============================================================================= +// Top-level instruction lowering entry point, for one instruction. + +/// Actually codegen an instruction's results into registers. +fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, + flags: &Flags, + triple: &Triple, +) -> CodegenResult<()> { + let op = ctx.data(insn).opcode(); + + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = ctx + .get_constant(insn) + .expect("constant value for iconst et al"); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::SaddSat + | Opcode::UaddSat + | Opcode::Isub + | Opcode::SsubSat + | Opcode::UsubSat + | Opcode::Imul + | Opcode::AvgRound + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + let ty = ty.unwrap(); + if ty.lane_count() > 1 { + let sse_op = match op { + Opcode::Iadd => match ty { + types::I8X16 => SseOpcode::Paddb, + types::I16X8 => SseOpcode::Paddw, + types::I32X4 => SseOpcode::Paddd, + types::I64X2 => SseOpcode::Paddq, + _ => panic!("Unsupported type for packed iadd instruction: {}", ty), + }, + Opcode::SaddSat => match ty { + types::I8X16 => SseOpcode::Paddsb, + types::I16X8 => SseOpcode::Paddsw, + _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty), + }, + Opcode::UaddSat => match ty { + types::I8X16 => SseOpcode::Paddusb, + types::I16X8 => SseOpcode::Paddusw, + _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty), + }, + Opcode::Isub => match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for packed isub instruction: {}", ty), + }, + Opcode::SsubSat => match ty { + types::I8X16 => SseOpcode::Psubsb, + types::I16X8 => SseOpcode::Psubsw, + _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty), + }, + Opcode::UsubSat => match ty { + types::I8X16 => SseOpcode::Psubusb, + 
types::I16X8 => SseOpcode::Psubusw, + _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty), + }, + Opcode::Imul => match ty { + types::I16X8 => SseOpcode::Pmullw, + types::I32X4 => SseOpcode::Pmulld, + types::I64X2 => { + // Note for I64X2 we describe a lane A as being composed of a + // 32-bit upper half "Ah" and a 32-bit lower half "Al". + // The 32-bit long hand multiplication can then be written as: + // Ah Al + // * Bh Bl + // ----- + // Al * Bl + // + (Ah * Bl) << 32 + // + (Al * Bh) << 32 + // + // So for each lane we will compute: + // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 + // + // Note, the algorithm will use pmuldq which operates directly on + // the lower 32-bit (Al or Bl) of a lane and writes the result + // to the full 64-bits of the lane of the destination. For this + // reason we don't need shifts to isolate the lower 32-bits, however + // we will need to use shifts to isolate the high 32-bits when doing + // calculations, i.e. Ah == A >> 32 + // + // The full sequence then is as follows: + // A' = A + // A' = A' >> 32 + // A' = Ah' * Bl + // B' = B + // B' = B' >> 32 + // B' = Bh' * Al + // B' = B' + A' + // B' = B' << 32 + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + + // Get inputs rhs=A and lhs=B and the dst register + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // A' = A + let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + + // A' = A' >> 32 + // A' = Ah' * Bl + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + + // B' = B + let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); + + // B' = B' >> 32 + // B' = Bh' * Al + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + lhs_1, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + + // B' = B' + A' + // B' = B' << 32 + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(rhs_1.to_reg()), + lhs_1, + )); + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psllq, + RegMemImm::imm(32), + lhs_1, + )); + + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(lhs_1.to_reg()), + rhs_1, + )); + ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + return Ok(()); + } + _ => panic!("Unsupported type for packed imul instruction: {}", ty), + }, + Opcode::AvgRound => match ty { + types::I8X16 => SseOpcode::Pavgb, + types::I16X8 => SseOpcode::Pavgw, + _ => panic!("Unsupported type for packed avg_round instruction: {}", ty), + }, + Opcode::Band => match ty { + types::F32X4 => SseOpcode::Andps, + types::F64X2 => SseOpcode::Andpd, + _ => SseOpcode::Pand, + }, + Opcode::Bor => match ty { + types::F32X4 => SseOpcode::Orps, + types::F64X2 => SseOpcode::Orpd, + _ => SseOpcode::Por, + }, + Opcode::Bxor => match ty { + types::F32X4 => SseOpcode::Xorps, + types::F64X2 => SseOpcode::Xorpd, + _ => SseOpcode::Pxor, + }, + _ => panic!("Unsupported packed instruction: {}", op), + }; + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move the `lhs` to 
the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + let is_64 = ty == types::I64; + let alu_op = match op { + Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add, + Opcode::Isub => AluRmiROpcode::Sub, + Opcode::Imul => AluRmiROpcode::Mul, + Opcode::Band => AluRmiROpcode::And, + Opcode::Bor => AluRmiROpcode::Or, + Opcode::Bxor => AluRmiROpcode::Xor, + _ => unreachable!(), + }; + + let (lhs, rhs) = match op { + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::Imul + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + // For commutative operations, try to commute operands if one is an + // immediate. + if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) { + (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm)) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ) + } + } + Opcode::Isub => ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ), + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst)); + } + } + + Opcode::BandNot => { + let ty = ty.unwrap(); + debug_assert!(ty.is_vector() && ty.bytes() == 16); + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let sse_op = match ty { + types::F32X4 => SseOpcode::Andnps, + types::F64X2 => SseOpcode::Andnpd, + _ => SseOpcode::Pandn, + }; + // Note the flipping of operands: the `rhs` operand is used as the destination instead + // of the `lhs` as in the other bit operations above (e.g. `band`). + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst)); + } + + Opcode::Iabs => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let opcode = match ty { + types::I8X16 => SseOpcode::Pabsb, + types::I16X8 => SseOpcode::Pabsw, + types::I32X4 => SseOpcode::Pabsd, + _ => panic!("Unsupported type for packed iabs instruction: {}", ty), + }; + ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst)); + } else { + unimplemented!("iabs is unimplemented for non-vector type: {}", ty); + } + } + + Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let sse_op = match op { + Opcode::Imax => match ty { + types::I8X16 => SseOpcode::Pmaxsb, + types::I16X8 => SseOpcode::Pmaxsw, + types::I32X4 => SseOpcode::Pmaxsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umax => match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Imin => match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umin => match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + _ => unreachable!("This is a bug: the external and internal `match op` should be over 
the same opcodes."), + }; + + // Move the `lhs` to the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + panic!("Unsupported type for {} instruction: {}", op, ty); + } + } + + Opcode::Bnot => { + let ty = ty.unwrap(); + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + + if ty.is_vector() { + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); + ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty.is_bool() { + unimplemented!("bool bnot") + } else { + ctx.emit(Inst::not(size, dst)); + } + } + + Opcode::Bitselect => { + let ty = ty.unwrap(); + let condition = put_input_in_reg(ctx, inputs[0]); + let if_true = put_input_in_reg(ctx, inputs[1]); + let if_false = input_to_reg_mem(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + if ty.is_vector() { + let tmp1 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp1, if_true, ty)); + ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1)); + + let tmp2 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp2, condition, ty)); + ctx.emit(Inst::and_not(ty, if_false, tmp2)); + + ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty)); + ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst)); + } else { + unimplemented!("scalar bitselect") + } + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => { + let dst_ty = ctx.output_ty(insn, 0); + debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); + + let (size, lhs) = match dst_ty { + types::I8 | types::I16 => match op { + Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])), + Opcode::Ushr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), + ), + Opcode::Sshr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32), + ), + Opcode::Rotl | Opcode::Rotr => { + (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) + } + _ => unreachable!(), + }, + types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])), + _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), + }; + + let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { + // Mask count, according to Cranelift's semantics. + let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); + (Some(cst), None) + } else { + (None, Some(put_input_in_reg(ctx, inputs[1]))) + }; + + let dst = get_output_reg(ctx, outputs[0]); + + let shift_kind = match op { + Opcode::Ishl => ShiftKind::ShiftLeft, + Opcode::Ushr => ShiftKind::ShiftRightLogical, + Opcode::Sshr => ShiftKind::ShiftRightArithmetic, + Opcode::Rotl => ShiftKind::RotateLeft, + Opcode::Rotr => ShiftKind::RotateRight, + _ => unreachable!(), + }; + + let w_rcx = Writable::from_reg(regs::rcx()); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + if count.is_none() { + ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); + } + ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } + + Opcode::Ineg => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + if ty.is_vector() { + // Zero's out a register and then does a packed subtraction + // of the input from the register. 
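+                // Illustrative sequence for e.g. `ineg.i32x4` (register names are placeholders):
+                //
+                //   pxor   %tmp, %tmp      ; tmp = 0
+                //   psubd  %src, %tmp      ; tmp = 0 - src
+                //   movapd %tmp, %dst      ; move the negated lanes into dst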
+ + let src = input_to_reg_mem(ctx, inputs[0]); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + + let subtract_opcode = match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for Ineg instruction, found {}", ty), + }; + + // Note we must zero out a tmp instead of using the destination register since + // the desitnation could be an alias for the source input register + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp)); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else { + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::neg(size, dst)); + } + } + + Opcode::Clz => { + // TODO when the x86 flags have use_lzcnt, we can use LZCNT. + + // General formula using bit-scan reverse (BSR): + // mov -1, %dst + // bsr %src, %tmp + // cmovz %dst, %tmp + // mov $(size_bits - 1), %dst + // sub %tmp, %dst + + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Ctz => { + // TODO when the x86 flags have use_bmi1, we can use TZCNT. + + // General formula using bit-scan forward (BSF): + // bsf %src, %dst + // mov $(size_bits), %tmp + // cmovz %tmp, %dst + let ty = ctx.input_ty(insn, 0); + let ty = if ty.bits() < 32 { types::I32 } else { ty }; + debug_assert!(ty == types::I32 || ty == types::I64); + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Popcnt => { + // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction. 
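+            // Without POPCNT we use the classic SWAR bit trick: for each nibble n,
+            //   popcount(n) = n - (n >> 1) - (n >> 2) - (n >> 3)
+            // (e.g. n = 0b1011: 11 - 5 - 2 - 1 = 3). The 0x7777... masks below keep bits
+            // shifted across a nibble boundary from contaminating the neighbouring nibble,
+            // the ((x >> 4) + x) & 0x0f0f... step folds nibble counts into byte counts, and
+            // the final multiply by 0x0101... accumulates every byte count into the top
+            // byte, which the last shift extracts.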
+ + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + if ty == types::I64 { + let is_64 = true; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let cst = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 
4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + Opcode::IsNull | Opcode::IsInvalid => { + // Null references are represented by the constant value 0; invalid references are + // represented by the constant value -1. See `define_reftypes()` in + // `meta/src/isa/x86/encodings.rs` to confirm. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + let imm = match op { + Opcode::IsNull => { + // TODO could use tst src, src for IsNull + 0 + } + Opcode::IsInvalid => { + // We can do a 32-bit comparison even in 64-bits mode, as the constant is then + // sign-extended. + 0xffffffff + } + _ => unreachable!(), + }; + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src)); + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::Uextend + | Opcode::Sextend + | Opcode::Bint + | Opcode::Breduce + | Opcode::Bextend + | Opcode::Ireduce => { + let src_ty = ctx.input_ty(insn, 0); + let dst_ty = ctx.output_ty(insn, 0); + + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
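+                // Illustrative CLIF pattern (hypothetical value numbers) that takes this shortcut:
+                //
+                //   v2 = iadd.i32 v0, v1
+                //   v3 = uextend.i64 v2
+                //
+                // The 32-bit add has already written zeros into the upper half of its
+                // destination register, so the uextend lowers to a plain 64-bit register copy.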
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } + } + + Opcode::Icmp => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + emit_cmp(ctx, insn); + let cc = CC::from_intcc(condcode); + ctx.emit(Inst::setcc(cc, dst)); + } else { + assert_eq!(ty.bits(), 128); + let eq = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpeqb, + types::I16X8 => SseOpcode::Pcmpeqw, + types::I32X4 => SseOpcode::Pcmpeqd, + types::I64X2 => SseOpcode::Pcmpeqq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let gt = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpgtb, + types::I16X8 => SseOpcode::Pcmpgtw, + types::I32X4 => SseOpcode::Pcmpgtd, + types::I64X2 => SseOpcode::Pcmpgtq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let maxu = |ty| match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let mins = |ty| match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let minu = |ty| match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + + // Here we decide which operand to use as the read/write `dst` (ModRM reg field) + // and which to use as the read `input` (ModRM r/m field). In the normal case we + // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for + // the less-than cases so that we can reuse the greater-than implementation. + let input = match condcode { + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + lhs + } + _ => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, lhs, ty)); + rhs + } + }; + + match condcode { + IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)), + IntCC::NotEqual => { + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. 
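+                        // Comparing a register against itself with PCMPEQ* yields all ones in every
+                        // lane; XORing the equality mask with that all-ones value flips it into a
+                        // not-equal mask, since SSE2 has no packed "not equal" integer comparison.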
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::SignedGreaterThan | IntCC::SignedLessThan => { + ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst)) + } + IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => { + ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode), + } + } + } + + Opcode::Fcmp => { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + // Unordered is returned by setting ZF, PF, CF <- 111 + // Greater than by ZF, PF, CF <- 000 + // Less than by ZF, PF, CF <- 001 + // Equal by ZF, PF, CF <- 100 + // + // Checking the result of comiss is somewhat annoying because you don't have setcc + // instructions that explicitly check simultaneously for the condition (i.e. eq, le, + // gt, etc) *and* orderedness. + // + // So that might mean we need more than one setcc check and then a logical "and" or + // "or" to determine both, in some cases. However knowing that if the parity bit is + // set, then the result was considered unordered and knowing that if the parity bit is + // set, then both the ZF and CF flag bits must also be set we can get away with using + // one setcc for most condition codes. + + let dst = get_output_reg(ctx, outputs[0]); + + match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::setcc(cc, dst)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else { + let op = match input_ty { + types::F32X4 => SseOpcode::Cmpps, + types::F64X2 => SseOpcode::Cmppd, + _ => panic!("Bad input type to fcmp: {}", input_ty), + }; + + // Since some packed comparisons are not available, some of the condition codes + // must be inverted, with a corresponding `flip` of the operands. 
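+                // For example, CMPPS/CMPPD have no "greater than" predicate, but a > b is
+                // equivalent to b < a, so `FloatCC::GreaterThan` is lowered as the LessThan
+                // immediate with the operands swapped; the other flipped cases below follow
+                // the same pattern.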
+ let (imm, flip) = match cond_code { + FloatCC::GreaterThan => (FcmpImm::LessThan, true), + FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true), + FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true), + FloatCC::UnorderedOrLessThanOrEqual => { + (FcmpImm::UnorderedOrGreaterThanOrEqual, true) + } + FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => { + panic!("unsupported float condition code: {}", cond_code) + } + _ => (FcmpImm::from(cond_code), false), + }; + + // Determine the operands of the comparison, possibly by flipping them. + let (lhs, rhs) = if flip { + ( + put_input_in_reg(ctx, inputs[1]), + input_to_reg_mem(ctx, inputs[0]), + ) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem(ctx, inputs[1]), + ) + }; + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, lhs, input_ty)); + + // Emit the comparison. + ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false)); + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for i in 0..ctx.num_inputs(insn) { + let src_reg = put_input_in_reg(ctx, inputs[i]); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, src_reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Call | Opcode::CallIndirect => { + let caller_conv = ctx.abi().call_conv(); + let (mut abi, inputs) = match op { + Opcode::Call => { + let (extname, dist) = ctx.call_target(insn).unwrap(); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len(), sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_func(sig, &extname, dist, caller_conv)?, + &inputs[..], + ) + } + + Opcode::CallIndirect => { + let ptr = put_input_in_reg(ctx, inputs[0]); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len() - 1, sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_ptr(sig, ptr, op, caller_conv)?, + &inputs[1..], + ) + } + + _ => unreachable!(), + }; + + abi.emit_stack_pre_adjust(ctx); + assert_eq!(inputs.len(), abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Hlt); + } + + Opcode::Trap | Opcode::ResumableTrap => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + ctx.emit_safepoint(Inst::Ud2 { trap_code }); + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + + if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() { + let cond_code = ctx.data(insn).cond_code().unwrap(); + // The flags must not have been clobbered by any other instruction between the + // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can + // simply use the flags here. 
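+                // Illustrative CLIF shape (hypothetical values and trap code) for this fast path:
+                //
+                //   v1, v2 = iadd_ifcout.i64 v0, v3
+                //   trapif uge v2, heap_oob
+                //
+                // The flags produced by the add are still live here, so the trap can test them
+                // directly instead of re-materializing a comparison.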
+ let cc = CC::from_intcc(cond_code); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else if op == Opcode::Trapif { + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cc = CC::from_intcc(cond_code); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + emit_cmp(ctx, ifcmp); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + + // Verification ensures that the input is always a single-def ffcmp. + let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + + match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }) + } + FcmpCondResult::AndConditions(cc1, cc2) => { + // A bit unfortunate, but materialize the flags in their own register, and + // check against this. + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, /* is_64 */ + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + tmp2, + )); + ctx.emit_safepoint(Inst::TrapIf { + trap_code, + cc: CC::NZ, + }); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 }); + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 }); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + }; + }; + } + + Opcode::F64const => { + // TODO use cmpeqpd for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::F32const => { + // TODO use cmpeqps for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + + // Note: min and max can't be handled here, because of the way Cranelift defines them: + // if any operand is a NaN, they must return the NaN operand, while the x86 machine + // instruction will return the second operand if either operand is a NaN. 
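+            // Concretely, `minss` with a NaN in its destination operand and 1.0 in its
+            // source operand writes 1.0 to the destination (x86 hands back the second
+            // operand whenever either input is a NaN), whereas Cranelift's `fmin`/`fmax`
+            // must return the NaN; the dedicated Fmin/Fmax lowering below bridges that gap.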
+ let sse_op = match ty { + types::F32 => match op { + Opcode::Fadd => SseOpcode::Addss, + Opcode::Fsub => SseOpcode::Subss, + Opcode::Fmul => SseOpcode::Mulss, + Opcode::Fdiv => SseOpcode::Divss, + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fadd => SseOpcode::Addsd, + Opcode::Fsub => SseOpcode::Subsd, + Opcode::Fmul => SseOpcode::Mulsd, + Opcode::Fdiv => SseOpcode::Divsd, + _ => unreachable!(), + }, + types::F32X4 => match op { + Opcode::Fadd => SseOpcode::Addps, + Opcode::Fsub => SseOpcode::Subps, + Opcode::Fmul => SseOpcode::Mulps, + Opcode::Fdiv => SseOpcode::Divps, + _ => unreachable!(), + }, + types::F64X2 => match op { + Opcode::Fadd => SseOpcode::Addpd, + Opcode::Fsub => SseOpcode::Subpd, + Opcode::Fmul => SseOpcode::Mulpd, + Opcode::Fdiv => SseOpcode::Divpd, + _ => unreachable!(), + }, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), + }; + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } + + Opcode::Fmin | Opcode::Fmax => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let is_min = op == Opcode::Fmin; + let output_ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, output_ty)); + if !output_ty.is_vector() { + let op_size = match output_ty { + types::F32 => OperandSize::Size32, + types::F64 => OperandSize::Size64, + _ => panic!("unexpected type {:?} for fmin/fmax", output_ty), + }; + ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst)); + } else { + // X64's implementation of floating point min and floating point max does not + // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the + // scalar approach we use jumps to handle cases where NaN and +0 propagation is + // not consistent with what is needed. However for packed floating point min and + // floating point max we implement a different approach to avoid the sequence + // of jumps that would be required on a per lane basis. Because we do not need to + // lower labels and jumps but do need ctx for creating temporaries we implement + // the lowering here in lower.rs instead of emit.rs as is done in the case for scalars. + // The outline of approach is as follows: + // + // First we preform the Min/Max in both directions. This is because in the + // case of an operand's lane containing a NaN or in the case of the lanes of the + // two operands containing 0 but with mismatched signs, x64 will return the second + // operand regardless of its contents. So in order to make sure we capture NaNs and + // normalize NaNs and 0 values we capture the operation in both directions and merge the + // results. Then we normalize the results through operations that create a mask for the + // lanes containing NaNs, we use that mask to adjust NaNs to quite NaNs and normalize + // 0s. 
+ // + // The following sequence is generated for min: + // + // movap{s,d} %lhs, %tmp + // minp{s,d} %dst, %tmp + // minp,{s,d} %lhs, %dst + // orp{s,d} %dst, %tmp + // cmpp{s,d} %tmp, %dst, $3 + // orps{s,d} %dst, %tmp + // psrl{s,d} {$10, $13}, %dst + // andnp{s,d} %tmp, %dst + // + // and for max the sequence is: + // + // movap{s,d} %lhs, %tmp + // minp{s,d} %dst, %tmp + // minp,{s,d} %lhs, %dst + // xorp{s,d} %tmp, %dst + // orp{s,d} %dst, %tmp + // subp{s,d} %dst, %tmp + // cmpp{s,d} %tmp, %dst, $3 + // psrl{s,d} {$10, $13}, %dst + // andnp{s,d} %tmp, %dst + + if is_min { + let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) = + match output_ty { + types::F32X4 => ( + SseOpcode::Movaps, + SseOpcode::Minps, + SseOpcode::Orps, + SseOpcode::Cmpps, + SseOpcode::Psrld, + 10, + SseOpcode::Andnps, + ), + types::F64X2 => ( + SseOpcode::Movapd, + SseOpcode::Minpd, + SseOpcode::Orpd, + SseOpcode::Cmppd, + SseOpcode::Psrlq, + 13, + SseOpcode::Andnpd, + ), + _ => unimplemented!("unsupported op type {:?}", output_ty), + }; + + // Copy lhs into tmp + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1)); + + // Perform min in reverse direction + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1)); + + // Perform min in original direction + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst)); + + // X64 handles propagation of -0's and Nans differently between left and right + // operands. After doing the min in both directions, this OR will + // guarrentee capture of -0's and Nan in our tmp register + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1)); + + // Compare unordered to create mask for lanes containing NaNs and then use + // that mask to saturate the NaN containing lanes in the tmp register with 1s. + // TODO: Would a check for NaN and then a jump be better here in the + // common case than continuing on to normalize NaNs that might not exist? + let cond = FcmpImm::from(FloatCC::Unordered); + ctx.emit(Inst::xmm_rm_r_imm( + cmp_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + cond.encode(), + false, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); + + // The dst register holds a mask for lanes containing NaNs. + // We take that mask and shift in preparation for creating a different mask + // to normalize NaNs (create a quite NaN) by zeroing out the appropriate + // number of least signficant bits. We shift right each lane by 10 bits + // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign + + // 11 exp. + 1 MSB sig.) for F64X2. + ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst)); + + // Finally we do a nand with the tmp register to produce the final results + // in the dst. + ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + } else { + let ( + mov_op, + max_op, + xor_op, + or_op, + sub_op, + cmp_op, + shift_op, + shift_by, + andn_op, + ) = match output_ty { + types::F32X4 => ( + SseOpcode::Movaps, + SseOpcode::Maxps, + SseOpcode::Xorps, + SseOpcode::Orps, + SseOpcode::Subps, + SseOpcode::Cmpps, + SseOpcode::Psrld, + 10, + SseOpcode::Andnps, + ), + types::F64X2 => ( + SseOpcode::Movapd, + SseOpcode::Maxpd, + SseOpcode::Xorpd, + SseOpcode::Orpd, + SseOpcode::Subpd, + SseOpcode::Cmppd, + SseOpcode::Psrlq, + 13, + SseOpcode::Andnpd, + ), + _ => unimplemented!("unsupported op type {:?}", output_ty), + }; + + // Copy lhs into tmp. 
+                        let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+                        ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+                        // Perform max in reverse direction.
+                        ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+                        // Perform max in original direction.
+                        ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+
+                        // Get the difference between the two results and store it in dst.
+                        // Max uses a different approach than min to account for potential
+                        // discrepancies with plus/minus 0.
+                        ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+
+                        // X64 handles propagation of -0's and NaNs differently between left and right
+                        // operands. After doing the max in both directions, this OR will
+                        // guarantee capture of 0's and NaNs in our tmp register.
+                        ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+                        // Capture NaNs and sign discrepancies.
+                        ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+                        // Compare unordered to create a mask for lanes containing NaNs, and then use
+                        // that mask to saturate the NaN-containing lanes in the tmp register with 1s.
+                        let cond = FcmpImm::from(FloatCC::Unordered);
+                        ctx.emit(Inst::xmm_rm_r_imm(
+                            cmp_op,
+                            RegMem::reg(tmp_xmm1.to_reg()),
+                            dst,
+                            cond.encode(),
+                            false,
+                        ));
+
+                        // The dst register now holds a mask for lanes containing NaNs.
+                        // We take that mask and shift it in preparation for creating a different mask
+                        // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+                        // number of least significant bits. We shift right each lane by 10 bits
+                        // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+                        // 11 exp. + 1 MSB sig.) for F64X2.
+                        ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+                        // Finally we do an and-not with the tmp register to produce the final results
+                        // in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + } + } + } + + Opcode::FminPseudo | Opcode::FmaxPseudo => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + let sse_opcode = match (ty, op) { + (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps, + (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps, + (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd, + (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd, + _ => unimplemented!("unsupported type {} for {}", ty, op), + }; + ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst)); + } + + Opcode::Sqrt => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let sse_op = match ty { + types::F32 => SseOpcode::Sqrtss, + types::F64 => SseOpcode::Sqrtsd, + types::F32X4 => SseOpcode::Sqrtps, + types::F64X2 => SseOpcode::Sqrtpd, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), + }; + + ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst)); + } + + Opcode::Fpromote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); + } + + Opcode::Fdemote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); + } + + Opcode::FcvtFromSint => { + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (ext_spec, src_size) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32), + types::I32 => (None, OperandSize::Size32), + types::I64 => (None, OperandSize::Size64), + _ => unreachable!(), + }; + + let src = match ext_spec { + Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), + None => input_to_reg_mem(ctx, inputs[0]), + }; + + let opcode = if output_ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(output_ty, types::F64); + SseOpcode::Cvtsi2sd + }; + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst)); + } else { + let ty = ty.unwrap(); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let opcode = match ctx.input_ty(insn, 0) { + types::I32X4 => SseOpcode::Cvtdq2ps, + _ => { + unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op) + } + }; + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst)); + } + } + + Opcode::FcvtFromUint => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let input_ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + match input_ty { + types::I8 | types::I16 | types::I32 => { + // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend + + // do a signed conversion (which won't overflow). 
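+                        // E.g. the u32 value 0xffff_ffff zero-extends to 4_294_967_295, which is
+                        // well within the positive range of a signed 64-bit integer, so
+                        // CVTSI2SS/CVTSI2SD on the 64-bit register produces the correct float.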
+ let opcode = if ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(ty, types::F64); + SseOpcode::Cvtsi2sd + }; + + let src = RegMem::reg(extend_input_to_reg( + ctx, + inputs[0], + ExtSpec::ZeroExtendTo64, + )); + ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst)); + } + + types::I64 => { + let src = put_input_in_reg(ctx, inputs[0]); + + let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(src_copy, src, types::I64)); + + let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::cvt_u64_to_float_seq( + ty == types::F64, + src_copy, + tmp_gpr1, + tmp_gpr2, + dst, + )); + } + _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty), + }; + } else { + // Converting packed unsigned integers to packed floats requires a few steps. + // There is no single instruction lowering for converting unsigned floats but there + // is for converting packed signed integers to float (cvtdq2ps). In the steps below + // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and + // then we convert each half separately using cvtdq2ps meant for signed integers. + // In order for this to work for the upper half bits we must shift right by 1 + // (divide by 2) these bits in order to ensure the most significant bit is 0 not + // signed, and then after the conversion we double the value. Finally we add the + // converted values where addition will correctly round. + // + // Sequence: + // -> A = 0xffffffff + // -> Ah = 0xffff0000 + // -> Al = 0x0000ffff + // -> Convert(Al) // Convert int to float + // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed + // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift + // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. + // -> dst = Ah + Al // Add the two floats together + + assert_eq!(ctx.input_ty(insn, 0), types::I32X4); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // Create a temporary register + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Get the low 16 bits + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp)); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp)); + + // Get the high 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst)); + + // Convert the low 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp)); + + // Shift the high bits by 1, convert, and double to get the correct value. + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Add together the two converted values. 
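+                // Worked example (illustrative) for a lane holding 0xffff_ffff:
+                //   Al = 0x0000_ffff          -> 65535.0
+                //   Ah = 0xffff_0000, Ah >> 1 -> 0x7fff_8000 -> 2147450880.0
+                //   doubled                   -> 4294901760.0
+                //   4294901760.0 + 65535.0 rounds to 4294967296.0, the correctly rounded f32
+                //   result for the unsigned input.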
+ ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + } + + Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + let src_size = if input_ty == types::F32 { + OperandSize::Size32 + } else { + assert_eq!(input_ty, types::F64); + OperandSize::Size64 + }; + + let output_ty = ty.unwrap(); + let dst_size = if output_ty == types::I32 { + OperandSize::Size32 + } else { + assert_eq!(output_ty, types::I64); + OperandSize::Size64 + }; + + let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; + let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; + + let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty); + ctx.emit(Inst::gen_move(src_copy, src, input_ty)); + + let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); + let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); + + if to_signed { + ctx.emit(Inst::cvt_float_to_sint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } else { + ctx.emit(Inst::cvt_float_to_uint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } + } else { + if op == Opcode::FcvtToSintSat { + // Sets destination to zero if float is NaN + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, input_ty)); + let cond = FcmpImm::from(FloatCC::Equal); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Andps, + RegMem::reg(tmp.to_reg()), + dst, + )); + + // Sets top bit of tmp if float is positive + // Setting up to set top bit on negative float values + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(dst.to_reg()), + tmp, + )); + + // Convert the packed float to packed doubleword. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Cvttps2dq, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Set top bit only if < 0 + // Saturate lane with sign (top) bit. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pand, + RegMem::reg(dst.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp)); + + // On overflow 0x80000000 is returned to a lane. + // Below sets positive overflow lanes to 0x7FFFFFFF + // Keeps negative overflow lanes as is. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else if op == Opcode::FcvtToUintSat { + unimplemented!("f32x4.convert_i32x4_u"); + } else { + // Since this branch is also guarded by a check for vector types + // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here + // due to vector varients not existing. The first two branches will + // cover all reachable cases. 
+ unreachable!(); + } + } + } + + Opcode::Bitcast => { + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + match (input_ty, output_ty) { + (types::F32, types::I32) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movd, + src, + dst, + OperandSize::Size32, + )); + } + (types::I32, types::F32) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movd, + src, + OperandSize::Size32, + dst, + )); + } + (types::F64, types::I64) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movq, + src, + dst, + OperandSize::Size64, + )); + } + (types::I64, types::F64) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movq, + src, + OperandSize::Size64, + dst, + )); + } + _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty), + } + } + + Opcode::Fabs | Opcode::Fneg => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // In both cases, generate a constant and apply a single binary instruction: + // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the + // src with it. + // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the + // src with it. + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (val, opcode) = match output_ty { + types::F32 => match op { + Opcode::Fabs => (0x7fffffff, SseOpcode::Andps), + Opcode::Fneg => (0x80000000, SseOpcode::Xorps), + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd), + Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd), + _ => unreachable!(), + }, + _ => panic!("unexpected type {:?} for Fabs", output_ty), + }; + + for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + + ctx.emit(Inst::xmm_rm_r(opcode, src, dst)); + } else { + // Eventually vector constants should be available in `gen_constant` and this block + // can be merged with the one above (TODO). + if output_ty.bits() == 128 { + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, output_ty)); + + // Generate an all 1s constant in an XMM register. This uses CMPPS but could + // have used CMPPD with the same effect. + let tmp = ctx.alloc_tmp(RegClass::V128, output_ty); + let cond = FcmpImm::from(FloatCC::Equal); + let cmpps = Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + ); + ctx.emit(cmpps); + + // Shift the all 1s constant to generate the mask. 
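+                    // For 32-bit lanes, shifting the all-ones lane right by 1 yields 0x7fff_ffff
+                    // (the fabs mask) and shifting it left by 31 yields 0x8000_0000 (the fneg sign
+                    // mask); the 64-bit cases below are analogous, with shifts of 1 and 63.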
+ let lane_bits = output_ty.lane_bits(); + let (shift_opcode, opcode, shift_by) = match (op, lane_bits) { + (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1), + (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1), + (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31), + (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63), + _ => unreachable!( + "unexpected opcode and lane size: {:?}, {} bits", + op, lane_bits + ), + }; + let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp); + ctx.emit(shift); + + // Apply shifted mask (XOR or AND). + let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst); + ctx.emit(mask); + } else { + panic!("unexpected type {:?} for Fabs", output_ty); + } + } + } + + Opcode::Fcopysign => { + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + + let ty = ty.unwrap(); + + // We're going to generate the following sequence: + // + // movabs $INT_MIN, tmp_gpr1 + // mov{d,q} tmp_gpr1, tmp_xmm1 + // movap{s,d} tmp_xmm1, dst + // andnp{s,d} src_1, dst + // movap{s,d} src_2, tmp_xmm2 + // andp{s,d} tmp_xmm1, tmp_xmm2 + // orp{s,d} tmp_xmm2, dst + + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32); + let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32); + + let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty { + types::F32 => ( + 0x8000_0000, + SseOpcode::Movaps, + SseOpcode::Andnps, + SseOpcode::Andps, + SseOpcode::Orps, + ), + types::F64 => ( + 0x8000_0000_0000_0000, + SseOpcode::Movapd, + SseOpcode::Andnpd, + SseOpcode::Andpd, + SseOpcode::Orpd, + ), + _ => { + panic!("unexpected type {:?} for copysign", ty); + } + }; + + for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2)); + ctx.emit(Inst::xmm_rm_r( + and_op, + RegMem::reg(tmp_xmm1.to_reg()), + tmp_xmm2, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst)); + } + + Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => { + // TODO use ROUNDSS/ROUNDSD after sse4.1. + + // Lower to VM calls when there's no access to SSE4.1. 
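+ // (With a guaranteed SSE4.1 baseline this could likely be a single ROUNDSS/ROUNDSD whose
+ // imm8 rounding-control field selects nearest, floor, ceil or trunc; absent that, each op
+ // is routed to its libcall below, e.g. `ceil.f32` becomes `LibCall::CeilF32`.)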
+ let ty = ty.unwrap(); + let libcall = match (ty, op) { + (types::F32, Opcode::Ceil) => LibCall::CeilF32, + (types::F64, Opcode::Ceil) => LibCall::CeilF64, + (types::F32, Opcode::Floor) => LibCall::FloorF32, + (types::F64, Opcode::Floor) => LibCall::FloorF64, + (types::F32, Opcode::Nearest) => LibCall::NearestF32, + (types::F64, Opcode::Nearest) => LibCall::NearestF64, + (types::F32, Opcode::Trunc) => LibCall::TruncF32, + (types::F64, Opcode::Trunc) => LibCall::TruncF64, + _ => panic!( + "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc", + ty, op + ), + }; + + emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?; + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + types::I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => types::I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => types::I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + _ => unimplemented!(), + }; + + let ext_mode = ExtMode::new(elem_ty.bits(), 64); + + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + + let amode = match op { + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 => { + assert_eq!(inputs.len(), 1, "only one input for load operands"); + lower_to_amode(ctx, inputs[0], offset) + } + + Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + assert_eq!( + inputs.len(), + 2, + "can't handle more than two inputs in complex load" + ); + let base = put_input_in_reg(ctx, inputs[0]); + let index = put_input_in_reg(ctx, inputs[1]); + let shift = 0; + let flags = ctx.memflags(insn).expect("load should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. + ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); + } + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. 
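+ // (For instance, a `uload8` becomes a MOVZX of the 8-bit memory operand; the extension
+ // mode always targets 64 bits here, so the upper bits of the destination are cleared.)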
+ ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!("unexpected type for load: {:?}", elem_ty), + }); + } + } + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => types::I8, + Opcode::Istore16 | Opcode::Istore16Complex => types::I16, + Opcode::Istore32 | Opcode::Istore32Complex => types::I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + + let addr = match op { + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => { + assert_eq!(inputs.len(), 2, "only one input for store memory operands"); + lower_to_amode(ctx, inputs[1], offset) + } + + Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + assert_eq!( + inputs.len(), + 3, + "can't handle more than two inputs in complex store" + ); + let base = put_input_in_reg(ctx, inputs[1]); + let index = put_input_in_reg(ctx, inputs[2]); + let shift = 0; + let flags = ctx.memflags(insn).expect("store should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let src = put_input_in_reg(ctx, inputs[0]); + + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. + Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } + + Opcode::AtomicRmw => { + // This is a simple, general-case atomic update, based on a loop involving + // `cmpxchg`. Note that we could do much better than this in the case where the old + // value at the location (that is to say, the SSA `Value` computed by this CLIF + // instruction) is not required. In that case, we could instead implement this + // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in + // the case where the old value is required, for the `add` and `sub` cases, we can + // use the single instruction `lock xadd`. However, those improvements have been + // left for another day. + // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 + let dst = get_output_reg(ctx, outputs[0]); + let mut addr = put_input_in_reg(ctx, inputs[0]); + let mut arg2 = put_input_in_reg(ctx, inputs[1]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Make sure that both args are in virtual regs, since in effect we have to do a + // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not + // guaranteed safe if either is in a real reg. 
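+ // (In outline: the `AtomicRmwSeq` pseudo-instruction emitted below expects the address in
+ // %r9 and the second operand in %r10, and leaves the old memory value in %rax, which is
+ // why the explicit moves into those fixed registers follow.)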
+ addr = ctx.ensure_in_vreg(addr, types::I64); + arg2 = ctx.ensure_in_vreg(arg2, types::I64); + + // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq` + // operates at whatever width is specified by `ty`, so there's no need to + // zero-extend `arg2` in the case of `ty` being I8/I16/I32. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r9()), + addr, + types::I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r10()), + arg2, + types::I64, + )); + + // Now the AtomicRmwSeq (pseudo-) instruction itself + let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap()); + ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op }); + + // And finally, copy the preordained AtomicRmwSeq output reg to its destination. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the `AtomicRmw` case. As with + // `AtomicRmw`, there's no need to zero-extend narrow values here. + let dst = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let expected = put_input_in_reg(ctx, inputs[1]); + let replacement = put_input_in_reg(ctx, inputs[2]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Move the expected value into %rax. Because there's only one fixed register on + // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the + // `AtomicRmw` case. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + expected, + types::I64, + )); + ctx.emit(Inst::LockCmpxchg { + ty: ty_access, + src: replacement, + dst: addr.into(), + }); + // And finally, copy the old value at the location to its destination reg. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicLoad => { + // This is a normal load. The x86-TSO memory model provides sufficient sequencing + // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the + // need for any fence instructions. + let data = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + let rm = RegMem::mem(addr); + if ty_access == types::I64 { + ctx.emit(Inst::mov64_rm_r(rm, data)); + } else { + let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!( + "invalid extension during AtomicLoad: {} -> {}", + ty_access.bits(), + 64 + )); + ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data)); + } + } + + Opcode::AtomicStore => { + // This is a normal store, followed by an `mfence` instruction. 
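+ // (Under x86-TSO an ordinary store may be reordered past a later load, so the trailing
+ // MFENCE is what provides the required ordering; the plain store handles the data
+ // movement itself.)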
+ let data = put_input_in_reg(ctx, inputs[0]); + let addr = lower_to_amode(ctx, inputs[1], 0); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr)); + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::FuncAddr => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset: 0, + }); + } + + Opcode::SymbolValue => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset, + }); + } + + Opcode::StackAddr => { + let (stack_slot, offset) = match *ctx.data(insn) { + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot, + offset, + } => (stack_slot, offset), + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + let offset: i32 = offset.into(); + let inst = ctx + .abi() + .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst); + ctx.emit(inst); + } + + Opcode::Select => { + let flag_input = inputs[0]; + if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + + // For equal, we flip the operands, because we can't test a conjunction of + // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment. + let (lhs_input, rhs_input) = match cond_code { + FloatCC::Equal => (inputs[2], inputs[1]), + _ => (inputs[1], inputs[2]), + }; + + let ty = ctx.output_ty(insn, 0); + let rhs = put_input_in_reg(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + // Sign-extend operands to 32, then do a cmove of size 4. + RegMem::reg(put_input_in_reg(ctx, lhs_input)) + } else { + input_to_reg_mem(ctx, lhs_input) + }; + + // We request inversion of Equal to NotEqual here: taking LHS if equal would mean + // take it if both CC::NP and CC::Z are set, the conjunction of which can't be + // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the + // select operation, and invert the equal to a not-equal here. + let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual); + + if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results { + // Keep this sync'd with the lowering of the select inputs above. 
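+ // (That is, the inverted-equal result can only come back when the operands were swapped
+ // above for FloatCC::Equal, which the assertion below re-checks.)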
+ assert_eq!(cond_code, FloatCC::Equal); + } + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + match fcmp_results { + FcmpCondResult::Condition(cc) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + FcmpCondResult::AndConditions(_, _) => { + unreachable!( + "can't AND with select; see above comment about inverting equal" + ); + } + FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) + | FcmpCondResult::OrConditions(cc1, cc2) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); + ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + } + } + } + } else { + let ty = ty.unwrap(); + + let mut size = ty.bytes() as u8; + let lhs = if is_int_or_ref_ty(ty) { + if size < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + size = 4; + RegMem::reg(put_input_in_reg(ctx, inputs[1])) + } else { + input_to_reg_mem(ctx, inputs[1]) + } + } else { + input_to_reg_mem(ctx, inputs[1]) + }; + + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + let cond_code = ctx.data(icmp).cond_code().unwrap(); + CC::from_intcc(cond_code) + } else { + // The input is a boolean value, compare it against zero. + let size = ctx.input_ty(insn, 0).bytes() as u8; + let test = put_input_in_reg(ctx, flag_input); + ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test)); + CC::NZ + }; + + // This doesn't affect the flags. + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + if is_int_or_ref_ty(ty) { + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + } + + Opcode::Selectif | Opcode::SelectifSpectreGuard => { + let lhs = input_to_reg_mem(ctx, inputs[1]); + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.output_ty(insn, 0); + + // Verification ensures that the input is always a single-def ifcmp. + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + + if is_int_or_ref_ty(ty) { + let size = ty.bytes() as u8; + if size == 1 { + // Sign-extend operands to 32, then do a cmove of size 4. 
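+ // (Sketch for i8 operands: MOVSX one operand into a temporary and the other into `dst`,
+ // both widened to 32 bits, then a 4-byte CMOVcc keeps one of them; the widened upper
+ // bits don't matter since CLIF leaves them undefined.)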
+ let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); + ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); + } else { + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + + Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => { + let kind = match op { + Opcode::Udiv => DivOrRemKind::UnsignedDiv, + Opcode::Sdiv => DivOrRemKind::SignedDiv, + Opcode::Urem => DivOrRemKind::UnsignedRem, + Opcode::Srem => DivOrRemKind::SignedRem, + _ => unreachable!(), + }; + let is_div = kind.is_div(); + + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let dividend = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + dividend, + input_ty, + )); + + if flags.avoid_div_traps() { + // A vcode meta-instruction is used to lower the inline checks, since they embed + // pc-relative offsets that must not change, thus requiring regalloc to not + // interfere by introducing spills and reloads. + // + // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that + // regalloc is aware of the coalescing opportunity between rax/rdx and the + // destination register. + let divisor = put_input_in_reg(ctx, inputs[1]); + + let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64)); + + let tmp = if op == Opcode::Sdiv && size == 8 { + Some(ctx.alloc_tmp(RegClass::I64, types::I64)) + } else { + None + }; + // TODO use xor + ctx.emit(Inst::imm( + OperandSize::Size32, + 0, + Writable::from_reg(regs::rdx()), + )); + ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp)); + } else { + let divisor = input_to_reg_mem(ctx, inputs[1]); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for + // signed opcodes. + ctx.emit(Inst::sign_extend_data(size)); + } else if input_ty == types::I8 { + ctx.emit(Inst::movzx_rm_r( + ExtMode::BL, + RegMem::reg(regs::rax()), + Writable::from_reg(regs::rax()), + )); + } else { + // zero for unsigned opcodes. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0, + Writable::from_reg(regs::rdx()), + )); + } + + // Emit the actual idiv. + ctx.emit(Inst::div(size, kind.is_signed(), divisor)); + } + + // Move the result back into the destination reg. + if is_div { + // The quotient is in rax. + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + } + + Opcode::Umulhi | Opcode::Smulhi => { + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move lhs in %rax. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs, + input_ty, + )); + + // Emit the actual mul or imul. + let signed = op == Opcode::Smulhi; + ctx.emit(Inst::mul_hi(size, signed, rhs)); + + // Read the result from the high part (stored in %rdx). 
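+ // (x86's widening multiplies, in their 16/32/64-bit forms, leave the double-width product
+ // in %rdx:%rax, so after `mul_hi` the upper half, i.e. the umulhi/smulhi result, is read
+ // back out of %rdx.)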
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + + Opcode::GetPinnedReg => { + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64)); + } + + Opcode::SetPinnedReg => { + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::pinned_reg()), + src, + types::I64, + )); + } + + Opcode::Vconst => { + let used_constant = if let &InstructionData::UnaryConst { + constant_handle, .. + } = ctx.data(insn) + { + ctx.use_constant(VCodeConstantData::Pool( + constant_handle, + ctx.get_constant_data(constant_handle).clone(), + )) + } else { + unreachable!("vconst should always have unary_const format") + }; + // TODO use Inst::gen_constant() instead. + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::xmm_load_const(used_constant, dst, ty)); + } + + Opcode::RawBitcast => { + // A raw_bitcast is just a mechanism for correcting the type of V128 values (see + // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR + // instruction should emit no machine code but a move is necessary to give the register + // allocator a definition for the output virtual register. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); + } + + Opcode::Shuffle => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let lhs_ty = ctx.input_ty(insn, 0); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let mask = match ctx.get_immediate(insn) { + Some(DataValue::V128(bytes)) => bytes.to_vec(), + _ => unreachable!("shuffle should always have a 16-byte immediate"), + }; + + // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a + // 1 in the most significant position zeroes the lane. + let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + if rhs == lhs { + // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM + // register. We statically build `constructed_mask` to zero out any unknown lane + // indices (may not be completely necessary: verification could fail incorrect mask + // values) and fix the indexes to all point to the `dst` vector. + let constructed_mask = mask + .iter() + // If the mask is greater than 15 it still may be referring to a lane in b. + .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp, ty)); + // After loading the constructed mask in a temporary register, we use this to + // shuffle the `dst` register (remember that, in this case, it is the same as + // `src` so we disregard this register). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); + } else { + // If `lhs` and `rhs` are different, we must shuffle each separately and then OR + // them together. This is necessary due to PSHUFB semantics. As in the case above, + // we build the `constructed_mask` for each case statically. + + // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. 
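+ // (As a reminder of the PSHUFB behaviour both halves of this sequence rely on: a mask byte
+ // in 0..=15 selects that byte of the source, while a mask byte with its top bit set writes
+ // zero, so each operand is shuffled with the other operand's lanes zeroed and the two
+ // partial results are OR'd together at the end.)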
+ let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty); + ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); + let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); + + // PSHUFB the second argument, placing zeroes for unused lanes. + let constructed_mask = mask + .iter() + .map(|b| b.wrapping_sub(16)) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp2, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); + + // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers + // is not important). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); + + // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB + } + } + + Opcode::Swizzle => { + // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec + // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For + // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF + // semantics match the Wasm SIMD semantics for this instruction. + // The instruction format maps to variables like: %dst = swizzle %src, %mask + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src = put_input_in_reg(ctx, inputs[0]); + let swizzle_mask = put_input_in_reg(ctx, inputs[1]); + + // Inform the register allocator that `src` and `dst` should be in the same register. + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Create a mask for zeroing out-of-bounds lanes of the swizzle mask. + let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16); + static ZERO_MASK_VALUE: [u8; 16] = [ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, + ]; + let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE)); + ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty)); + + // Use the `zero_mask` on a writable `swizzle_mask`. + let swizzle_mask = Writable::from_reg(swizzle_mask); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddusb, + RegMem::from(zero_mask), + swizzle_mask, + )); + + // Shuffle `dst` using the fixed-up `swizzle_mask`. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(swizzle_mask), + dst, + )); + } + + Opcode::Insertlane => { + // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let in_vec = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 1); + debug_assert!(!src_ty.is_vector()); + let src = input_to_reg_mem(ctx, inputs[1]); + let lane = if let InstructionData::TernaryImm8 { imm, .. 
} = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < ty.lane_count() as u8); + + ctx.emit(Inst::gen_move(dst, in_vec, ty)); + emit_insert_lane(ctx, src, dst, lane, ty.lane_type()); + } + + Opcode::Extractlane => { + // The instruction format maps to variables like: %dst = extractlane %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < src_ty.lane_count() as u8); + + if !ty.is_float() { + let (sse_op, w_bit) = match ty.lane_bits() { + 8 => (SseOpcode::Pextrb, false), + 16 => (SseOpcode::Pextrw, false), + 32 => (SseOpcode::Pextrd, false), + 64 => (SseOpcode::Pextrd, true), + _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + } else { + if lane == 0 { + // Remove the extractlane instruction, leaving the float where it is. The upper + // bits will remain unchanged; for correctness, this relies on Cranelift type + // checking to avoid using those bits. + ctx.emit(Inst::gen_move(dst, src, ty)); + } else { + // Otherwise, shuffle the bits in `lane` to the lowest lane. + let sse_op = SseOpcode::Pshufd; + let mask = match src_ty { + // Move the value at `lane` to lane 0, copying existing value at lane 0 to + // other lanes. Again, this relies on Cranelift type checking to avoid + // using those bits. + types::F32X4 => 0b00_00_00_00 | lane, + // Move the value at `lane` 1 (we know it must be 1 because of the `if` + // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type + // checking assumption also applies here. + types::F64X2 => 0b11_10_11_10, + _ => unreachable!(), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false)); + } + } + } + + Opcode::Splat | Opcode::LoadSplat => { + let ty = ty.unwrap(); + assert_eq!(ty.bits(), 128); + let src_ty = ctx.input_ty(insn, 0); + assert!(src_ty.bits() < 128); + + let src = match op { + Opcode::Splat => input_to_reg_mem(ctx, inputs[0]), + Opcode::LoadSplat => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + let amode = lower_to_amode(ctx, inputs[0], offset); + RegMem::mem(amode) + } + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + + // We know that splat will overwrite all of the lanes of `dst` but it takes several + // instructions to do so. Because of the multiple instructions, there is no good way to + // declare `dst` a `def` except with the following pseudo-instruction. + ctx.emit(Inst::xmm_uninit_value(dst)); + + // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST* + // and VPBROADCAST*. + match ty.lane_bits() { + 8 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Shuffle the lowest byte lane to all other lanes. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) + } + 16 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + // Shuffle the lowest two lanes to all other lanes. 
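+ // (After those two inserts the lowest 32-bit dword of `dst` holds two copies of the 16-bit
+ // value, so PSHUFD with an immediate of 0 broadcasts dword 0 into every dword.)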
+ ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 32 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Shuffle the lowest lane to all other lanes. + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 64 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + } + _ => panic!("Invalid type to splat: {}", ty), + } + } + + Opcode::VanyTrue => { + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src)); + // If the ZF is not set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::NZ, dst)); + } + + Opcode::VallTrue => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = input_to_reg_mem(ctx, inputs[0]); + + let eq = |ty: Type| match ty.lane_bits() { + 8 => SseOpcode::Pcmpeqb, + 16 => SseOpcode::Pcmpeqw, + 32 => SseOpcode::Pcmpeqd, + 64 => SseOpcode::Pcmpeqq, + _ => panic!("Unable to find an instruction for {} for type: {}", op, ty), + }; + + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Compare to see what lanes are filled with all 1s. + ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r( + SseOpcode::Ptest, + RegMem::from(tmp), + tmp.to_reg(), + )); + // If the ZF is set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::VhighBits => { + let src = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 0); + debug_assert!(src_ty.is_vector() && src_ty.bits() == 128); + let dst = get_output_reg(ctx, outputs[0]); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + + // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for + // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode, + // the instruction can access additional registers when used with a REX.R prefix. The + // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development + // Manual, vol. 2). This being the case, we will always clear REX.W since its use is + // unnecessary (`OperandSize` is used for setting/clearing REX.W). + let size = OperandSize::Size32; + + match src_ty { + types::I8X16 | types::B8X16 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size)) + } + types::I32X4 | types::B32X4 | types::F32X4 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size)) + } + types::I64X2 | types::B64X2 | types::F64X2 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size)) + } + types::I16X8 | types::B16X8 => { + // There is no x86 instruction for extracting the high bit of 16-bit lanes so + // here we: + // - duplicate the 16-bit lanes of `src` into 8-bit lanes: + // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] + // - use PMOVMSKB to gather the high bits; now we have duplicates, though + // - shift away the bottom 8 high bits to remove the duplicates. 
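+ // (Rough sketch: PACKSSWB of `src` with itself produces 16 bytes whose sign bits repeat
+ // the 8 word sign bits twice, PMOVMSKB gathers them into a 16-bit mask whose two halves
+ // are identical, and the logical shift by 8 leaves a single copy of the 8 lane bits.)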
+ let tmp = ctx.alloc_tmp(RegClass::V128, src_ty); + ctx.emit(Inst::gen_move(tmp, src, src_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp)); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Pmovmskb, + tmp.to_reg(), + dst, + size, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst)); + } + _ => unimplemented!("unknown input type {} for {}", src_ty, op), + } + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + _ => unimplemented!("unimplemented lowering for opcode {:?}", op), + } + + Ok(()) +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for X64Backend { + type MInst = Inst; + + fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple) + } + + fn lower_branch_group<C: LowerCtx<I = Inst>>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + trace!( + "lowering two-branch group: opcodes are {:?} and {:?}", + op0, + op1 + ); + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + + let taken = targets[0]; + let not_taken = match op1 { + Opcode::Jump => targets[1], + Opcode::Fallthrough => fallthrough.unwrap(), + _ => unreachable!(), // assert above. 
+ }; + + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + + let src_ty = ctx.input_ty(branches[0], 0); + + if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + + let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + + let cc = CC::from_intcc(cond_code); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1.invert(), not_taken)); + ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken)); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1, taken)); + ctx.emit(Inst::jmp_cond(cc2, taken, not_taken)); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let src = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let cc = match op0 { + Opcode::Brz => CC::Z, + Opcode::Brnz => CC::NZ, + _ => unreachable!(), + }; + let size_bytes = src_ty.bytes() as u8; + ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("brz/brnz with non-int type {:?}", src_ty); + } + } + + Opcode::BrIcmp => { + let src_ty = ctx.input_ty(branches[0], 0); + if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let lhs = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let rhs = input_to_reg_mem_imm( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + ); + let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap()); + let byte_size = src_ty.bytes() as u8; + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("bricmp with non-int type {:?}", src_ty); + } + } + + _ => panic!("unexpected branch opcode: {:?}", op0), + } + } else { + assert_eq!(branches.len(), 1); + + // Must be an unconditional branch or trap. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + ctx.emit(Inst::jmp_known(targets[0])); + } + + Opcode::BrTable => { + let jt_size = targets.len() - 1; + assert!(jt_size <= u32::max_value() as usize); + let jt_size = jt_size as u32; + + let idx = extend_input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ExtSpec::ZeroExtendTo32, + ); + + // Bounds-check (compute flags from idx - jt_size) and branch to default. 
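+ // (Sketch: the 32-bit compare below computes flags for `idx - jt_size`; the JmpTableSeq
+ // pseudo-instruction then uses those flags to branch to `default_target` whenever the
+ // zero-extended index is not below the table size.)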
+ ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx)); + + // Emit the compound instruction that does: + // + // lea $jt, %rA + // movsbl [%rA, %rIndex, 2], %rB + // add %rB, %rA + // j *%rA + // [jt entries] + // + // This must be *one* instruction in the vcode because we cannot allow regalloc + // to insert any spills/fills in the middle of the sequence; otherwise, the + // lea PC-rel offset to the jumptable would be incorrect. (The alternative + // is to introduce a relocation pass for inlined jumptables, which is much + // worse.) + + // This temporary is used as a signed integer of 64-bits (to hold addresses). + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + // This temporary is used as a signed integer of 32-bits (for the wasm-table + // index) and then 64-bits (address addend). The small lie about the I64 type + // is benign, since the temporary is dead after this instruction (and its + // Cranelift type is thus unused). + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + let targets_for_term: Vec<MachLabel> = targets.to_vec(); + let default_target = targets[0]; + + let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect(); + + ctx.emit(Inst::JmpTableSeq { + idx, + tmp1, + tmp2, + default_target, + targets: jt_targets, + targets_for_term, + }); + } + + _ => panic!("Unknown branch type {:?}", op), + } + } + + Ok(()) + } + + fn maybe_pinned_reg(&self) -> Option<Reg> { + Some(regs::pinned_reg()) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs new file mode 100644 index 0000000000..fd4444498d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs @@ -0,0 +1,149 @@ +//! X86_64-bit Instruction Set Architecture. + +use self::inst::EmitInfo; + +use super::TargetIsa; +use crate::ir::{condcodes::IntCC, Function}; +use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings}; +use crate::isa::Builder as IsaBuilder; +use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; +use crate::result::CodegenResult; +use crate::settings::{self as shared_settings, Flags}; +use alloc::boxed::Box; +use regalloc::{PrettyPrint, RealRegUniverse}; +use target_lexicon::Triple; + +mod abi; +mod inst; +mod lower; +mod settings; + +/// An X64 backend. +pub(crate) struct X64Backend { + triple: Triple, + flags: Flags, + x64_flags: x64_settings::Flags, + reg_universe: RealRegUniverse, +} + +impl X64Backend { + /// Create a new X64 backend with the given (shared) flags. + fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self { + let reg_universe = create_reg_universe_systemv(&flags); + Self { + triple, + flags, + x64_flags, + reg_universe, + } + } + + fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> { + // This performs lowering to VCode, register-allocates the code, computes + // block layout and finalizes branches. The result is ready for binary emission. 
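+ // (Concretely: `compile::compile` drives lowering through `LowerBackend::lower` above,
+ // then register allocation and branch finalization; `compile_function` below emits the
+ // finished VCode into a code buffer and packages the result.)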
+ let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone()); + let abi = Box::new(abi::X64ABICallee::new(&func, flags)?); + compile::compile::<Self>(&func, self, abi, emit_info) + } +} + +impl MachBackend for X64Backend { + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult> { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags.clone())?; + + let buffer = vcode.emit(); + let buffer = buffer.finish(); + let frame_size = vcode.frame_size(); + let unwind_info = vcode.unwind_info()?; + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags)))) + } else { + None + }; + + Ok(MachCompileResult { + buffer, + frame_size, + disasm, + unwind_info, + }) + } + + fn flags(&self) -> &Flags { + &self.flags + } + + fn name(&self) -> &'static str { + "x64" + } + + fn triple(&self) -> Triple { + self.triple.clone() + } + + fn reg_universe(&self) -> &RealRegUniverse { + &self.reg_universe + } + + fn unsigned_add_overflow_condition(&self) -> IntCC { + // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on + // overflow of an add. + IntCC::UnsignedGreaterThanOrEqual + } + + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // unsigned `>=`; this corresponds to the carry flag set on x86, which happens on + // underflow of a subtract (carry is borrow for subtract). + IntCC::UnsignedGreaterThanOrEqual + } + + #[cfg(feature = "unwind")] + fn emit_unwind_info( + &self, + result: &MachCompileResult, + kind: crate::machinst::UnwindInfoKind, + ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + use crate::isa::unwind::UnwindInfo; + use crate::machinst::UnwindInfoKind; + Ok(match (result.unwind_info.as_ref(), kind) { + (Some(info), UnwindInfoKind::SystemV) => { + inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV) + } + (Some(_info), UnwindInfoKind::Windows) => { + //TODO inst::unwind::winx64::create_unwind_info(info.clone())?.map(|u| UnwindInfo::WindowsX64(u)) + None + } + _ => None, + }) + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + Some(inst::unwind::systemv::create_cie()) + } +} + +/// Create a new `isa::Builder`. +pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder { + IsaBuilder { + triple, + setup: x64_settings::builder(), + constructor: isa_constructor, + } +} + +fn isa_constructor( + triple: Triple, + shared_flags: Flags, + builder: shared_settings::Builder, +) -> Box<dyn TargetIsa> { + let isa_flags = x64_settings::Flags::new(&shared_flags, builder); + let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags); + Box::new(TargetIsaAdapter::new(backend)) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs new file mode 100644 index 0000000000..c5371bb132 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs @@ -0,0 +1,9 @@ +//! x86 Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/x86/settings.rs`. 
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs")); diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs new file mode 100644 index 0000000000..5119bb3241 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/abi.rs @@ -0,0 +1,1093 @@ +//! x86 ABI implementation. + +use super::super::settings as shared_settings; +use super::registers::{FPR, GPR, RU}; +use super::settings as isa_settings; +use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion}; +use crate::cursor::{Cursor, CursorPosition, EncCursor}; +use crate::ir; +use crate::ir::immediates::Imm64; +use crate::ir::stackslot::{StackOffset, StackSize}; +use crate::ir::types; +use crate::ir::{ + get_probestack_funcref, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, InstBuilder, + ValueLoc, +}; +use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa}; +use crate::regalloc::RegisterSet; +use crate::result::CodegenResult; +use crate::stack_layout::layout_stack; +use alloc::borrow::Cow; +use core::i32; +use target_lexicon::{PointerWidth, Triple}; + +/// Argument registers for x86-64 +static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9]; + +/// Return value registers. +static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx]; + +/// Argument registers for x86-64, when using windows fastcall +static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9]; + +/// Return value registers for x86-64, when using windows fastcall +static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax]; + +/// The win64 fastcall ABI uses some shadow stack space, allocated by the caller, that can be used +/// by the callee for temporary values. +/// +/// [1] "Space is allocated on the call stack as a shadow store for callees to save" This shadow +/// store contains the parameters which are passed through registers (ARG_GPRS) and is eventually +/// used by the callee to save & restore the values of the arguments. +/// +/// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling +/// convention reserves spill space for parameters, you don’t have to use them as such" +const WIN_SHADOW_STACK_SPACE: StackSize = 32; + +/// Stack alignment requirement for functions. +/// +/// 16 bytes is the perfect stack alignment, because: +/// +/// - On Win64, "The primary exceptions are the stack pointer and malloc or alloca memory, which +/// are aligned to 16 bytes in order to aid performance". +/// - The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but newer versions use a +/// 16-byte aligned stack pointer. +/// - This allows using aligned loads and stores on SIMD vectors of 16 bytes that are located +/// higher up in the stack. 
+const STACK_ALIGNMENT: u32 = 16; + +#[derive(Clone)] +struct Args { + pointer_bytes: u8, + pointer_bits: u8, + pointer_type: ir::Type, + gpr: &'static [RU], + gpr_used: usize, + fpr_limit: usize, + fpr_used: usize, + offset: u32, + call_conv: CallConv, + shared_flags: shared_settings::Flags, + #[allow(dead_code)] + isa_flags: isa_settings::Flags, + assigning_returns: bool, +} + +impl Args { + fn new( + bits: u8, + gpr: &'static [RU], + fpr_limit: usize, + call_conv: CallConv, + shared_flags: &shared_settings::Flags, + isa_flags: &isa_settings::Flags, + assigning_returns: bool, + ) -> Self { + let offset = if call_conv.extends_windows_fastcall() { + WIN_SHADOW_STACK_SPACE + } else { + 0 + }; + + Self { + pointer_bytes: bits / 8, + pointer_bits: bits, + pointer_type: ir::Type::int(u16::from(bits)).unwrap(), + gpr, + gpr_used: 0, + fpr_limit, + fpr_used: 0, + offset, + call_conv, + shared_flags: shared_flags.clone(), + isa_flags: isa_flags.clone(), + assigning_returns, + } + } +} + +impl ArgAssigner for Args { + fn assign(&mut self, arg: &AbiParam) -> ArgAction { + if let ArgumentPurpose::StructArgument(size) = arg.purpose { + if self.call_conv != CallConv::SystemV { + panic!( + "The sarg argument purpose is not yet implemented for non-systemv call conv {:?}", + self.call_conv, + ); + } + let loc = ArgumentLoc::Stack(self.offset as i32); + self.offset += size; + debug_assert!(self.offset <= i32::MAX as u32); + return ArgAction::AssignAndChangeType(loc, types::SARG_T); + } + + let ty = arg.value_type; + + if ty.bits() > u16::from(self.pointer_bits) { + if !self.assigning_returns && self.call_conv.extends_windows_fastcall() { + // "Any argument that doesn't fit in 8 bytes, or isn't + // 1, 2, 4, or 8 bytes, must be passed by reference" + return ValueConversion::Pointer(self.pointer_type).into(); + } else if !ty.is_vector() && !ty.is_float() { + // On SystemV large integers and booleans are broken down to fit in a register. + return ValueConversion::IntSplit.into(); + } + } + + // Vectors should stay in vector registers unless SIMD is not enabled--then they are split + if ty.is_vector() { + if self.shared_flags.enable_simd() { + let reg = FPR.unit(self.fpr_used); + self.fpr_used += 1; + return ArgumentLoc::Reg(reg).into(); + } + return ValueConversion::VectorSplit.into(); + } + + // Small integers are extended to the size of a pointer register. + if ty.is_int() && ty.bits() < u16::from(self.pointer_bits) { + match arg.extension { + ArgumentExtension::None => {} + ArgumentExtension::Uext => return ValueConversion::Uext(self.pointer_type).into(), + ArgumentExtension::Sext => return ValueConversion::Sext(self.pointer_type).into(), + } + } + + // Handle special-purpose arguments. + if ty.is_int() && self.call_conv.extends_baldrdash() { + match arg.purpose { + // This is SpiderMonkey's `WasmTlsReg`. + ArgumentPurpose::VMContext => { + return ArgumentLoc::Reg(if self.pointer_bits == 64 { + RU::r14 + } else { + RU::rsi + } as RegUnit) + .into(); + } + // This is SpiderMonkey's `WasmTableCallSigReg`. + ArgumentPurpose::SignatureId => { + return ArgumentLoc::Reg(if self.pointer_bits == 64 { + RU::r10 + } else { + RU::rcx + } as RegUnit) + .into() + } + _ => {} + } + } + + // Try to use a GPR. + if !ty.is_float() && self.gpr_used < self.gpr.len() { + let reg = self.gpr[self.gpr_used] as RegUnit; + self.gpr_used += 1; + return ArgumentLoc::Reg(reg).into(); + } + + // Try to use an FPR. 
+ let fpr_offset = if self.call_conv.extends_windows_fastcall() { + // Float and general registers on windows share the same parameter index. + // The used register depends entirely on the parameter index: Even if XMM0 + // is not used for the first parameter, it cannot be used for the second parameter. + debug_assert_eq!(self.fpr_limit, self.gpr.len()); + &mut self.gpr_used + } else { + &mut self.fpr_used + }; + + if ty.is_float() && *fpr_offset < self.fpr_limit { + let reg = FPR.unit(*fpr_offset); + *fpr_offset += 1; + return ArgumentLoc::Reg(reg).into(); + } + + // Assign a stack location. + let loc = ArgumentLoc::Stack(self.offset as i32); + self.offset += u32::from(self.pointer_bytes); + debug_assert!(self.offset <= i32::MAX as u32); + loc.into() + } +} + +/// Legalize `sig`. +pub fn legalize_signature( + sig: &mut Cow<ir::Signature>, + triple: &Triple, + _current: bool, + shared_flags: &shared_settings::Flags, + isa_flags: &isa_settings::Flags, +) { + let bits; + let mut args; + + match triple.pointer_width().unwrap() { + PointerWidth::U16 => panic!(), + PointerWidth::U32 => { + bits = 32; + args = Args::new(bits, &[], 0, sig.call_conv, shared_flags, isa_flags, false); + } + PointerWidth::U64 => { + bits = 64; + args = if sig.call_conv.extends_windows_fastcall() { + Args::new( + bits, + &ARG_GPRS_WIN_FASTCALL_X64[..], + 4, + sig.call_conv, + shared_flags, + isa_flags, + false, + ) + } else { + Args::new( + bits, + &ARG_GPRS[..], + 8, + sig.call_conv, + shared_flags, + isa_flags, + false, + ) + }; + } + } + + let (ret_regs, ret_fpr_limit) = if sig.call_conv.extends_windows_fastcall() { + // windows-x64 calling convention only uses XMM0 or RAX for return values + (&RET_GPRS_WIN_FASTCALL_X64[..], 1) + } else { + (&RET_GPRS[..], 2) + }; + + let mut rets = Args::new( + bits, + ret_regs, + ret_fpr_limit, + sig.call_conv, + shared_flags, + isa_flags, + true, + ); + + // If we don't have enough available return registers + // to fit all of the return values, we need to backtrack and start + // assigning locations all over again with a different strategy. In order to + // do that, we need a copy of the original assigner for the returns. + let mut backup_rets = rets.clone(); + + if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) { + if new_returns + .iter() + .filter(|r| r.purpose == ArgumentPurpose::Normal) + .any(|r| !r.location.is_reg()) + { + // The return values couldn't all fit into available return + // registers. Introduce the use of a struct-return parameter. + debug_assert!(!sig.uses_struct_return_param()); + + // We're using the first register for the return pointer parameter. + let mut ret_ptr_param = AbiParam { + value_type: args.pointer_type, + purpose: ArgumentPurpose::StructReturn, + extension: ArgumentExtension::None, + location: ArgumentLoc::Unassigned, + legalized_to_pointer: false, + }; + match args.assign(&ret_ptr_param) { + ArgAction::Assign(ArgumentLoc::Reg(reg)) => { + ret_ptr_param.location = ArgumentLoc::Reg(reg); + sig.to_mut().params.push(ret_ptr_param); + } + _ => unreachable!("return pointer should always get a register assignment"), + } + + // We're using the first return register for the return pointer (like + // sys v does). 
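+ // (Roughly: when the normal return values cannot all be register-assigned, a StructReturn
+ // pointer is introduced both as an extra parameter, in the first available argument
+ // register, and as an extra return value in the first return register, which the code
+ // just below sets up.)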
+ let mut ret_ptr_return = AbiParam { + value_type: args.pointer_type, + purpose: ArgumentPurpose::StructReturn, + extension: ArgumentExtension::None, + location: ArgumentLoc::Unassigned, + legalized_to_pointer: false, + }; + match backup_rets.assign(&ret_ptr_return) { + ArgAction::Assign(ArgumentLoc::Reg(reg)) => { + ret_ptr_return.location = ArgumentLoc::Reg(reg); + sig.to_mut().returns.push(ret_ptr_return); + } + _ => unreachable!("return pointer should always get a register assignment"), + } + + sig.to_mut().returns.retain(|ret| { + // Either this is the return pointer, in which case we want to keep + // it, or else assume that it is assigned for a reason and doesn't + // conflict with our return pointering legalization. + debug_assert_eq!( + ret.location.is_assigned(), + ret.purpose != ArgumentPurpose::Normal + ); + ret.location.is_assigned() + }); + + if let Some(new_returns) = legalize_args(&sig.returns, &mut backup_rets) { + sig.to_mut().returns = new_returns; + } + } else { + sig.to_mut().returns = new_returns; + } + } + + if let Some(new_params) = legalize_args(&sig.params, &mut args) { + sig.to_mut().params = new_params; + } +} + +/// Get register class for a type appearing in a legalized signature. +pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass { + if ty.is_int() || ty.is_bool() || ty.is_ref() { + GPR + } else { + FPR + } +} + +/// Get the set of allocatable registers for `func`. +pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> RegisterSet { + let mut regs = RegisterSet::new(); + regs.take(GPR, RU::rsp as RegUnit); + regs.take(GPR, RU::rbp as RegUnit); + + // 32-bit arch only has 8 registers. + if triple.pointer_width().unwrap() != PointerWidth::U64 { + for i in 8..16 { + regs.take(GPR, GPR.unit(i)); + regs.take(FPR, FPR.unit(i)); + } + if flags.enable_pinned_reg() { + unimplemented!("Pinned register not implemented on x86-32."); + } + } else { + // Choose r15 as the pinned register on 64-bits: it is non-volatile on native ABIs and + // isn't the fixed output register of any instruction. + if flags.enable_pinned_reg() { + regs.take(GPR, RU::r15 as RegUnit); + } + } + + regs +} + +/// Get the set of callee-saved general-purpose registers. +fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { + match isa.triple().pointer_width().unwrap() { + PointerWidth::U16 => panic!(), + PointerWidth::U32 => &[RU::rbx, RU::rsi, RU::rdi], + PointerWidth::U64 => { + if call_conv.extends_windows_fastcall() { + // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 are + // considered nonvolatile and must be saved and restored by a function that uses + // them." + // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention + // RSP & RBP are not listed below, since they are restored automatically during + // a function call. If that wasn't the case, function calls (RET) would not work. + &[ + RU::rbx, + RU::rdi, + RU::rsi, + RU::r12, + RU::r13, + RU::r14, + RU::r15, + ] + } else { + &[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15] + } + } + } +} + +/// Get the set of callee-saved floating-point (SIMD) registers. +fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { + match isa.triple().pointer_width().unwrap() { + PointerWidth::U16 => panic!(), + PointerWidth::U32 => &[], + PointerWidth::U64 => { + if call_conv.extends_windows_fastcall() { + // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved + // and restored by a function that uses them." 
+ // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of + // February 5th, 2020. + &[ + RU::xmm6, + RU::xmm7, + RU::xmm8, + RU::xmm9, + RU::xmm10, + RU::xmm11, + RU::xmm12, + RU::xmm13, + RU::xmm14, + RU::xmm15, + ] + } else { + &[] + } + } + } +} + +/// Get the set of callee-saved registers that are used. +fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet { + let mut all_callee_saved = RegisterSet::empty(); + for reg in callee_saved_gprs(isa, func.signature.call_conv) { + all_callee_saved.free(GPR, *reg as RegUnit); + } + for reg in callee_saved_fprs(isa, func.signature.call_conv) { + all_callee_saved.free(FPR, *reg as RegUnit); + } + + let mut used = RegisterSet::empty(); + for value_loc in func.locations.values() { + // Note that `value_loc` here contains only a single unit of a potentially multi-unit + // register. We don't use registers that overlap each other in the x86 ISA, but in others + // we do. So this should not be blindly reused. + if let ValueLoc::Reg(ru) = *value_loc { + if GPR.contains(ru) { + if !used.is_avail(GPR, ru) { + used.free(GPR, ru); + } + } else if FPR.contains(ru) { + if !used.is_avail(FPR, ru) { + used.free(FPR, ru); + } + } + } + } + + // regmove and regfill instructions may temporarily divert values into other registers, + // and these are not reflected in `func.locations`. Scan the function for such instructions + // and note which callee-saved registers they use. + // + // TODO: Consider re-evaluating how regmove/regfill/regspill work and whether it's possible + // to avoid this step. + for block in &func.layout { + for inst in func.layout.block_insts(block) { + match func.dfg[inst] { + ir::instructions::InstructionData::RegMove { dst, .. } + | ir::instructions::InstructionData::RegFill { dst, .. } => { + if GPR.contains(dst) { + if !used.is_avail(GPR, dst) { + used.free(GPR, dst); + } + } else if FPR.contains(dst) { + if !used.is_avail(FPR, dst) { + used.free(FPR, dst); + } + } + } + _ => (), + } + } + } + + used.intersect(&all_callee_saved); + used +} + +pub fn prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> { + match func.signature.call_conv { + // For now, just translate fast and cold as system_v. 
+ CallConv::Fast | CallConv::Cold | CallConv::SystemV => { + system_v_prologue_epilogue(func, isa) + } + CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa), + CallConv::BaldrdashSystemV | CallConv::BaldrdashWindows => { + baldrdash_prologue_epilogue(func, isa) + } + CallConv::Probestack => unimplemented!("probestack calling convention"), + CallConv::Baldrdash2020 => unimplemented!("Baldrdash ABI 2020"), + } +} + +fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> { + debug_assert!( + !isa.flags().enable_probestack(), + "baldrdash does not expect cranelift to emit stack probes" + ); + + let word_size = StackSize::from(isa.pointer_bytes()); + let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() { + WIN_SHADOW_STACK_SPACE + } else { + 0 + }; + + let bytes = + StackSize::from(isa.flags().baldrdash_prologue_words()) * word_size + shadow_store_size; + + let mut ss = ir::StackSlotData::new(ir::StackSlotKind::IncomingArg, bytes); + ss.offset = Some(-(bytes as StackOffset)); + func.stack_slots.push(ss); + + let is_leaf = func.is_leaf(); + layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)?; + Ok(()) +} + +/// Implementation of the fastcall-based Win64 calling convention described at [1] +/// [1] https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention +fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> { + if isa.triple().pointer_width().unwrap() != PointerWidth::U64 { + panic!("TODO: windows-fastcall: x86-32 not implemented yet"); + } + + // The reserved stack area is composed of: + // return address + frame pointer + all callee-saved registers + // + // Pushing the return address is an implicit function of the `call` + // instruction. Each of the others we will then push explicitly. Then we + // will adjust the stack pointer to make room for the rest of the required + // space for this frame. + let csrs = callee_saved_regs_used(isa, func); + let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32; + let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32; + let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size; + + // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed + if fpsr_stack_size > 0 { + csr_stack_size = (csr_stack_size + 15) & !15; + } + + func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::IncomingArg, + size: csr_stack_size, + offset: Some(-(csr_stack_size as StackOffset)), + }); + + let is_leaf = func.is_leaf(); + + // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space + if !is_leaf { + // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack + func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::ExplicitSlot, + size: WIN_SHADOW_STACK_SPACE, + offset: None, + }); + } + + let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? 
as i32; + + // Subtract the GPR saved register size from the local size because pushes are used for the saves + let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32); + + // Add CSRs to function signature + let reg_type = isa.pointer_type(); + let sp_arg_index = if fpsr_stack_size > 0 { + let sp_arg = ir::AbiParam::special_reg( + reg_type, + ir::ArgumentPurpose::CalleeSaved, + RU::rsp as RegUnit, + ); + let index = func.signature.params.len(); + func.signature.params.push(sp_arg); + Some(index) + } else { + None + }; + let fp_arg = ir::AbiParam::special_reg( + reg_type, + ir::ArgumentPurpose::FramePointer, + RU::rbp as RegUnit, + ); + func.signature.params.push(fp_arg); + func.signature.returns.push(fp_arg); + + for gp_csr in csrs.iter(GPR) { + let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, gp_csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + for fp_csr in csrs.iter(FPR) { + // The calling convention described in + // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires + // preserving the low 128 bits of XMM6-XMM15. + let csr_arg = + ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + // Set up the cursor and insert the prologue + let entry_block = func.layout.entry_block().expect("missing entry block"); + let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); + insert_common_prologue( + &mut pos, + local_stack_size, + reg_type, + &csrs, + sp_arg_index.is_some(), + isa, + ); + + // Reset the cursor and insert the epilogue + let mut pos = pos.at_position(CursorPosition::Nowhere); + insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index); + + Ok(()) +} + +/// Insert a System V-compatible prologue and epilogue. +fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> { + let pointer_width = isa.triple().pointer_width().unwrap(); + let word_size = pointer_width.bytes() as usize; + + let csrs = callee_saved_regs_used(isa, func); + assert!( + csrs.iter(FPR).len() == 0, + "SysV ABI does not have callee-save SIMD registers" + ); + + // The reserved stack area is composed of: + // return address + frame pointer + all callee-saved registers + // + // Pushing the return address is an implicit function of the `call` + // instruction. Each of the others we will then push explicitly. Then we + // will adjust the stack pointer to make room for the rest of the required + // space for this frame. + let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; + func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::IncomingArg, + size: csr_stack_size as u32, + offset: Some(-csr_stack_size), + }); + + let is_leaf = func.is_leaf(); + let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32; + let local_stack_size = i64::from(total_stack_size - csr_stack_size); + + // Add CSRs to function signature + let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap(); + // On X86-32 all parameters, including vmctx, are passed on stack, and we need + // to extract vmctx from the stack before we can save the frame pointer. 
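+    // To make that possible, a special CalleeSaved %rsp parameter is appended
+    // below and its index recorded, so that code such as `interpret_gv` can
+    // later load the vmctx value from the stack relative to it; on x86-64 the
+    // vmctx arrives in a register and no SP parameter is needed here.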
+ let sp_arg_index = if isa.pointer_bits() == 32 { + let sp_arg = ir::AbiParam::special_reg( + reg_type, + ir::ArgumentPurpose::CalleeSaved, + RU::rsp as RegUnit, + ); + let index = func.signature.params.len(); + func.signature.params.push(sp_arg); + Some(index) + } else { + None + }; + let fp_arg = ir::AbiParam::special_reg( + reg_type, + ir::ArgumentPurpose::FramePointer, + RU::rbp as RegUnit, + ); + func.signature.params.push(fp_arg); + func.signature.returns.push(fp_arg); + + for csr in csrs.iter(GPR) { + let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + // Set up the cursor and insert the prologue + let entry_block = func.layout.entry_block().expect("missing entry block"); + let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); + insert_common_prologue( + &mut pos, + local_stack_size, + reg_type, + &csrs, + sp_arg_index.is_some(), + isa, + ); + + // Reset the cursor and insert the epilogue + let mut pos = pos.at_position(CursorPosition::Nowhere); + insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index); + + Ok(()) +} + +/// Insert the prologue for a given function. +/// This is used by common calling conventions such as System V. +fn insert_common_prologue( + pos: &mut EncCursor, + stack_size: i64, + reg_type: ir::types::Type, + csrs: &RegisterSet, + has_sp_param: bool, + isa: &dyn TargetIsa, +) { + let sp = if has_sp_param { + let block = pos.current_block().expect("missing block under cursor"); + let sp = pos.func.dfg.append_block_param(block, reg_type); + pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit); + Some(sp) + } else { + None + }; + + // If this is a leaf function with zero stack, then there's no need to + // insert a stack check since it can't overflow anything and + // forward-progress is guarantee so long as loop are handled anyway. + // + // If this has a stack size it could stack overflow, or if it isn't a leaf + // it could be part of a long call chain which we need to check anyway. + // + // First we look for the stack limit as a special argument to the function, + // and failing that we see if a custom stack limit factory has been provided + // which will be used to likely calculate the stack limit from the arguments + // or perhaps constants. 
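+    //
+    // As a rough illustration: with a 4 KiB frame and a stack limit coming in
+    // as a global value, the code emitted by `interpret_gv` and
+    // `insert_stack_check` below boils down to something like
+    //
+    //     v0 = load.i64 ...            ; materialize the stack limit
+    //     v1 = iadd_imm v0, 0x1000     ; sp_threshold
+    //     ifcmp_sp v1
+    //     trapif uge, ..., stack_overflow
+    //
+    // with an additional pre-flight `ifcmp_sp`/`trapif` pair against the raw
+    // limit when the frame is 32 KiB or larger, guarding the `iadd_imm`
+    // against overflow.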
+ if stack_size > 0 || !pos.func.is_leaf() { + let scratch = ir::ValueLoc::Reg(RU::rax as RegUnit); + let stack_limit_arg = match pos.func.special_param(ArgumentPurpose::StackLimit) { + Some(arg) => { + let copy = pos.ins().copy(arg); + pos.func.locations[copy] = scratch; + Some(copy) + } + None => pos + .func + .stack_limit + .map(|gv| interpret_gv(pos, gv, sp, scratch)), + }; + if let Some(stack_limit_arg) = stack_limit_arg { + insert_stack_check(pos, stack_size, stack_limit_arg); + } + } + + // Append param to entry block + let block = pos.current_block().expect("missing block under cursor"); + let fp = pos.func.dfg.append_block_param(block, reg_type); + pos.func.locations[fp] = ir::ValueLoc::Reg(RU::rbp as RegUnit); + + pos.ins().x86_push(fp); + + let mov_sp_inst = pos + .ins() + .copy_special(RU::rsp as RegUnit, RU::rbp as RegUnit); + + let mut last_csr_push = None; + for reg in csrs.iter(GPR) { + // Append param to entry block + let csr_arg = pos.func.dfg.append_block_param(block, reg_type); + + // Assign it a location + pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + last_csr_push = Some(pos.ins().x86_push(csr_arg)); + } + + // Allocate stack frame storage. + let mut adjust_sp_inst = None; + if stack_size > 0 { + if isa.flags().enable_probestack() && stack_size > (1 << isa.flags().probestack_size_log2()) + { + // Emit a stack probe. + let rax = RU::rax as RegUnit; + let rax_val = ir::ValueLoc::Reg(rax); + + // The probestack function expects its input in %rax. + let arg = pos.ins().iconst(reg_type, stack_size); + pos.func.locations[arg] = rax_val; + + // Call the probestack function. + let callee = get_probestack_funcref(pos.func, reg_type, rax, isa); + + // Make the call. + let call = if !isa.flags().is_pic() + && isa.triple().pointer_width().unwrap() == PointerWidth::U64 + && !pos.func.dfg.ext_funcs[callee].colocated + { + // 64-bit non-PIC non-colocated calls need to be legalized to call_indirect. + // Use r11 as it may be clobbered under all supported calling conventions. + let r11 = RU::r11 as RegUnit; + let sig = pos.func.dfg.ext_funcs[callee].signature; + let addr = pos.ins().func_addr(reg_type, callee); + pos.func.locations[addr] = ir::ValueLoc::Reg(r11); + pos.ins().call_indirect(sig, addr, &[arg]) + } else { + // Otherwise just do a normal call. + pos.ins().call(callee, &[arg]) + }; + + // If the probestack function doesn't adjust sp, do it ourselves. + if !isa.flags().probestack_func_adjusts_sp() { + let result = pos.func.dfg.inst_results(call)[0]; + pos.func.locations[result] = rax_val; + adjust_sp_inst = Some(pos.ins().adjust_sp_down(result)); + } + } else { + // Simply decrement the stack pointer. + adjust_sp_inst = Some(pos.ins().adjust_sp_down_imm(Imm64::new(stack_size))); + } + } + + // With the stack pointer adjusted, save any callee-saved floating point registers via offset + // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes + let mut last_fpr_save = None; + + for (i, reg) in csrs.iter(FPR).enumerate() { + // Append param to entry block + let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2); + + // Since regalloc has already run, we must assign a location. 
+ pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + + // Offset to where the register is saved relative to RSP, accounting for FPR save alignment + let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64 + + (stack_size % types::F64X2.bytes() as i64); + + last_fpr_save = Some(pos.ins().store( + ir::MemFlags::trusted(), + csr_arg, + sp.expect("FPR save requires SP param"), + (stack_size - offset) as i32, + )); + } + + pos.func.prologue_end = Some( + last_fpr_save + .or(adjust_sp_inst) + .or(last_csr_push) + .unwrap_or(mov_sp_inst), + ); +} + +/// Inserts code necessary to calculate `gv`. +/// +/// Note that this is typically done with `ins().global_value(...)` but that +/// requires legalization to run to encode it, and we're running super late +/// here in the backend where legalization isn't possible. To get around this +/// we manually interpret the `gv` specified and do register allocation for +/// intermediate values. +/// +/// This is an incomplete implementation of loading `GlobalValue` values to get +/// compared to the stack pointer, but currently it serves enough functionality +/// to get this implemented in `wasmtime` itself. This'll likely get expanded a +/// bit over time! +fn interpret_gv( + pos: &mut EncCursor, + gv: ir::GlobalValue, + sp: Option<ir::Value>, + scratch: ir::ValueLoc, +) -> ir::Value { + match pos.func.global_values[gv] { + ir::GlobalValueData::VMContext => { + let vmctx_index = pos + .func + .signature + .special_param_index(ir::ArgumentPurpose::VMContext) + .expect("no vmcontext parameter found"); + match pos.func.signature.params[vmctx_index] { + AbiParam { + location: ArgumentLoc::Reg(_), + .. + } => { + let entry = pos.func.layout.entry_block().unwrap(); + pos.func.dfg.block_params(entry)[vmctx_index] + } + AbiParam { + location: ArgumentLoc::Stack(offset), + value_type, + .. + } => { + let offset = + offset + i32::from(pos.isa.pointer_bytes() * (1 + vmctx_index as u8)); + // The following access can be marked `trusted` because it is a load of an argument. We + // know it is safe because it was safe to write it in preparing this function call. + let ret = + pos.ins() + .load(value_type, ir::MemFlags::trusted(), sp.unwrap(), offset); + pos.func.locations[ret] = scratch; + return ret; + } + AbiParam { + location: ArgumentLoc::Unassigned, + .. + } => unreachable!(), + } + } + ir::GlobalValueData::Load { + base, + offset, + global_type, + readonly: _, + } => { + let base = interpret_gv(pos, base, sp, scratch); + let ret = pos + .ins() + .load(global_type, ir::MemFlags::trusted(), base, offset); + pos.func.locations[ret] = scratch; + return ret; + } + ref other => panic!("global value for stack limit not supported: {}", other), + } +} + +/// Insert a check that generates a trap if the stack pointer goes +/// below a value in `stack_limit_arg`. +fn insert_stack_check(pos: &mut EncCursor, stack_size: i64, stack_limit_arg: ir::Value) { + use crate::ir::condcodes::IntCC; + + // Our stack pointer, after subtracting `stack_size`, must not be below + // `stack_limit_arg`. To do this we're going to add `stack_size` to + // `stack_limit_arg` and see if the stack pointer is below that. The + // `stack_size + stack_limit_arg` computation might overflow, however, due + // to how stack limits may be loaded and set externally to trigger a trap. + // + // To handle this we'll need an extra comparison to see if the stack + // pointer is already below `stack_limit_arg`. 
Most of the time this + // isn't necessary though since the stack limit which triggers a trap is + // likely a sentinel somewhere around `usize::max_value()`. In that case + // only conditionally emit this pre-flight check. That way most functions + // only have the one comparison, but are also guaranteed that if we add + // `stack_size` to `stack_limit_arg` is won't overflow. + // + // This does mean that code generators which use this stack check + // functionality need to ensure that values stored into the stack limit + // will never overflow if this threshold is added. + if stack_size >= 32 * 1024 { + let cflags = pos.ins().ifcmp_sp(stack_limit_arg); + pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit); + pos.ins().trapif( + IntCC::UnsignedGreaterThanOrEqual, + cflags, + ir::TrapCode::StackOverflow, + ); + } + + // Copy `stack_limit_arg` into a %rax and use it for calculating + // a SP threshold. + let sp_threshold = pos.ins().iadd_imm(stack_limit_arg, stack_size); + pos.func.locations[sp_threshold] = ir::ValueLoc::Reg(RU::rax as RegUnit); + + // If the stack pointer currently reaches the SP threshold or below it then after opening + // the current stack frame, the current stack pointer will reach the limit. + let cflags = pos.ins().ifcmp_sp(sp_threshold); + pos.func.locations[cflags] = ir::ValueLoc::Reg(RU::rflags as RegUnit); + pos.ins().trapif( + IntCC::UnsignedGreaterThanOrEqual, + cflags, + ir::TrapCode::StackOverflow, + ); +} + +/// Find all `return` instructions and insert epilogues before them. +fn insert_common_epilogues( + pos: &mut EncCursor, + stack_size: i64, + reg_type: ir::types::Type, + csrs: &RegisterSet, + sp_arg_index: Option<usize>, +) { + while let Some(block) = pos.next_block() { + pos.goto_last_inst(block); + if let Some(inst) = pos.current_inst() { + if pos.func.dfg[inst].opcode().is_return() { + insert_common_epilogue(inst, block, stack_size, pos, reg_type, csrs, sp_arg_index); + } + } + } +} + +/// Insert an epilogue given a specific `return` instruction. +/// This is used by common calling conventions such as System V. 
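+/// In layout order, the inserted epilogue restores any stack-saved FPRs
+/// (fastcall only), adds the local frame size back to RSP, pops the saved
+/// GPRs in reverse push order, pops the frame pointer, and falls into the
+/// existing return, roughly:
+///
+/// ```text
+/// movaps xmm6, [rsp + N]    ; fastcall FPR restores, if any
+/// add    rsp, local_size
+/// pop    r12                ; CSR pops, reverse push order
+/// pop    rbx
+/// pop    rbp
+/// ret
+/// ```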
+fn insert_common_epilogue( + inst: ir::Inst, + block: ir::Block, + stack_size: i64, + pos: &mut EncCursor, + reg_type: ir::types::Type, + csrs: &RegisterSet, + sp_arg_index: Option<usize>, +) { + // Insert the pop of the frame pointer + let fp_pop = pos.ins().x86_pop(reg_type); + let fp_pop_inst = pos.prev_inst().unwrap(); + pos.func.locations[fp_pop] = ir::ValueLoc::Reg(RU::rbp as RegUnit); + pos.func.dfg.append_inst_arg(inst, fp_pop); + + // Insert the CSR pops + let mut first_csr_pop_inst = None; + for reg in csrs.iter(GPR) { + let csr_pop = pos.ins().x86_pop(reg_type); + first_csr_pop_inst = pos.prev_inst(); + assert!(first_csr_pop_inst.is_some()); + pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg); + pos.func.dfg.append_inst_arg(inst, csr_pop); + } + + // Insert the adjustment of SP + let mut sp_adjust_inst = None; + if stack_size > 0 { + pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)); + sp_adjust_inst = pos.prev_inst(); + assert!(sp_adjust_inst.is_some()); + } + + let mut first_fpr_load = None; + if let Some(index) = sp_arg_index { + let sp = pos + .func + .dfg + .block_params(pos.func.layout.entry_block().unwrap())[index]; + + // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads) + for (i, reg) in csrs.iter(FPR).enumerate() { + // Offset to where the register is saved relative to RSP, accounting for FPR save alignment + let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64 + + (stack_size % types::F64X2.bytes() as i64); + + let value = pos.ins().load( + types::F64X2, + ir::MemFlags::trusted(), + sp, + (stack_size - offset) as i32, + ); + + first_fpr_load.get_or_insert(pos.current_inst().expect("current inst")); + + pos.func.locations[value] = ir::ValueLoc::Reg(reg); + pos.func.dfg.append_inst_arg(inst, value); + } + } else { + assert!(csrs.iter(FPR).len() == 0); + } + + pos.func.epilogues_start.push(( + first_fpr_load + .or(sp_adjust_inst) + .or(first_csr_pop_inst) + .unwrap_or(fp_pop_inst), + block, + )); +} + +#[cfg(feature = "unwind")] +pub fn create_unwind_info( + func: &ir::Function, + isa: &dyn TargetIsa, +) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + use crate::isa::unwind::UnwindInfo; + + // Assumption: RBP is being used as the frame pointer for both calling conventions + // In the future, we should be omitting frame pointer as an optimization, so this will change + Ok(match func.signature.call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => { + super::unwind::systemv::create_unwind_info(func, isa)?.map(|u| UnwindInfo::SystemV(u)) + } + CallConv::WindowsFastcall => { + super::unwind::winx64::create_unwind_info(func, isa)?.map(|u| UnwindInfo::WindowsX64(u)) + } + _ => None, + }) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs b/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs new file mode 100644 index 0000000000..90ed8b7ef8 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/binemit.rs @@ -0,0 +1,576 @@ +//! Emitting binary x86 machine code. 
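+//!
+//! The helpers in this module write the individual components of an x86
+//! instruction encoding into a `CodeSink`: mandatory and REX/EVEX prefixes,
+//! one-, two-, and three-byte opcodes, ModR/M and SIB bytes, and branch or
+//! constant displacements.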
+ +use super::enc_tables::{needs_offset, needs_sib_byte}; +use super::registers::RU; +use crate::binemit::{bad_encoding, CodeSink, Reloc}; +use crate::ir::condcodes::{CondCode, FloatCC, IntCC}; +use crate::ir::{ + Block, Constant, ExternalName, Function, Inst, InstructionData, JumpTable, LibCall, Opcode, + TrapCode, +}; +use crate::isa::{RegUnit, StackBase, StackBaseMask, StackRef, TargetIsa}; +use crate::regalloc::RegDiversions; +use cranelift_codegen_shared::isa::x86::EncodingBits; + +include!(concat!(env!("OUT_DIR"), "/binemit-x86.rs")); + +// Convert a stack base to the corresponding register. +fn stk_base(base: StackBase) -> RegUnit { + let ru = match base { + StackBase::SP => RU::rsp, + StackBase::FP => RU::rbp, + StackBase::Zone => unimplemented!(), + }; + ru as RegUnit +} + +// Mandatory prefix bytes for Mp* opcodes. +const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2]; + +// Second byte for three-byte opcodes for mm=0b10 and mm=0b11. +const OP3_BYTE2: [u8; 2] = [0x38, 0x3a]; + +// A REX prefix with no bits set: 0b0100WRXB. +const BASE_REX: u8 = 0b0100_0000; + +// Create a single-register REX prefix, setting the B bit to bit 3 of the register. +// This is used for instructions that encode a register in the low 3 bits of the opcode and for +// instructions that use the ModR/M `reg` field for something else. +fn rex1(reg_b: RegUnit) -> u8 { + let b = ((reg_b >> 3) & 1) as u8; + BASE_REX | b +} + +// Create a dual-register REX prefix, setting: +// +// REX.B = bit 3 of r/m register, or SIB base register when a SIB byte is present. +// REX.R = bit 3 of reg register. +fn rex2(rm: RegUnit, reg: RegUnit) -> u8 { + let b = ((rm >> 3) & 1) as u8; + let r = ((reg >> 3) & 1) as u8; + BASE_REX | b | (r << 2) +} + +// Create a three-register REX prefix, setting: +// +// REX.B = bit 3 of r/m register, or SIB base register when a SIB byte is present. +// REX.R = bit 3 of reg register. +// REX.X = bit 3 of SIB index register. +fn rex3(rm: RegUnit, reg: RegUnit, index: RegUnit) -> u8 { + let b = ((rm >> 3) & 1) as u8; + let r = ((reg >> 3) & 1) as u8; + let x = ((index >> 3) & 1) as u8; + BASE_REX | b | (x << 1) | (r << 2) +} + +/// Encode the RXBR' bits of the EVEX P0 byte. For an explanation of these bits, see section 2.6.1 +/// in the Intel Software Development Manual, volume 2A. These bits can be used by different +/// addressing modes (see section 2.6.2), requiring different `vex*` functions than this one. +fn evex2(rm: RegUnit, reg: RegUnit) -> u8 { + let b = (!(rm >> 3) & 1) as u8; + let x = (!(rm >> 4) & 1) as u8; + let r = (!(reg >> 3) & 1) as u8; + let r_ = (!(reg >> 4) & 1) as u8; + 0x00 | r_ | (b << 1) | (x << 2) | (r << 3) +} + +/// Determines whether a REX prefix should be emitted. A REX byte always has 0100 in bits 7:4; bits +/// 3:0 correspond to WRXB. W allows certain instructions to declare a 64-bit operand size; because +/// [needs_rex] is only used by [infer_rex] and we prevent [infer_rex] from using [w] in +/// [Template::build], we do not need to check again whether [w] forces an inferred REX prefix--it +/// always does and should be encoded like `.rex().w()`. The RXB are extension of ModR/M or SIB +/// fields; see section 2.2.1.2 in the Intel Software Development Manual. +#[inline] +fn needs_rex(rex: u8) -> bool { + rex != BASE_REX +} + +// Emit a REX prefix. +// +// The R, X, and B bits are computed from registers using the functions above. The W bit is +// extracted from `bits`. 
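+//
+// A worked example: with `reg` = %rax (unit 0) and `rm` = %r9 (unit 9),
+// `rex2` yields 0b0100_0001 (REX.B set); if the recipe's EncodingBits also
+// request REX.W, the byte emitted here is 0x49 = 0b0100_1001.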
+fn rex_prefix<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(rex & 0xf8, BASE_REX); + let w = EncodingBits::from(bits).rex_w(); + sink.put1(rex | (w << 3)); +} + +// Emit a single-byte opcode with no REX prefix. +fn put_op1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8f00, 0, "Invalid encoding bits for Op1*"); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op1 encoding"); + sink.put1(bits as u8); +} + +// Emit a single-byte opcode with REX prefix. +fn put_rexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0f00, 0, "Invalid encoding bits for RexOp1*"); + rex_prefix(bits, rex, sink); + sink.put1(bits as u8); +} + +/// Emit a single-byte opcode with inferred REX prefix. +fn put_dynrexop1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0f00, 0, "Invalid encoding bits for DynRexOp1*"); + if needs_rex(rex) { + rex_prefix(bits, rex, sink); + } + sink.put1(bits as u8); +} + +// Emit two-byte opcode: 0F XX +fn put_op2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8f00, 0x0400, "Invalid encoding bits for Op2*"); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Op2 encoding"); + sink.put1(0x0f); + sink.put1(bits as u8); +} + +// Emit two-byte opcode: 0F XX with REX prefix. +fn put_rexop2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0f00, 0x0400, "Invalid encoding bits for RexOp2*"); + rex_prefix(bits, rex, sink); + sink.put1(0x0f); + sink.put1(bits as u8); +} + +/// Emit two-byte opcode: 0F XX with inferred REX prefix. +fn put_dynrexop2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!( + bits & 0x0f00, + 0x0400, + "Invalid encoding bits for DynRexOp2*" + ); + if needs_rex(rex) { + rex_prefix(bits, rex, sink); + } + sink.put1(0x0f); + sink.put1(bits as u8); +} + +// Emit single-byte opcode with mandatory prefix. +fn put_mp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8c00, 0, "Invalid encoding bits for Mp1*"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp1 encoding"); + sink.put1(bits as u8); +} + +// Emit single-byte opcode with mandatory prefix and REX. +fn put_rexmp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for RexMp1*"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + rex_prefix(bits, rex, sink); + sink.put1(bits as u8); +} + +// Emit two-byte opcode (0F XX) with mandatory prefix. +fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp2 encoding"); + sink.put1(0x0f); + sink.put1(bits as u8); +} + +// Emit two-byte opcode (0F XX) with mandatory prefix and REX. 
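+//
+// For example, an encoding with pp = 0b01 selects the 0x66 mandatory prefix,
+// so the bytes emitted are `66 REX 0F <op>`; pp = 0b10 and 0b11 select 0xF3
+// and 0xF2 respectively (see `PREFIX` above).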
+fn put_rexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0c00, 0x0400, "Invalid encoding bits for RexMp2*"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + rex_prefix(bits, rex, sink); + sink.put1(0x0f); + sink.put1(bits as u8); +} + +/// Emit two-byte opcode (0F XX) with mandatory prefix and inferred REX. +fn put_dynrexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!( + bits & 0x0c00, + 0x0400, + "Invalid encoding bits for DynRexMp2*" + ); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + if needs_rex(rex) { + rex_prefix(bits, rex, sink); + } + sink.put1(0x0f); + sink.put1(bits as u8); +} + +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix. +fn put_mp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8800, 0x0800, "Invalid encoding bits for Mp3*"); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp3 encoding"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]); + sink.put1(bits as u8); +} + +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX +fn put_rexmp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0800, 0x0800, "Invalid encoding bits for RexMp3*"); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + rex_prefix(bits, rex, sink); + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]); + sink.put1(bits as u8); +} + +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and an inferred REX prefix. +fn put_dynrexmp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!( + bits & 0x0800, + 0x0800, + "Invalid encoding bits for DynRexMp3*" + ); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + if needs_rex(rex) { + rex_prefix(bits, rex, sink); + } + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]); + sink.put1(bits as u8); +} + +/// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in +/// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be +/// used together for certain classes of instructions; i.e., special care should be taken to ensure +/// that instructions use an applicable correct `EvexContext`. Table 2-39 contains cases where +/// opcodes can result in an #UD. +#[allow(dead_code)] +enum EvexContext { + RoundingRegToRegFP { + rc: EvexRoundingControl, + }, + NoRoundingFP { + sae: bool, + length: EvexVectorLength, + }, + MemoryOp { + broadcast: bool, + length: EvexVectorLength, + }, + Other { + length: EvexVectorLength, + }, +} + +impl EvexContext { + /// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::RoundingRegToRegFP { rc } => 0b001 | rc.bits() << 1, + Self::NoRoundingFP { sae, length } => (*sae as u8) | length.bits() << 1, + Self::MemoryOp { broadcast, length } => (*broadcast as u8) | length.bits() << 1, + Self::Other { length } => length.bits() << 1, + } + } +} + +/// The EVEX format allows choosing a vector length in the `L'` and `L` bits; see `EvexContext`. 
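+/// For example, `EvexVectorLength::V256` encodes as `0b01`; `EvexContext::bits`
+/// shifts the length up by one so that, after `put_evex` merges it into P2,
+/// the L'L pair lands in bits 6:5 of that byte.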
+enum EvexVectorLength { + V128, + V256, + V512, +} + +impl EvexVectorLength { + /// Encode the `L'` and `L` bits for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::V128 => 0b00, + Self::V256 => 0b01, + Self::V512 => 0b10, + // 0b11 is reserved (#UD). + } + } +} + +/// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`. +enum EvexRoundingControl { + RNE, + RD, + RU, + RZ, +} + +impl EvexRoundingControl { + /// Encode the `L'` and `L` bits for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::RNE => 0b00, + Self::RD => 0b01, + Self::RU => 0b10, + Self::RZ => 0b11, + } + } +} + +/// Defines the EVEX masking behavior; masking support is described in section 2.6.4 of the Intel +/// Software Development Manual, volume 2A. +#[allow(dead_code)] +enum EvexMasking { + None, + Merging { k: u8 }, + Zeroing { k: u8 }, +} + +impl EvexMasking { + /// Encode the `z` bit for merging with the P2 byte. + fn z_bit(&self) -> u8 { + match self { + Self::None | Self::Merging { .. } => 0, + Self::Zeroing { .. } => 1, + } + } + + /// Encode the `aaa` bits for merging with the P2 byte. + fn aaa_bits(&self) -> u8 { + match self { + Self::None => 0b000, + Self::Merging { k } | Self::Zeroing { k } => { + debug_assert!(*k <= 7); + *k + } + } + } +} + +/// Encode an EVEX prefix, including the instruction opcode. To match the current recipe +/// convention, the ModR/M byte is written separately in the recipe. This EVEX encoding function +/// only encodes the `reg` (operand 1), `vvvv` (operand 2), `rm` (operand 3) form; other forms are +/// possible (see section 2.6.2, Intel Software Development Manual, volume 2A), requiring +/// refactoring of this function or separate functions for each form (e.g. as for the REX prefix). +fn put_evex<CS: CodeSink + ?Sized>( + bits: u16, + reg: RegUnit, + vvvvv: RegUnit, + rm: RegUnit, + context: EvexContext, + masking: EvexMasking, + sink: &mut CS, +) { + let enc = EncodingBits::from(bits); + + // EVEX prefix. + sink.put1(0x62); + + debug_assert!(enc.mm() < 0b100); + let mut p0 = enc.mm() & 0b11; + p0 |= evex2(rm, reg) << 4; // bits 3:2 are always unset + sink.put1(p0); + + let mut p1 = enc.pp() | 0b100; // bit 2 is always set + p1 |= (!(vvvvv as u8) & 0b1111) << 3; + p1 |= (enc.rex_w() & 0b1) << 7; + sink.put1(p1); + + let mut p2 = masking.aaa_bits(); + p2 |= (!(vvvvv as u8 >> 4) & 0b1) << 3; + p2 |= context.bits() << 4; + p2 |= masking.z_bit() << 7; + sink.put1(p2); + + // Opcode + sink.put1(enc.opcode_byte()); + + // ModR/M byte placed in recipe +} + +/// Emit a ModR/M byte for reg-reg operands. +fn modrm_rr<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) { + let reg = reg as u8 & 7; + let rm = rm as u8 & 7; + let mut b = 0b11000000; + b |= reg << 3; + b |= rm; + sink.put1(b); +} + +/// Emit a ModR/M byte where the reg bits are part of the opcode. +fn modrm_r_bits<CS: CodeSink + ?Sized>(rm: RegUnit, bits: u16, sink: &mut CS) { + let reg = (bits >> 12) as u8 & 7; + let rm = rm as u8 & 7; + let mut b = 0b11000000; + b |= reg << 3; + b |= rm; + sink.put1(b); +} + +/// Emit a mode 00 ModR/M byte. This is a register-indirect addressing mode with no offset. +/// Registers %rsp and %rbp are invalid for `rm`, %rsp indicates a SIB byte, and %rbp indicates an +/// absolute immediate 32-bit address. 
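+/// For example, `rm` = %rcx (unit 1) with `reg` = %rdx (unit 2) gives
+/// 0b00_010_001 = 0x11, i.e. a `[rcx]` memory operand with %rdx in the `reg`
+/// field.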
+fn modrm_rm<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) { + let reg = reg as u8 & 7; + let rm = rm as u8 & 7; + let mut b = 0b00000000; + b |= reg << 3; + b |= rm; + sink.put1(b); +} + +/// Emit a mode 00 Mod/RM byte, with a rip-relative displacement in 64-bit mode. Effective address +/// is calculated by adding displacement to 64-bit rip of next instruction. See intel Sw dev manual +/// section 2.2.1.6. +fn modrm_riprel<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) { + modrm_rm(0b101, reg, sink) +} + +/// Emit a mode 01 ModR/M byte. This is a register-indirect addressing mode with 8-bit +/// displacement. +/// Register %rsp is invalid for `rm`. It indicates the presence of a SIB byte. +fn modrm_disp8<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) { + let reg = reg as u8 & 7; + let rm = rm as u8 & 7; + let mut b = 0b01000000; + b |= reg << 3; + b |= rm; + sink.put1(b); +} + +/// Emit a mode 10 ModR/M byte. This is a register-indirect addressing mode with 32-bit +/// displacement. +/// Register %rsp is invalid for `rm`. It indicates the presence of a SIB byte. +fn modrm_disp32<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS) { + let reg = reg as u8 & 7; + let rm = rm as u8 & 7; + let mut b = 0b10000000; + b |= reg << 3; + b |= rm; + sink.put1(b); +} + +/// Emit a mode 00 ModR/M with a 100 RM indicating a SIB byte is present. +fn modrm_sib<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) { + modrm_rm(0b100, reg, sink); +} + +/// Emit a mode 01 ModR/M with a 100 RM indicating a SIB byte and 8-bit +/// displacement are present. +fn modrm_sib_disp8<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) { + modrm_disp8(0b100, reg, sink); +} + +/// Emit a mode 10 ModR/M with a 100 RM indicating a SIB byte and 32-bit +/// displacement are present. +fn modrm_sib_disp32<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) { + modrm_disp32(0b100, reg, sink); +} + +/// Emit a SIB byte with a base register and no scale+index. +fn sib_noindex<CS: CodeSink + ?Sized>(base: RegUnit, sink: &mut CS) { + let base = base as u8 & 7; + // SIB SS_III_BBB. + let mut b = 0b00_100_000; + b |= base; + sink.put1(b); +} + +/// Emit a SIB byte with a scale, base, and index. +fn sib<CS: CodeSink + ?Sized>(scale: u8, index: RegUnit, base: RegUnit, sink: &mut CS) { + // SIB SS_III_BBB. + debug_assert_eq!(scale & !0x03, 0, "Scale out of range"); + let scale = scale & 3; + let index = index as u8 & 7; + let base = base as u8 & 7; + let b: u8 = (scale << 6) | (index << 3) | base; + sink.put1(b); +} + +/// Get the low 4 bits of an opcode for an integer condition code. +/// +/// Add this offset to a base opcode for: +/// +/// ---- 0x70: Short conditional branch. +/// 0x0f 0x80: Long conditional branch. +/// 0x0f 0x90: SetCC. +/// +fn icc2opc(cond: IntCC) -> u16 { + use crate::ir::condcodes::IntCC::*; + match cond { + Overflow => 0x0, + NotOverflow => 0x1, + UnsignedLessThan => 0x2, + UnsignedGreaterThanOrEqual => 0x3, + Equal => 0x4, + NotEqual => 0x5, + UnsignedLessThanOrEqual => 0x6, + UnsignedGreaterThan => 0x7, + // 0x8 = Sign. + // 0x9 = !Sign. + // 0xa = Parity even. + // 0xb = Parity odd. + SignedLessThan => 0xc, + SignedGreaterThanOrEqual => 0xd, + SignedLessThanOrEqual => 0xe, + SignedGreaterThan => 0xf, + } +} + +/// Get the low 4 bits of an opcode for a floating point condition code. 
+/// +/// The ucomiss/ucomisd instructions set the FLAGS bits CF/PF/CF like this: +/// +/// ZPC OSA +/// UN 111 000 +/// GT 000 000 +/// LT 001 000 +/// EQ 100 000 +/// +/// Not all floating point condition codes are supported. +fn fcc2opc(cond: FloatCC) -> u16 { + use crate::ir::condcodes::FloatCC::*; + match cond { + Ordered => 0xb, // EQ|LT|GT => *np (P=0) + Unordered => 0xa, // UN => *p (P=1) + OrderedNotEqual => 0x5, // LT|GT => *ne (Z=0), + UnorderedOrEqual => 0x4, // UN|EQ => *e (Z=1) + GreaterThan => 0x7, // GT => *a (C=0&Z=0) + GreaterThanOrEqual => 0x3, // GT|EQ => *ae (C=0) + UnorderedOrLessThan => 0x2, // UN|LT => *b (C=1) + UnorderedOrLessThanOrEqual => 0x6, // UN|LT|EQ => *be (Z=1|C=1) + Equal | // EQ + NotEqual | // UN|LT|GT + LessThan | // LT + LessThanOrEqual | // LT|EQ + UnorderedOrGreaterThan | // UN|GT + UnorderedOrGreaterThanOrEqual // UN|GT|EQ + => panic!("{} not supported", cond), + } +} + +/// Emit a single-byte branch displacement to `destination`. +fn disp1<CS: CodeSink + ?Sized>(destination: Block, func: &Function, sink: &mut CS) { + let delta = func.offsets[destination].wrapping_sub(sink.offset() + 1); + sink.put1(delta as u8); +} + +/// Emit a four-byte branch displacement to `destination`. +fn disp4<CS: CodeSink + ?Sized>(destination: Block, func: &Function, sink: &mut CS) { + let delta = func.offsets[destination].wrapping_sub(sink.offset() + 4); + sink.put4(delta); +} + +/// Emit a four-byte displacement to jump table `jt`. +fn jt_disp4<CS: CodeSink + ?Sized>(jt: JumpTable, func: &Function, sink: &mut CS) { + let delta = func.jt_offsets[jt].wrapping_sub(sink.offset() + 4); + sink.put4(delta); + sink.reloc_jt(Reloc::X86PCRelRodata4, jt); +} + +/// Emit a four-byte displacement to `constant`. +fn const_disp4<CS: CodeSink + ?Sized>(constant: Constant, func: &Function, sink: &mut CS) { + let offset = func.dfg.constants.get_offset(constant); + let delta = offset.wrapping_sub(sink.offset() + 4); + sink.put4(delta); + sink.reloc_constant(Reloc::X86PCRelRodata4, offset); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs b/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs new file mode 100644 index 0000000000..976f1581e3 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/enc_tables.rs @@ -0,0 +1,1922 @@ +//! Encoding tables for x86 ISAs. + +use super::registers::*; +use crate::bitset::BitSet; +use crate::cursor::{Cursor, FuncCursor}; +use crate::flowgraph::ControlFlowGraph; +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; +use crate::ir::{self, Function, Inst, InstBuilder, MemFlags}; +use crate::isa::constraints::*; +use crate::isa::enc_tables::*; +use crate::isa::encoding::base_size; +use crate::isa::encoding::{Encoding, RecipeSizing}; +use crate::isa::RegUnit; +use crate::isa::{self, TargetIsa}; +use crate::legalizer::expand_as_libcall; +use crate::predicates; +use crate::regalloc::RegDiversions; + +include!(concat!(env!("OUT_DIR"), "/encoding-x86.rs")); +include!(concat!(env!("OUT_DIR"), "/legalize-x86.rs")); + +/// Whether the REX prefix is needed for encoding extended registers (via REX.RXB). +/// +/// Normal x86 instructions have only 3 bits for encoding a register. +/// The REX prefix adds REX.R, REX,X, and REX.B bits, interpreted as fourth bits. +pub fn is_extended_reg(reg: RegUnit) -> bool { + // Extended registers have the fourth bit set. 
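+    // For example, %r9 (unit 9 = 0b1001) is extended and needs a REX bit,
+    // while %rbx (unit 3 = 0b0011) is not.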
+ reg as u8 & 0b1000 != 0 +} + +pub fn needs_sib_byte(reg: RegUnit) -> bool { + reg == RU::r12 as RegUnit || reg == RU::rsp as RegUnit +} +pub fn needs_offset(reg: RegUnit) -> bool { + reg == RU::r13 as RegUnit || reg == RU::rbp as RegUnit +} +pub fn needs_sib_byte_or_offset(reg: RegUnit) -> bool { + needs_sib_byte(reg) || needs_offset(reg) +} + +fn test_input( + op_index: usize, + inst: Inst, + divert: &RegDiversions, + func: &Function, + condition_func: fn(RegUnit) -> bool, +) -> bool { + let in_reg = divert.reg(func.dfg.inst_args(inst)[op_index], &func.locations); + condition_func(in_reg) +} + +fn test_result( + result_index: usize, + inst: Inst, + divert: &RegDiversions, + func: &Function, + condition_func: fn(RegUnit) -> bool, +) -> bool { + let out_reg = divert.reg(func.dfg.inst_results(inst)[result_index], &func.locations); + condition_func(out_reg) +} + +fn size_plus_maybe_offset_for_inreg_0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_offset = test_input(0, inst, divert, func, needs_offset); + sizing.base_size + if needs_offset { 1 } else { 0 } +} +fn size_plus_maybe_offset_for_inreg_1( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_offset = test_input(1, inst, divert, func, needs_offset); + sizing.base_size + if needs_offset { 1 } else { 0 } +} +fn size_plus_maybe_sib_for_inreg_0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_sib = test_input(0, inst, divert, func, needs_sib_byte); + sizing.base_size + if needs_sib { 1 } else { 0 } +} +fn size_plus_maybe_sib_for_inreg_1( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_sib = test_input(1, inst, divert, func, needs_sib_byte); + sizing.base_size + if needs_sib { 1 } else { 0 } +} +fn size_plus_maybe_sib_or_offset_for_inreg_0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_sib_or_offset = test_input(0, inst, divert, func, needs_sib_byte_or_offset); + sizing.base_size + if needs_sib_or_offset { 1 } else { 0 } +} +fn size_plus_maybe_sib_or_offset_for_inreg_1( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_sib_or_offset = test_input(1, inst, divert, func, needs_sib_byte_or_offset); + sizing.base_size + if needs_sib_or_offset { 1 } else { 0 } +} + +/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1) +/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB or offset. +fn size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_input(1, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_or_offset_for_inreg_1(sizing, enc, inst, divert, func) + + if needs_rex { 1 } else { 0 } +} + +/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1) +/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB. 
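+///
+/// For example, a recipe with `base_size` 3 becomes 4 bytes when inreg1 is
+/// %rsp or %r12 (a SIB byte is needed), and 5 bytes when, in addition, either
+/// input lives in %r8-%r15 (a REX prefix is inferred).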
+fn size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_input(1, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_for_inreg_1(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 } +} + +/// Calculates the size while inferring if the first input register (inreg0) and first output +/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a +/// SIB or offset. +fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_result(0, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_or_offset_for_inreg_0(sizing, enc, inst, divert, func) + + if needs_rex { 1 } else { 0 } +} + +/// Calculates the size while inferring if the first input register (inreg0) and first output +/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a +/// SIB. +fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_result(0, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, for use with one input reg. +/// +/// A REX prefix is known to be emitted if either: +/// 1. The EncodingBits specify that REX.W is to be set. +/// 2. Registers are used that require REX.R or REX.B bits for encoding. +fn size_with_inferred_rex_for_inreg0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, based on the second operand. +fn size_with_inferred_rex_for_inreg1( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(1, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, based on the third operand. +fn size_with_inferred_rex_for_inreg2( + sizing: &RecipeSizing, + _: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. 
+ let needs_rex = test_input(2, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, for use with two input registers. +/// +/// A REX prefix is known to be emitted if either: +/// 1. The EncodingBits specify that REX.W is to be set. +/// 2. Registers are used that require REX.R or REX.B bits for encoding. +fn size_with_inferred_rex_for_inreg0_inreg1( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_input(1, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, based on second and third operand. +fn size_with_inferred_rex_for_inreg1_inreg2( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(1, inst, divert, func, is_extended_reg) + || test_input(2, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, based on a single +/// input register and a single output register. +fn size_with_inferred_rex_for_inreg0_outreg0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(0, inst, divert, func, is_extended_reg) + || test_result(0, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, based on a single output register. +fn size_with_inferred_rex_for_outreg0( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_result(0, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// Infers whether a dynamic REX prefix will be emitted, for use with CMOV. +/// +/// CMOV uses 3 inputs, with the REX is inferred from reg1 and reg2. +fn size_with_inferred_rex_for_cmov( + sizing: &RecipeSizing, + _enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. + let needs_rex = test_input(1, inst, divert, func, is_extended_reg) + || test_input(2, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + +/// If the value's definition is a constant immediate, returns its unpacked value, or None +/// otherwise. +fn maybe_iconst_imm(pos: &FuncCursor, value: ir::Value) -> Option<i64> { + if let ir::ValueDef::Result(inst, _) = &pos.func.dfg.value_def(value) { + if let ir::InstructionData::UnaryImm { + opcode: ir::Opcode::Iconst, + imm, + } = &pos.func.dfg[*inst] + { + let value: i64 = (*imm).into(); + Some(value) + } else { + None + } + } else { + None + } +} + +/// Expand the `sdiv` and `srem` instructions using `x86_sdivmodx`. 
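+///
+/// When `avoid_div_traps` is set and the divisor is not a known constant, the
+/// generated control flow is roughly (a sketch, not the exact CLIF output):
+///
+/// ```text
+///     trapz y, int_divz
+///     brif eq (ifcmp_imm y, -1), minus_one
+///     jump nominal
+/// nominal:
+///     xhi = sshr_imm x, lane_bits - 1
+///     q, r = x86_sdivmodx x, xhi, y
+///     jump done(q)                     ; done(r) for srem
+/// minus_one:
+///     ; srem: the result is 0; sdiv: trap if x == INT_MIN, else use -x
+///     jump done(...)
+/// done(result):
+/// ```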
+fn expand_sdivrem( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let (x, y, is_srem) = match func.dfg[inst] { + ir::InstructionData::Binary { + opcode: ir::Opcode::Sdiv, + args, + } => (args[0], args[1], false), + ir::InstructionData::Binary { + opcode: ir::Opcode::Srem, + args, + } => (args[0], args[1], true), + _ => panic!("Need sdiv/srem: {}", func.dfg.display_inst(inst, None)), + }; + + let old_block = func.layout.pp_block(inst); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + pos.func.dfg.clear_results(inst); + + let avoid_div_traps = isa.flags().avoid_div_traps(); + + // If we can tolerate native division traps, sdiv doesn't need branching. + if !avoid_div_traps && !is_srem { + let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); + pos.ins().with_result(result).x86_sdivmodx(x, xhi, y); + pos.remove_inst(); + return; + } + + // Try to remove checks if the input value is an immediate other than 0 or -1. For these two + // immediates, we'd ideally replace conditional traps by traps, but this requires more + // manipulation of the dfg/cfg, which is out of scope here. + let (could_be_zero, could_be_minus_one) = if let Some(imm) = maybe_iconst_imm(&pos, y) { + (imm == 0, imm == -1) + } else { + (true, true) + }; + + // Put in an explicit division-by-zero trap if the environment requires it. + if avoid_div_traps && could_be_zero { + pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero); + } + + if !could_be_minus_one { + let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); + let reuse = if is_srem { + [None, Some(result)] + } else { + [Some(result), None] + }; + pos.ins().with_results(reuse).x86_sdivmodx(x, xhi, y); + pos.remove_inst(); + return; + } + + // block handling the nominal case. + let nominal = pos.func.dfg.make_block(); + + // block handling the -1 divisor case. + let minus_one = pos.func.dfg.make_block(); + + // Final block with one argument representing the final result value. + let done = pos.func.dfg.make_block(); + + // Move the `inst` result value onto the `done` block. + pos.func.dfg.attach_block_param(done, result); + + // Start by checking for a -1 divisor which needs to be handled specially. + let is_m1 = pos.ins().ifcmp_imm(y, -1); + pos.ins().brif(IntCC::Equal, is_m1, minus_one, &[]); + pos.ins().jump(nominal, &[]); + + // Now it is safe to execute the `x86_sdivmodx` instruction which will still trap on division + // by zero. + pos.insert_block(nominal); + let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); + let (quot, rem) = pos.ins().x86_sdivmodx(x, xhi, y); + let divres = if is_srem { rem } else { quot }; + pos.ins().jump(done, &[divres]); + + // Now deal with the -1 divisor case. + pos.insert_block(minus_one); + let m1_result = if is_srem { + // x % -1 = 0. + pos.ins().iconst(ty, 0) + } else { + // Explicitly check for overflow: Trap when x == INT_MIN. + debug_assert!(avoid_div_traps, "Native trapping divide handled above"); + let f = pos.ins().ifcmp_imm(x, -1 << (ty.lane_bits() - 1)); + pos.ins() + .trapif(IntCC::Equal, f, ir::TrapCode::IntegerOverflow); + // x / -1 = -x. + pos.ins().irsub_imm(x, 0) + }; + + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[m1_result]); + + // Finally insert a label for the completion. 
+ pos.next_inst(); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, nominal); + cfg.recompute_block(pos.func, minus_one); + cfg.recompute_block(pos.func, done); +} + +/// Expand the `udiv` and `urem` instructions using `x86_udivmodx`. +fn expand_udivrem( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let (x, y, is_urem) = match func.dfg[inst] { + ir::InstructionData::Binary { + opcode: ir::Opcode::Udiv, + args, + } => (args[0], args[1], false), + ir::InstructionData::Binary { + opcode: ir::Opcode::Urem, + args, + } => (args[0], args[1], true), + _ => panic!("Need udiv/urem: {}", func.dfg.display_inst(inst, None)), + }; + let avoid_div_traps = isa.flags().avoid_div_traps(); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + pos.func.dfg.clear_results(inst); + + // Put in an explicit division-by-zero trap if the environment requires it. + if avoid_div_traps { + let zero_check = if let Some(imm) = maybe_iconst_imm(&pos, y) { + // Ideally, we'd just replace the conditional trap with a trap when the immediate is + // zero, but this requires more manipulation of the dfg/cfg, which is out of scope + // here. + imm == 0 + } else { + true + }; + if zero_check { + pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero); + } + } + + // Now it is safe to execute the `x86_udivmodx` instruction. + let xhi = pos.ins().iconst(ty, 0); + let reuse = if is_urem { + [None, Some(result)] + } else { + [Some(result), None] + }; + pos.ins().with_results(reuse).x86_udivmodx(x, xhi, y); + pos.remove_inst(); +} + +/// Expand the `fmin` and `fmax` instructions using the x86 `x86_fmin` and `x86_fmax` +/// instructions. +fn expand_minmax( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] { + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmin, + args, + } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor), + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmax, + args, + } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band), + _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)), + }; + let old_block = func.layout.pp_block(inst); + + // We need to handle the following conditions, depending on how x and y compare: + // + // 1. LT or GT: The native `x86_opc` min/max instruction does what we need. + // 2. EQ: We need to use `bitwise_opc` to make sure that + // fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0. + // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical. + + // block handling case 1) where operands are ordered but not equal. + let one_block = func.dfg.make_block(); + + // block handling case 3) where one operand is NaN. + let uno_block = func.dfg.make_block(); + + // block that handles the unordered or equal cases 2) and 3). + let ueq_block = func.dfg.make_block(); + + // block handling case 2) where operands are ordered and equal. + let eq_block = func.dfg.make_block(); + + // Final block with one argument representing the final result value. + let done = func.dfg.make_block(); + + // The basic blocks are laid out to minimize branching for the common cases: + // + // 1) One branch not taken, one jump. + // 2) One branch taken. + // 3) Two branches taken, one jump. 
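The three cases enumerated above correspond to the following per-value behavior, shown here as a plain-Rust model (a hedged sketch of the semantics, not the block diamond built below; the name is ad hoc):

fn wasm_fmin(x: f64, y: f64) -> f64 {
    if x.is_nan() || y.is_nan() {
        return x + y; // case 3: an fadd propagates a quiet NaN
    }
    if x == y {
        // case 2: OR the bit patterns so that fmin(0.0, -0.0) == -0.0
        // (fmax uses AND instead, so that fmax(0.0, -0.0) == 0.0).
        return f64::from_bits(x.to_bits() | y.to_bits());
    }
    // case 1: the operands are ordered and distinct; the native min is correct.
    if x < y { x } else { y }
}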
+ + // Move the `inst` result value onto the `done` block. + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + func.dfg.clear_results(inst); + func.dfg.attach_block_param(done, result); + + // Test for case 1) ordered and not equal. + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y); + pos.ins().brnz(cmp_ueq, ueq_block, &[]); + pos.ins().jump(one_block, &[]); + + // Handle the common ordered, not equal (LT|GT) case. + pos.insert_block(one_block); + let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0; + let one_result = pos.func.dfg.first_result(one_inst); + pos.ins().jump(done, &[one_result]); + + // Case 3) Unordered. + // We know that at least one operand is a NaN that needs to be propagated. We simply use an + // `fadd` instruction which has the same NaN propagation semantics. + pos.insert_block(uno_block); + let uno_result = pos.ins().fadd(x, y); + pos.ins().jump(done, &[uno_result]); + + // Case 2) or 3). + pos.insert_block(ueq_block); + // Test for case 3) (UN) one value is NaN. + // TODO: When we get support for flag values, we can reuse the above comparison. + let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y); + pos.ins().brnz(cmp_uno, uno_block, &[]); + pos.ins().jump(eq_block, &[]); + + // We are now in case 2) where x and y compare EQ. + // We need a bitwise operation to get the sign right. + pos.insert_block(eq_block); + let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0; + let bw_result = pos.func.dfg.first_result(bw_inst); + // This should become a fall-through for this second most common case. + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[bw_result]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, one_block); + cfg.recompute_block(pos.func, uno_block); + cfg.recompute_block(pos.func, ueq_block); + cfg.recompute_block(pos.func, eq_block); + cfg.recompute_block(pos.func, done); +} + +/// This legalization converts a minimum/maximum operation into a sequence that matches the +/// non-x86-friendly WebAssembly semantics of NaN handling. This logic is kept separate from +/// [expand_minmax] above (the scalar version) for code clarity. +fn expand_minmax_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let ty = func.dfg.ctrl_typevar(inst); + debug_assert!(ty.is_vector()); + let (x, y, x86_opcode, is_max) = match func.dfg[inst] { + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmin, + args, + } => (args[0], args[1], ir::Opcode::X86Fmin, false), + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmax, + args, + } => (args[0], args[1], ir::Opcode::X86Fmax, true), + _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)), + }; + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + // This sequence is complex due to how x86 handles NaNs and +0/-0. If x86 finds a NaN in + // either lane it returns the second operand; likewise, if both operands are in {+0.0, -0.0} + // it returns the second operand. To match the behavior of "return the minimum of the + // operands or a canonical NaN if either operand is NaN," we must compare in both + // directions. 
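A per-lane scalar model of the minimum path that follows may help; it is an illustrative sketch under the assumption that `x86_fmin` behaves like MINPS, returning the second operand whenever the first is not strictly less:

// MINPS-like helper: returns b whenever a < b does not hold, which covers
// NaN operands and the {+0.0, -0.0} pair.
fn x86_min_lane(a: f32, b: f32) -> f32 {
    if a < b { a } else { b }
}

fn wasm_fmin_lane(x: f32, y: f32) -> f32 {
    let forward = x86_min_lane(x, y).to_bits();
    let backward = x86_min_lane(y, x).to_bits();
    // OR keeps a NaN coming from either direction and keeps the -0.0 sign bit.
    let propagate = forward | backward;
    // All ones for a NaN lane, zero otherwise (models the Unordered comparison).
    let nan_mask = if f32::from_bits(propagate).is_nan() { u32::MAX } else { 0 };
    // Flood NaN lanes, then clear everything below the top 10 bits
    // (sign + exponent + quiet bit), yielding the canonical quiet -NaN 0xFFC0_0000.
    let flooded = propagate | nan_mask;
    f32::from_bits(flooded & !(nan_mask >> 10))
}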
+ let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y); + let forward = dfg.first_result(forward_inst); + let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x); + let backward = dfg.first_result(backward_inst); + + let (value, mask) = if is_max { + // For maximum: + // Find any differences between the forward and backward `max` operation. + let difference = pos.ins().bxor(forward, backward); + // Merge in the differences. + let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference); + let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference); + // Discover which lanes have NaNs in them. + let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value); + (value, find_nan_lanes_mask) + } else { + // For minimum: + // If either lane is a NaN, we want to use these bits, not the second operand bits. + let propagate_nans = pos.ins().bor(backward, forward); + // Find which lanes contain a NaN with an unordered comparison, filling the mask with + // 1s. + let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans); + let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask); + // Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs + // along this code path to be quieted and negative: after the upcoming shift and and_not, + // all upper bits (sign, exponent, and payload MSB) will be 1s. + let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask); + (tmp, bitcast_find_nan_lanes_mask) + }; + + // During this lowering we will need to know how many bits to shift by and what type to + // convert to when using an integer shift. Recall that an IEEE754 number looks like: + // `[sign bit] [exponent bits] [significand bits]` + // A quiet NaN has all exponent bits set to 1 and the most significant bit of the + // significand set to 1; a signaling NaN has the same exponent but the MSB of the + // significand is set to 0. The payload of the NaN is the remaining significand bits, and + // WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this + // canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB + // sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all + // NaNs produced with the mask will be negative (`-NaN`) which is allowed by the sign + // non-determinism in the spec: https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0 + let (shift_by, ty_as_int) = match ty { + F32X4 => (10, I32X4), + F64X2 => (13, I64X2), + _ => unimplemented!("this legalization only understands 128-bit floating point types"), + }; + + // In order to clear the NaN payload for canonical NaNs, we shift right the NaN lanes (all + // 1s) leaving 0s in the top bits. Remember that non-NaN lanes are all 0s so this has + // little effect. + let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask); + let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by); + let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask); + + // Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes, this is + // equivalent to `... & 1111...` but for NaN lanes this will only have 1s in the top bits, + // clearing the payload. + pos.func + .dfg + .replace(inst) + .band_not(value, shift_mask_as_float); +} + +/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to +/// i64 with a pattern, the rest needs more code. 
+/// +/// Note that this is the scalar implementation; for the vector implemenation see +/// [expand_fcvt_from_uint_vector]. +fn expand_fcvt_from_uint( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let x; + match func.dfg[inst] { + ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtFromUint, + arg, + } => x = arg, + _ => panic!("Need fcvt_from_uint: {}", func.dfg.display_inst(inst, None)), + } + let xty = func.dfg.value_type(x); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + // Conversion from an unsigned int smaller than 64bit is easy on x86-64. + match xty { + ir::types::I8 | ir::types::I16 | ir::types::I32 => { + // TODO: This should be guarded by an ISA check. + let wide = pos.ins().uextend(ir::types::I64, x); + pos.func.dfg.replace(inst).fcvt_from_sint(ty, wide); + return; + } + ir::types::I64 => {} + _ => unimplemented!(), + } + + let old_block = pos.func.layout.pp_block(inst); + + // block handling the case where x >= 0. + let poszero_block = pos.func.dfg.make_block(); + + // block handling the case where x < 0. + let neg_block = pos.func.dfg.make_block(); + + // Final block with one argument representing the final result value. + let done = pos.func.dfg.make_block(); + + // Move the `inst` result value onto the `done` block. + pos.func.dfg.clear_results(inst); + pos.func.dfg.attach_block_param(done, result); + + // If x as a signed int is not negative, we can use the existing `fcvt_from_sint` instruction. + let is_neg = pos.ins().icmp_imm(IntCC::SignedLessThan, x, 0); + pos.ins().brnz(is_neg, neg_block, &[]); + pos.ins().jump(poszero_block, &[]); + + // Easy case: just use a signed conversion. + pos.insert_block(poszero_block); + let posres = pos.ins().fcvt_from_sint(ty, x); + pos.ins().jump(done, &[posres]); + + // Now handle the negative case. + pos.insert_block(neg_block); + + // Divide x by two to get it in range for the signed conversion, keep the LSB, and scale it + // back up on the FP side. + let ihalf = pos.ins().ushr_imm(x, 1); + let lsb = pos.ins().band_imm(x, 1); + let ifinal = pos.ins().bor(ihalf, lsb); + let fhalf = pos.ins().fcvt_from_sint(ty, ifinal); + let negres = pos.ins().fadd(fhalf, fhalf); + + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[negres]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, poszero_block); + cfg.recompute_block(pos.func, neg_block); + cfg.recompute_block(pos.func, done); +} + +/// To convert packed unsigned integers to their float equivalents, we must legalize to a special +/// AVX512 instruction (using MCSR rounding) or use a long sequence of instructions. This logic is +/// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform +/// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be +/// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745). 
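The scalar negative-input trick above (halve with the low bit folded back in, convert as signed, then double) can be checked in plain Rust. This is a hedged sketch of the arithmetic, not the emitted IR, and the function name is ad hoc:

fn u64_to_f64_via_signed(x: u64) -> f64 {
    if (x as i64) >= 0 {
        // Sign bit clear: a plain signed conversion is already exact.
        x as i64 as f64
    } else {
        // Sign bit set: halve while keeping the low bit (round to odd), convert
        // the now-positive value as signed, and scale back up on the FP side.
        let halved = (x >> 1) | (x & 1);
        (halved as i64 as f64) * 2.0
    }
}

// For every input this should agree with the direct conversion, e.g.:
// assert_eq!(u64_to_f64_via_signed(u64::MAX), u64::MAX as f64);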
+fn expand_fcvt_from_uint_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtFromUint, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == F32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4); + let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() { + // If we have certain AVX512 features, we can lower this instruction simply. + pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg); + } else { + // Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR, + // PBLENDW, PSUB, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS + let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg); + let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into()); + let zero = pos.ins().vconst(I16X8, zero_constant); + let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55); + let bitcast_low = pos.ins().raw_bitcast(I32X4, low); + let high = pos.ins().isub(arg, bitcast_low); + let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low); + let shift_high = pos.ins().ushr_imm(high, 1); + let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high); + let double_high = pos.ins().fadd(convert_high, convert_high); + pos.func.dfg.replace(inst).fadd(double_high, convert_low); + } + } else { + unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) + } + } +} + +fn expand_fcvt_to_sint( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + use crate::ir::immediates::{Ieee32, Ieee64}; + + let x = match func.dfg[inst] { + ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToSint, + arg, + } => arg, + _ => panic!("Need fcvt_to_sint: {}", func.dfg.display_inst(inst, None)), + }; + let old_block = func.layout.pp_block(inst); + let xty = func.dfg.value_type(x); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + // Final block after the bad value checks. + let done = func.dfg.make_block(); + + // block for checking failure cases. + let maybe_trap_block = func.dfg.make_block(); + + // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or overflow. + // It produces an INT_MIN result instead. + func.dfg.replace(inst).x86_cvtt2si(ty, x); + + let mut pos = FuncCursor::new(func).after_inst(inst); + pos.use_srcloc(inst); + + let is_done = pos + .ins() + .icmp_imm(IntCC::NotEqual, result, 1 << (ty.lane_bits() - 1)); + pos.ins().brnz(is_done, done, &[]); + pos.ins().jump(maybe_trap_block, &[]); + + // We now have the following possibilities: + // + // 1. INT_MIN was actually the correct conversion result. + // 2. The input was NaN -> trap bad_toint + // 3. The input was out of range -> trap int_ovf + // + pos.insert_block(maybe_trap_block); + + // Check for NaN. + let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x); + pos.ins() + .trapnz(is_nan, ir::TrapCode::BadConversionToInteger); + + // Check for case 1: INT_MIN is the correct result. + // Determine the smallest floating point number that would convert to INT_MIN. 
+ let mut overflow_cc = FloatCC::LessThan; + let output_bits = ty.lane_bits(); + let flimit = match xty { + ir::types::F32 => + // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so + // there are values less than -2^(N-1) that convert correctly to INT_MIN. + { + pos.ins().f32const(if output_bits < 32 { + overflow_cc = FloatCC::LessThanOrEqual; + Ieee32::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee32::pow2(output_bits - 1).neg() + }) + } + ir::types::F64 => + // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so + // there are values less than -2^(N-1) that convert correctly to INT_MIN. + { + pos.ins().f64const(if output_bits < 64 { + overflow_cc = FloatCC::LessThanOrEqual; + Ieee64::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee64::pow2(output_bits - 1).neg() + }) + } + _ => panic!("Can't convert {}", xty), + }; + let overflow = pos.ins().fcmp(overflow_cc, x, flimit); + pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow); + + // Finally, we could have a positive value that is too large. + let fzero = match xty { + ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)), + ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)), + _ => panic!("Can't convert {}", xty), + }; + let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero); + pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow); + + pos.ins().jump(done, &[]); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, maybe_trap_block); + cfg.recompute_block(pos.func, done); +} + +fn expand_fcvt_to_sint_sat( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + use crate::ir::immediates::{Ieee32, Ieee64}; + + let x = match func.dfg[inst] { + ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToSintSat, + arg, + } => arg, + _ => panic!( + "Need fcvt_to_sint_sat: {}", + func.dfg.display_inst(inst, None) + ), + }; + + let old_block = func.layout.pp_block(inst); + let xty = func.dfg.value_type(x); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + // Final block after the bad value checks. + let done_block = func.dfg.make_block(); + let intmin_block = func.dfg.make_block(); + let minsat_block = func.dfg.make_block(); + let maxsat_block = func.dfg.make_block(); + func.dfg.clear_results(inst); + func.dfg.attach_block_param(done_block, result); + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or + // overflow. It produces an INT_MIN result instead. + let cvtt2si = pos.ins().x86_cvtt2si(ty, x); + + let is_done = pos + .ins() + .icmp_imm(IntCC::NotEqual, cvtt2si, 1 << (ty.lane_bits() - 1)); + pos.ins().brnz(is_done, done_block, &[cvtt2si]); + pos.ins().jump(intmin_block, &[]); + + // We now have the following possibilities: + // + // 1. INT_MIN was actually the correct conversion result. + // 2. The input was NaN -> replace the result value with 0. + // 3. The input was out of range -> saturate the result to the min/max value. + pos.insert_block(intmin_block); + + // Check for NaN, which is truncated to 0. + let zero = pos.ins().iconst(ty, 0); + let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x); + pos.ins().brnz(is_nan, done_block, &[zero]); + pos.ins().jump(minsat_block, &[]); + + // Check for case 1: INT_MIN is the correct result. 
+ // Determine the smallest floating point number that would convert to INT_MIN. + pos.insert_block(minsat_block); + let mut overflow_cc = FloatCC::LessThan; + let output_bits = ty.lane_bits(); + let flimit = match xty { + ir::types::F32 => + // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so + // there are values less than -2^(N-1) that convert correctly to INT_MIN. + { + pos.ins().f32const(if output_bits < 32 { + overflow_cc = FloatCC::LessThanOrEqual; + Ieee32::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee32::pow2(output_bits - 1).neg() + }) + } + ir::types::F64 => + // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so + // there are values less than -2^(N-1) that convert correctly to INT_MIN. + { + pos.ins().f64const(if output_bits < 64 { + overflow_cc = FloatCC::LessThanOrEqual; + Ieee64::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee64::pow2(output_bits - 1).neg() + }) + } + _ => panic!("Can't convert {}", xty), + }; + + let overflow = pos.ins().fcmp(overflow_cc, x, flimit); + let min_imm = match ty { + ir::types::I32 => i32::min_value() as i64, + ir::types::I64 => i64::min_value(), + _ => panic!("Don't know the min value for {}", ty), + }; + let min_value = pos.ins().iconst(ty, min_imm); + pos.ins().brnz(overflow, done_block, &[min_value]); + pos.ins().jump(maxsat_block, &[]); + + // Finally, we could have a positive value that is too large. + pos.insert_block(maxsat_block); + let fzero = match xty { + ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)), + ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)), + _ => panic!("Can't convert {}", xty), + }; + + let max_imm = match ty { + ir::types::I32 => i32::max_value() as i64, + ir::types::I64 => i64::max_value(), + _ => panic!("Don't know the max value for {}", ty), + }; + let max_value = pos.ins().iconst(ty, max_imm); + + let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero); + pos.ins().brnz(overflow, done_block, &[max_value]); + + // Recycle the original instruction. + pos.func.dfg.replace(inst).jump(done_block, &[cvtt2si]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_block(done_block); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, intmin_block); + cfg.recompute_block(pos.func, minsat_block); + cfg.recompute_block(pos.func, maxsat_block); + cfg.recompute_block(pos.func, done_block); +} + +/// This legalization converts a vector of 32-bit floating point lanes to signed integer lanes +/// using CVTTPS2DQ (see encoding of `x86_cvtt2si`). This logic is separate from [expand_fcvt_to_sint_sat] +/// above (the scalar version), only due to how the transform groups are set up; TODO if we change +/// the SIMD legalization groups, then this logic could be merged into [expand_fcvt_to_sint_sat] +/// (see https://github.com/bytecodealliance/wasmtime/issues/1745). 
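Per value, the saturating behavior built by the scalar block diamond above (and aimed for per lane in the vector version below) reduces to the following sketch; note that Rust's own float-to-int `as` casts have had the same saturating semantics since 1.45, so the model can be cross-checked against `x as i32`:

fn fcvt_to_sint_sat_model(x: f32) -> i32 {
    if x.is_nan() {
        0 // NaN truncates to zero
    } else if x < i32::MIN as f32 {
        i32::MIN // saturate on negative overflow
    } else if x >= i32::MAX as f32 {
        i32::MAX // saturate on positive overflow (i32::MAX as f32 rounds up to 2^31)
    } else {
        x as i32
    }
}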
+fn expand_fcvt_to_sint_sat_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToSintSat, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == I32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); + // We must both quiet any NaNs--setting that lane to 0--and saturate any + // lanes that might overflow during conversion to the highest/lowest signed integer + // allowed in that lane. + + // Saturate NaNs: `fcmp eq` will not match if a lane contains a NaN. We use ANDPS to + // avoid doing the comparison twice (we need the zeroed lanes to find differences). + let zeroed_nans = pos.ins().fcmp(FloatCC::Equal, arg, arg); + let zeroed_nans_bitcast = pos.ins().raw_bitcast(F32X4, zeroed_nans); + let zeroed_nans_copy = pos.ins().band(arg, zeroed_nans_bitcast); + + // Find differences with the zeroed lanes (we will only use the MSB: 1 if positive or + // NaN, 0 otherwise). + let differences = pos.ins().bxor(zeroed_nans_bitcast, arg); + let differences_bitcast = pos.ins().raw_bitcast(I32X4, differences); + + // Convert the numeric lanes. CVTTPS2DQ will mark overflows with 0x80000000 (MSB set). + let converted = pos.ins().x86_cvtt2si(I32X4, zeroed_nans_copy); + + // Create a mask of all 1s only on positive overflow, 0s otherwise. This uses the MSB + // of `differences` (1 when positive or NaN) and the MSB of `converted` (1 on positive + // overflow). + let tmp = pos.ins().band(differences_bitcast, converted); + let mask = pos.ins().sshr_imm(tmp, 31); + + // Apply the mask to create 0x7FFFFFFF for positive overflow. XOR of all 0s (all other + // cases) has no effect. + pos.func.dfg.replace(inst).bxor(converted, mask); + } else { + unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) + } + } +} + +fn expand_fcvt_to_uint( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + use crate::ir::immediates::{Ieee32, Ieee64}; + + let x = match func.dfg[inst] { + ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToUint, + arg, + } => arg, + _ => panic!("Need fcvt_to_uint: {}", func.dfg.display_inst(inst, None)), + }; + + let old_block = func.layout.pp_block(inst); + let xty = func.dfg.value_type(x); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + // block handle numbers < 2^(N-1). + let below_uint_max_block = func.dfg.make_block(); + + // block handle numbers < 0. + let below_zero_block = func.dfg.make_block(); + + // block handling numbers >= 2^(N-1). + let large = func.dfg.make_block(); + + // Final block after the bad value checks. + let done = func.dfg.make_block(); + + // Move the `inst` result value onto the `done` block. + func.dfg.clear_results(inst); + func.dfg.attach_block_param(done, result); + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in + // the destination integer type. 
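For a 64-bit destination, the branching below amounts to this scalar check. It is an illustrative sketch that uses explicit range tests in place of the x86 INT_MIN sentinel (Rust's `as` cast saturates rather than producing that sentinel), and the names are ad hoc:

fn f64_to_u64_checked(x: f64) -> Result<u64, &'static str> {
    let pow63 = 9_223_372_036_854_775_808.0_f64; // 2^63, exactly representable
    if x.is_nan() {
        return Err("bad_toint");
    }
    if x >= pow63 {
        // Large case: bias down into the signed range, convert, then add 2^63 back.
        let adjusted = x - pow63;
        if adjusted >= pow63 {
            return Err("int_ovf"); // value is >= 2^64
        }
        Ok(adjusted as i64 as u64 + (1u64 << 63))
    } else if x <= -1.0 {
        Err("int_ovf") // truncates below zero
    } else {
        Ok(x as i64 as u64) // small case: the signed conversion is enough
    }
}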
+ let pow2nm1 = match xty { + ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)), + ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)), + _ => panic!("Can't convert {}", xty), + }; + let is_large = pos.ins().ffcmp(x, pow2nm1); + pos.ins() + .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]); + pos.ins().jump(below_uint_max_block, &[]); + + // We need to generate a specific trap code when `x` is NaN, so reuse the flags from the + // previous comparison. + pos.insert_block(below_uint_max_block); + pos.ins().trapff( + FloatCC::Unordered, + is_large, + ir::TrapCode::BadConversionToInteger, + ); + + // Now we know that x < 2^(N-1) and not NaN. + let sres = pos.ins().x86_cvtt2si(ty, x); + let is_neg = pos.ins().ifcmp_imm(sres, 0); + pos.ins() + .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]); + pos.ins().jump(below_zero_block, &[]); + + pos.insert_block(below_zero_block); + pos.ins().trap(ir::TrapCode::IntegerOverflow); + + // Handle the case where x >= 2^(N-1) and not NaN. + pos.insert_block(large); + let adjx = pos.ins().fsub(x, pow2nm1); + let lres = pos.ins().x86_cvtt2si(ty, adjx); + let is_neg = pos.ins().ifcmp_imm(lres, 0); + pos.ins() + .trapif(IntCC::SignedLessThan, is_neg, ir::TrapCode::IntegerOverflow); + let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1)); + + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[lfinal]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, below_uint_max_block); + cfg.recompute_block(pos.func, below_zero_block); + cfg.recompute_block(pos.func, large); + cfg.recompute_block(pos.func, done); +} + +fn expand_fcvt_to_uint_sat( + inst: ir::Inst, + func: &mut ir::Function, + cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + use crate::ir::immediates::{Ieee32, Ieee64}; + + let x = match func.dfg[inst] { + ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToUintSat, + arg, + } => arg, + _ => panic!( + "Need fcvt_to_uint_sat: {}", + func.dfg.display_inst(inst, None) + ), + }; + + let old_block = func.layout.pp_block(inst); + let xty = func.dfg.value_type(x); + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + + // block handle numbers < 2^(N-1). + let below_pow2nm1_or_nan_block = func.dfg.make_block(); + let below_pow2nm1_block = func.dfg.make_block(); + + // block handling numbers >= 2^(N-1). + let large = func.dfg.make_block(); + + // block handling numbers < 2^N. + let uint_large_block = func.dfg.make_block(); + + // Final block after the bad value checks. + let done = func.dfg.make_block(); + + // Move the `inst` result value onto the `done` block. + func.dfg.clear_results(inst); + func.dfg.attach_block_param(done, result); + + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in + // the destination integer type. 
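The saturating variant replaces both traps with clamping; per value it behaves like this sketch (which should also agree with Rust's saturating `x as u64` cast):

fn f64_to_u64_sat_model(x: f64) -> u64 {
    let pow63 = 9_223_372_036_854_775_808.0_f64; // 2^63
    if x.is_nan() || x <= -1.0 {
        0 // NaN and negative inputs saturate to the minimum
    } else if x >= pow63 {
        let adjusted = x - pow63;
        if adjusted >= pow63 {
            u64::MAX // >= 2^64 saturates to the maximum
        } else {
            adjusted as i64 as u64 + (1u64 << 63)
        }
    } else {
        x as i64 as u64
    }
}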
+ let pow2nm1 = match xty { + ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)), + ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)), + _ => panic!("Can't convert {}", xty), + }; + let zero = pos.ins().iconst(ty, 0); + let is_large = pos.ins().ffcmp(x, pow2nm1); + pos.ins() + .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]); + pos.ins().jump(below_pow2nm1_or_nan_block, &[]); + + // We need to generate zero when `x` is NaN, so reuse the flags from the previous comparison. + pos.insert_block(below_pow2nm1_or_nan_block); + pos.ins().brff(FloatCC::Unordered, is_large, done, &[zero]); + pos.ins().jump(below_pow2nm1_block, &[]); + + // Now we know that x < 2^(N-1) and not NaN. If the result of the cvtt2si is positive, we're + // done; otherwise saturate to the minimum unsigned value, that is 0. + pos.insert_block(below_pow2nm1_block); + let sres = pos.ins().x86_cvtt2si(ty, x); + let is_neg = pos.ins().ifcmp_imm(sres, 0); + pos.ins() + .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]); + pos.ins().jump(done, &[zero]); + + // Handle the case where x >= 2^(N-1) and not NaN. + pos.insert_block(large); + let adjx = pos.ins().fsub(x, pow2nm1); + let lres = pos.ins().x86_cvtt2si(ty, adjx); + let max_value = pos.ins().iconst( + ty, + match ty { + ir::types::I32 => u32::max_value() as i64, + ir::types::I64 => u64::max_value() as i64, + _ => panic!("Can't convert {}", ty), + }, + ); + let is_neg = pos.ins().ifcmp_imm(lres, 0); + pos.ins() + .brif(IntCC::SignedLessThan, is_neg, done, &[max_value]); + pos.ins().jump(uint_large_block, &[]); + + pos.insert_block(uint_large_block); + let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1)); + + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[lfinal]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_block(done); + + cfg.recompute_block(pos.func, old_block); + cfg.recompute_block(pos.func, below_pow2nm1_or_nan_block); + cfg.recompute_block(pos.func, below_pow2nm1_block); + cfg.recompute_block(pos.func, large); + cfg.recompute_block(pos.func, uint_large_block); + cfg.recompute_block(pos.func, done); +} + +// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4. +static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [ + 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, +]; + +/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes +/// using a long sequence of NaN quieting and truncation. This logic is separate from +/// [expand_fcvt_to_uint_sat] above (the scalar version), only due to how the transform groups are +/// set up; TODO if we change the SIMD legalization groups, then this logic could be merged into +/// [expand_fcvt_to_uint_sat] (see https://github.com/bytecodealliance/wasmtime/issues/1745). 
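The lane bytes in `MAX_SIGNED_I32X4S_AS_F32X4S` above decode as follows; a quick check, assuming the little-endian lane layout used by the constant pool:

fn max_signed_lane() -> f32 {
    // Each lane is the f32 0x4f00_0000, i.e. 2^31, which is also i32::MAX
    // rounded to the nearest f32: the first value that no longer fits in a
    // signed i32 lane under CVTTPS2DQ.
    f32::from_bits(u32::from_le_bytes([0x00, 0x00, 0x00, 0x4f]))
}
// max_signed_lane() == 2_147_483_648.0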
+fn expand_fcvt_to_uint_sat_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToUintSat, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == I32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); + // We must both quiet any NaNs--setting that lane to 0--and saturate any + // lanes that might overflow during conversion to the highest/lowest integer + // allowed in that lane. + let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into()); + let max_signed_constant = pos + .func + .dfg + .constants + .insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into()); + let zeroes = pos.ins().vconst(F32X4, zeroes_constant); + let max_signed = pos.ins().vconst(F32X4, max_signed_constant); + // Clamp the input to 0 for negative floating point numbers. TODO we need to + // convert NaNs to 0 but this doesn't do that? + let ge_zero = pos.ins().x86_fmax(arg, zeroes); + // Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert. + // For floating point numbers above this, CVTTPS2DQ returns the undefined value + // 0x80000000. + let minus_max_signed = pos.ins().fsub(ge_zero, max_signed); + let le_max_signed = + pos.ins() + .fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed); + // Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0. + // These lanes have the MSB set to 1 after the XOR. We are trying to calculate a + // valid, in-range addend. + let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed); + let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed); + let difference = pos + .ins() + .bxor(minus_max_signed_as_int, le_max_signed_as_int); + // Calculate amount to add above 0x7FFFFFF, zeroing out any lanes identified + // previously (MSB set to 1). + let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes); + let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int); + // Convert the original clamped number to an integer and add back in the addend + // (the part of the value above 0x7FFFFFF, since CVTTPS2DQ overflows with these). + let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero); + pos.func.dfg.replace(inst).iadd(converted, addend); + } else { + unreachable!( + "{} should not be legalized in expand_fcvt_to_uint_sat_vector", + pos.func.dfg.display_inst(inst, None) + ) + } + } +} + +/// Convert shuffle instructions. +fn convert_shuffle( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] { + // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a 1 + // in the most significant position zeroes the lane. + let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; + + // We only have to worry about aliasing here because copies will be introduced later (in + // regalloc). 
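The mask construction below relies on PSHUFB's byte-selection rule, which can be modelled per vector as follows (a sketch of the instruction's semantics, not library code):

// PSHUFB: for each byte of the mask, the top bit zeroes the output lane and
// the low four bits select which source lane to copy.
fn pshufb(src: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if mask[i] & 0x80 != 0 {
            0
        } else {
            src[(mask[i] & 0x0f) as usize]
        };
    }
    out
}

The two-input case below shuffles each source with its own mask (lanes belonging to the other source are zeroed) and ORs the two results together.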
+ let a = pos.func.dfg.resolve_aliases(args[0]); + let b = pos.func.dfg.resolve_aliases(args[1]); + let mask = pos + .func + .dfg + .immediates + .get(mask) + .expect("The shuffle immediate should have been recorded before this point") + .clone(); + if a == b { + // PSHUFB the first argument (since it is the same as the second). + let constructed_mask = mask + .iter() + // If the mask is greater than 15 it still may be referring to a lane in b. + .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) + .map(zero_unknown_lane_index) + .collect(); + let handle = pos.func.dfg.constants.insert(constructed_mask); + // Move the built mask into another XMM register. + let a_type = pos.func.dfg.value_type(a); + let mask_value = pos.ins().vconst(a_type, handle); + // Shuffle the single incoming argument. + pos.func.dfg.replace(inst).x86_pshufb(a, mask_value); + } else { + // PSHUFB the first argument, placing zeroes for unused lanes. + let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); + let handle = pos.func.dfg.constants.insert(constructed_mask); + // Move the built mask into another XMM register. + let a_type = pos.func.dfg.value_type(a); + let mask_value = pos.ins().vconst(a_type, handle); + // Shuffle the first argument. + let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value); + + // PSHUFB the second argument, placing zeroes for unused lanes. + let constructed_mask = mask + .iter() + .map(|b| b.wrapping_sub(16)) + .map(zero_unknown_lane_index) + .collect(); + let handle = pos.func.dfg.constants.insert(constructed_mask); + // Move the built mask into another XMM register. + let b_type = pos.func.dfg.value_type(b); + let mask_value = pos.ins().vconst(b_type, handle); + // Shuffle the second argument. + let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value); + + // OR the vectors together to form the final shuffled value. + pos.func + .dfg + .replace(inst) + .bor(shuffled_first_arg, shuffled_second_arg); + + // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB + }; + } +} + +/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF +/// extractlane instruction +fn convert_extractlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::BinaryImm8 { + opcode: ir::Opcode::Extractlane, + arg, + imm: lane, + } = pos.func.dfg[inst] + { + // NOTE: the following legalization assumes that the upper bits of the XMM register do + // not need to be zeroed during extractlane. + let value_type = pos.func.dfg.value_type(arg); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + let shuffled = if lane != 0 { + // Replace the extractlane with a PSHUFD to get the float in the right place. + match value_type { + F32X4 => { + // Move the selected lane to the 0 lane. + let shuffle_mask: u8 = 0b00_00_00_00 | lane; + pos.ins().x86_pshufd(arg, shuffle_mask) + } + F64X2 => { + assert_eq!(lane, 1); + // Because we know the lane == 1, we move the upper 64 bits to the lower + // 64 bits, leaving the top 64 bits as-is. + let shuffle_mask = 0b11_10_11_10; + let bitcast = pos.ins().raw_bitcast(F32X4, arg); + pos.ins().x86_pshufd(bitcast, shuffle_mask) + } + _ => unreachable!(), + } + } else { + // Remove the extractlane instruction, leaving the float where it is. 
+ arg + }; + // Then we must bitcast to the right type. + pos.func + .dfg + .replace(inst) + .raw_bitcast(value_type.lane_type(), shuffled); + } else { + // For non-floats, lower with the usual PEXTR* instruction. + pos.func.dfg.replace(inst).x86_pextr(arg, lane); + } + } +} + +/// Because floats exist in XMM registers, we can keep them there when executing a CLIF +/// insertlane instruction +fn convert_insertlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::TernaryImm8 { + opcode: ir::Opcode::Insertlane, + args: [vector, replacement], + imm: lane, + } = pos.func.dfg[inst] + { + let value_type = pos.func.dfg.value_type(vector); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + match value_type { + F32X4 => { + assert!(lane <= 3); + let immediate = 0b00_00_00_00 | lane << 4; + // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane + // shifted into bits 5:6). + pos.func + .dfg + .replace(inst) + .x86_insertps(vector, replacement, immediate) + } + F64X2 => { + let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types + if lane == 0 { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + pos.func + .dfg + .replace(inst) + .x86_movsd(vector, replacement_as_vector) + } else { + assert_eq!(lane, 1); + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + pos.func + .dfg + .replace(inst) + .x86_movlhps(vector, replacement_as_vector) + } + } + _ => unreachable!(), + }; + } else { + // For non-floats, lower with the usual PINSR* instruction. + pos.func + .dfg + .replace(inst) + .x86_pinsr(vector, replacement, lane); + } + } +} + +/// For SIMD or scalar integer negation, convert `ineg` to `vconst + isub` or `iconst + isub`. 
+fn convert_ineg( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::Ineg, + arg, + } = pos.func.dfg[inst] + { + let value_type = pos.func.dfg.value_type(arg); + let zero_value = if value_type.is_vector() && value_type.lane_type().is_int() { + let zero_immediate = pos.func.dfg.constants.insert(vec![0; 16].into()); + pos.ins().vconst(value_type, zero_immediate) // this should be legalized to a PXOR + } else if value_type.is_int() { + pos.ins().iconst(value_type, 0) + } else { + panic!("Can't convert ineg of type {}", value_type) + }; + pos.func.dfg.replace(inst).isub(zero_value, arg); + } else { + unreachable!() + } +} + +fn expand_dword_to_xmm<'f>( + pos: &mut FuncCursor<'_>, + arg: ir::Value, + arg_type: ir::Type, +) -> ir::Value { + if arg_type == I64 { + let (arg_lo, arg_hi) = pos.ins().isplit(arg); + let arg = pos.ins().scalar_to_vector(I32X4, arg_lo); + let arg = pos.ins().insertlane(arg, arg_hi, 1); + let arg = pos.ins().raw_bitcast(I64X2, arg); + arg + } else { + pos.ins().bitcast(I64X2, arg) + } +} + +fn contract_dword_from_xmm<'f>( + pos: &mut FuncCursor<'f>, + inst: ir::Inst, + ret: ir::Value, + ret_type: ir::Type, +) { + if ret_type == I64 { + let ret = pos.ins().raw_bitcast(I32X4, ret); + let ret_lo = pos.ins().extractlane(ret, 0); + let ret_hi = pos.ins().extractlane(ret, 1); + pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi); + } else { + let ret = pos.ins().extractlane(ret, 0); + pos.func.dfg.replace(inst).ireduce(ret_type, ret); + } +} + +// Masks for i8x16 unsigned right shift. +static USHR_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, +]; + +// Convert a vector unsigned right shift. x86 has implementations for i16x8 and up (see `x86_pslr`), +// but for i8x16 we translate the shift to a i16x8 shift and mask off the upper bits. This same +// conversion could be provided in the CDSL if we could use varargs there (TODO); i.e. `load_complex` +// has a varargs field that we can't modify with the CDSL in legalize.rs. 
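The shift-then-mask step can be modelled per 16-bit pair as below. This is an illustrative sketch, with the shift amount `k` assumed to be already bounded to 0..=7 as noted in the conversion that follows:

fn ushr_i8x16_via_i16x8(v: [u8; 16], k: u32) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..8 {
        // Two adjacent bytes form one i16x8 lane (little-endian).
        let lane = u16::from_le_bytes([v[2 * i], v[2 * i + 1]]);
        let shifted = (lane >> k).to_le_bytes();
        // 0xff >> k is the mask row that `ishl_imm(arg1, 4)` selects below: it
        // clears the bits that leaked from the high byte into the low byte.
        let mask = 0xffu8 >> k;
        out[2 * i] = shifted[0] & mask;
        out[2 * i + 1] = shifted[1] & mask;
    }
    out
}
// Equivalent to shifting each byte independently: out[i] == v[i] >> k.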
+fn convert_ushr( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Binary { + opcode: ir::Opcode::Ushr, + args: [arg0, arg1], + } = pos.func.dfg[inst] + { + // Note that for Wasm, the bounding of the shift index has happened during translation + let arg0_type = pos.func.dfg.value_type(arg0); + let arg1_type = pos.func.dfg.value_type(arg1); + assert!(!arg1_type.is_vector() && arg1_type.is_int()); + + // TODO it may be more clear to use scalar_to_vector here; the current issue is that + // scalar_to_vector has the restriction that the vector produced has a matching lane size + // (e.g. i32 -> i32x4) whereas bitcast allows moving any-to-any conversions (e.g. i32 -> + // i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift + // index type--this could be relaxed since it is not really meaningful. + let shift_index = pos.ins().bitcast(I64X2, arg1); + + if arg0_type == I8X16 { + // First, shift the vector using an I16X8 shift. + let bitcasted = pos.ins().raw_bitcast(I16X8, arg0); + let shifted = pos.ins().x86_psrl(bitcasted, shift_index); + let shifted = pos.ins().raw_bitcast(I8X16, shifted); + + // Then, fixup the even lanes that have incorrect upper bits. This uses the 128 mask + // bytes as a table that we index into. It is a substantial code-size increase but + // reduces the instruction count slightly. + let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into()); + let mask_address = pos.ins().const_addr(isa.pointer_type(), masks); + let mask_offset = pos.ins().ishl_imm(arg1, 4); + let mask = + pos.ins() + .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0); + pos.func.dfg.replace(inst).band(shifted, mask); + } else if arg0_type.is_vector() { + // x86 has encodings for these shifts. + pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index); + } else if arg0_type == I64 { + // 64 bit shifts need to be legalized on x86_32. + let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.has_sse41() { + // if we have pinstrq/pextrq (SSE 4.1), legalize to that + let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type); + let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type); + let shifted = pos.ins().x86_psrl(value, amount); + contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type); + } else { + // otherwise legalize to libcall + expand_as_libcall(inst, func, isa); + } + } else { + // Everything else should be already legal. + unreachable!() + } + } +} + +// Masks for i8x16 left shift. 
+static SHL_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +]; + +// Convert a vector left shift. x86 has implementations for i16x8 and up (see `x86_psll`), +// but for i8x16 we translate the shift to a i16x8 shift and mask off the lower bits. This same +// conversion could be provided in the CDSL if we could use varargs there (TODO); i.e. `load_complex` +// has a varargs field that we can't modify with the CDSL in legalize.rs. +fn convert_ishl( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Binary { + opcode: ir::Opcode::Ishl, + args: [arg0, arg1], + } = pos.func.dfg[inst] + { + // Note that for Wasm, the bounding of the shift index has happened during translation + let arg0_type = pos.func.dfg.value_type(arg0); + let arg1_type = pos.func.dfg.value_type(arg1); + assert!(!arg1_type.is_vector() && arg1_type.is_int()); + + // TODO it may be more clear to use scalar_to_vector here; the current issue is that + // scalar_to_vector has the restriction that the vector produced has a matching lane size + // (e.g. i32 -> i32x4) whereas bitcast allows moving any-to-any conversions (e.g. i32 -> + // i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift + // index type--this could be relaxed since it is not really meaningful. + let shift_index = pos.ins().bitcast(I64X2, arg1); + + if arg0_type == I8X16 { + // First, shift the vector using an I16X8 shift. + let bitcasted = pos.ins().raw_bitcast(I16X8, arg0); + let shifted = pos.ins().x86_psll(bitcasted, shift_index); + let shifted = pos.ins().raw_bitcast(I8X16, shifted); + + // Then, fixup the even lanes that have incorrect lower bits. This uses the 128 mask + // bytes as a table that we index into. It is a substantial code-size increase but + // reduces the instruction count slightly. + let masks = pos.func.dfg.constants.insert(SHL_MASKS.as_ref().into()); + let mask_address = pos.ins().const_addr(isa.pointer_type(), masks); + let mask_offset = pos.ins().ishl_imm(arg1, 4); + let mask = + pos.ins() + .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0); + pos.func.dfg.replace(inst).band(shifted, mask); + } else if arg0_type.is_vector() { + // x86 has encodings for these shifts. + pos.func.dfg.replace(inst).x86_psll(arg0, shift_index); + } else if arg0_type == I64 { + // 64 bit shifts need to be legalized on x86_32. 
+ let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.has_sse41() { + // if we have pinstrq/pextrq (SSE 4.1), legalize to that + let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type); + let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type); + let shifted = pos.ins().x86_psll(value, amount); + contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type); + } else { + // otherwise legalize to libcall + expand_as_libcall(inst, func, isa); + } + } else { + // Everything else should be already legal. + unreachable!() + } + } +} + +/// Convert an imul.i64x2 to a valid code sequence on x86, first with AVX512 and then with SSE2. +fn convert_i64x2_imul( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Binary { + opcode: ir::Opcode::Imul, + args: [arg0, arg1], + } = pos.func.dfg[inst] + { + let ty = pos.func.dfg.ctrl_typevar(inst); + if ty == I64X2 { + let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() { + // If we have certain AVX512 features, we can lower this instruction simply. + pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1); + } else { + // Otherwise, we default to a very lengthy SSE2-compatible sequence. It splits each + // 64-bit lane into 32-bit high and low sections using shifting and then performs + // the following arithmetic per lane: with arg0 = concat(high0, low0) and arg1 = + // concat(high1, low1), calculate (high0 * low1) + (high1 * low0) + (low0 * low1). 
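Per lane, the identity being used is ordinary 64-bit multiplication rebuilt from 32-bit halves. A scalar sketch, where `pmuludq` stands in for the unsigned 32x32-to-64 multiply of each lane's low half:

fn mul64_from_halves(a: u64, b: u64) -> u64 {
    // PMULUDQ: unsigned multiply of the low 32 bits of each 64-bit lane.
    let pmuludq = |x: u64, y: u64| (x as u32 as u64) * (y as u32 as u64);
    let cross = pmuludq(a >> 32, b)         // high(a) * low(b)
        .wrapping_add(pmuludq(b >> 32, a)); // high(b) * low(a)
    let low = pmuludq(a, b);                // low(a) * low(b), full 64-bit product
    low.wrapping_add(cross << 32)           // high(a)*high(b)*2^64 vanishes mod 2^64
}
// mul64_from_halves(a, b) == a.wrapping_mul(b) for all a, b.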
+ let high0 = pos.ins().ushr_imm(arg0, 32); + let mul0 = pos.ins().x86_pmuludq(high0, arg1); + let high1 = pos.ins().ushr_imm(arg1, 32); + let mul1 = pos.ins().x86_pmuludq(high1, arg0); + let addhigh = pos.ins().iadd(mul0, mul1); + let high = pos.ins().ishl_imm(addhigh, 32); + let low = pos.ins().x86_pmuludq(arg0, arg1); + pos.func.dfg.replace(inst).iadd(low, high); + } + } else { + unreachable!( + "{} should be encodable; it cannot be legalized by convert_i64x2_imul", + pos.func.dfg.display_inst(inst, None) + ); + } + } +} + +fn expand_tls_value( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + use crate::settings::TlsModel; + + assert!( + isa.triple().architecture == target_lexicon::Architecture::X86_64, + "Not yet implemented for {:?}", + isa.triple(), + ); + + if let ir::InstructionData::UnaryGlobalValue { + opcode: ir::Opcode::TlsValue, + global_value, + } = func.dfg[inst] + { + let ctrl_typevar = func.dfg.ctrl_typevar(inst); + assert_eq!(ctrl_typevar, ir::types::I64); + + match isa.flags().tls_model() { + TlsModel::None => panic!("tls_model flag is not set."), + TlsModel::ElfGd => { + func.dfg.replace(inst).x86_elf_tls_get_addr(global_value); + } + TlsModel::Macho => { + func.dfg.replace(inst).x86_macho_tls_get_addr(global_value); + } + model => unimplemented!("tls_value for tls model {:?}", model), + } + } else { + unreachable!(); + } +} + +fn expand_load_splat( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + + pos.use_srcloc(inst); + + let (ptr, offset, flags) = match pos.func.dfg[inst] { + ir::InstructionData::Load { + opcode: ir::Opcode::LoadSplat, + arg, + offset, + flags, + } => (arg, offset, flags), + _ => panic!( + "Expected load_splat: {}", + pos.func.dfg.display_inst(inst, None) + ), + }; + let ty = pos.func.dfg.ctrl_typevar(inst); + let load = pos.ins().load(ty.lane_type(), flags, ptr, offset); + + pos.func.dfg.replace(inst).splat(ty, load); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs new file mode 100644 index 0000000000..cbdeb3069d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/mod.rs @@ -0,0 +1,190 @@ +//! x86 Instruction Set Architectures. + +mod abi; +mod binemit; +mod enc_tables; +mod registers; +pub mod settings; +#[cfg(feature = "unwind")] +pub mod unwind; + +use super::super::settings as shared_settings; +#[cfg(feature = "testing_hooks")] +use crate::binemit::CodeSink; +use crate::binemit::{emit_function, MemoryCodeSink}; +use crate::ir; +use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encodings}; +use crate::isa::Builder as IsaBuilder; +#[cfg(feature = "unwind")] +use crate::isa::{unwind::systemv::RegisterMappingError, RegUnit}; +use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa}; +use crate::regalloc; +use crate::result::CodegenResult; +use crate::timing; +use alloc::borrow::Cow; +use alloc::boxed::Box; +use core::any::Any; +use core::fmt; +use target_lexicon::{PointerWidth, Triple}; + +#[allow(dead_code)] +struct Isa { + triple: Triple, + shared_flags: shared_settings::Flags, + isa_flags: settings::Flags, + cpumode: &'static [shared_enc_tables::Level1Entry<u16>], +} + +/// Get an ISA builder for creating x86 targets. 
+pub fn isa_builder(triple: Triple) -> IsaBuilder { + IsaBuilder { + triple, + setup: settings::builder(), + constructor: isa_constructor, + } +} + +fn isa_constructor( + triple: Triple, + shared_flags: shared_settings::Flags, + builder: shared_settings::Builder, +) -> Box<dyn TargetIsa> { + let level1 = match triple.pointer_width().unwrap() { + PointerWidth::U16 => unimplemented!("x86-16"), + PointerWidth::U32 => &enc_tables::LEVEL1_I32[..], + PointerWidth::U64 => &enc_tables::LEVEL1_I64[..], + }; + + let isa_flags = settings::Flags::new(&shared_flags, builder); + + Box::new(Isa { + triple, + isa_flags, + shared_flags, + cpumode: level1, + }) +} + +impl TargetIsa for Isa { + fn name(&self) -> &'static str { + "x86" + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &shared_settings::Flags { + &self.shared_flags + } + + fn uses_cpu_flags(&self) -> bool { + true + } + + fn uses_complex_addresses(&self) -> bool { + true + } + + fn register_info(&self) -> RegInfo { + registers::INFO.clone() + } + + #[cfg(feature = "unwind")] + fn map_dwarf_register(&self, reg: RegUnit) -> Result<u16, RegisterMappingError> { + unwind::systemv::map_reg(self, reg).map(|r| r.0) + } + + fn encoding_info(&self) -> EncInfo { + enc_tables::INFO.clone() + } + + fn legal_encodings<'a>( + &'a self, + func: &'a ir::Function, + inst: &'a ir::InstructionData, + ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + lookup_enclist( + ctrl_typevar, + inst, + func, + self.cpumode, + &enc_tables::LEVEL2[..], + &enc_tables::ENCLISTS[..], + &enc_tables::LEGALIZE_ACTIONS[..], + &enc_tables::RECIPE_PREDICATES[..], + &enc_tables::INST_PREDICATES[..], + self.isa_flags.predicate_view(), + ) + } + + fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) { + abi::legalize_signature( + sig, + &self.triple, + current, + &self.shared_flags, + &self.isa_flags, + ) + } + + fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass { + abi::regclass_for_abi_type(ty) + } + + fn allocatable_registers(&self, _func: &ir::Function) -> regalloc::RegisterSet { + abi::allocatable_registers(&self.triple, &self.shared_flags) + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + func: &ir::Function, + inst: ir::Inst, + divert: &mut regalloc::RegDiversions, + sink: &mut dyn CodeSink, + ) { + binemit::emit_inst(func, inst, divert, sink, self) + } + + fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) { + emit_function(func, binemit::emit_inst, sink, self) + } + + fn prologue_epilogue(&self, func: &mut ir::Function) -> CodegenResult<()> { + let _tt = timing::prologue_epilogue(); + abi::prologue_epilogue(func, self) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + ir::condcodes::IntCC::UnsignedLessThan + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + ir::condcodes::IntCC::UnsignedLessThan + } + + #[cfg(feature = "unwind")] + fn create_unwind_info( + &self, + func: &ir::Function, + ) -> CodegenResult<Option<super::unwind::UnwindInfo>> { + abi::create_unwind_info(func, self) + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + Some(unwind::systemv::create_cie()) + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} + +impl fmt::Display for Isa { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}\n{}", self.shared_flags, self.isa_flags) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs 
b/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs new file mode 100644 index 0000000000..a7518b268b --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/registers.rs @@ -0,0 +1,86 @@ +//! x86 register descriptions. + +use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit}; + +include!(concat!(env!("OUT_DIR"), "/registers-x86.rs")); + +#[cfg(test)] +mod tests { + use super::*; + use crate::isa::RegUnit; + use alloc::string::{String, ToString}; + + #[test] + fn unit_encodings() { + fn gpr(unit: usize) -> Option<u16> { + Some(GPR.unit(unit)) + } + // The encoding of integer registers is not alphabetical. + assert_eq!(INFO.parse_regunit("rax"), gpr(0)); + assert_eq!(INFO.parse_regunit("rbx"), gpr(3)); + assert_eq!(INFO.parse_regunit("rcx"), gpr(1)); + assert_eq!(INFO.parse_regunit("rdx"), gpr(2)); + assert_eq!(INFO.parse_regunit("rsi"), gpr(6)); + assert_eq!(INFO.parse_regunit("rdi"), gpr(7)); + assert_eq!(INFO.parse_regunit("rbp"), gpr(5)); + assert_eq!(INFO.parse_regunit("rsp"), gpr(4)); + assert_eq!(INFO.parse_regunit("r8"), gpr(8)); + assert_eq!(INFO.parse_regunit("r15"), gpr(15)); + + fn fpr(unit: usize) -> Option<u16> { + Some(FPR.unit(unit)) + } + assert_eq!(INFO.parse_regunit("xmm0"), fpr(0)); + assert_eq!(INFO.parse_regunit("xmm15"), fpr(15)); + + // FIXME(#1306) Add these tests back in when FPR32 is re-added. + // fn fpr32(unit: usize) -> Option<u16> { + // Some(FPR32.unit(unit)) + // } + // assert_eq!(INFO.parse_regunit("xmm0"), fpr32(0)); + // assert_eq!(INFO.parse_regunit("xmm31"), fpr32(31)); + } + + #[test] + fn unit_names() { + fn gpr(ru: RegUnit) -> String { + INFO.display_regunit(GPR.first + ru).to_string() + } + assert_eq!(gpr(0), "%rax"); + assert_eq!(gpr(3), "%rbx"); + assert_eq!(gpr(1), "%rcx"); + assert_eq!(gpr(2), "%rdx"); + assert_eq!(gpr(6), "%rsi"); + assert_eq!(gpr(7), "%rdi"); + assert_eq!(gpr(5), "%rbp"); + assert_eq!(gpr(4), "%rsp"); + assert_eq!(gpr(8), "%r8"); + assert_eq!(gpr(15), "%r15"); + + fn fpr(ru: RegUnit) -> String { + INFO.display_regunit(FPR.first + ru).to_string() + } + assert_eq!(fpr(0), "%xmm0"); + assert_eq!(fpr(15), "%xmm15"); + + // FIXME(#1306) Add these tests back in when FPR32 is re-added. + // fn fpr32(ru: RegUnit) -> String { + // INFO.display_regunit(FPR32.first + ru).to_string() + // } + // assert_eq!(fpr32(0), "%xmm0"); + // assert_eq!(fpr32(31), "%xmm31"); + } + + #[test] + fn regclasses() { + assert_eq!(GPR.intersect_index(GPR), Some(GPR.into())); + assert_eq!(GPR.intersect_index(ABCD), Some(ABCD.into())); + assert_eq!(GPR.intersect_index(FPR), None); + assert_eq!(ABCD.intersect_index(GPR), Some(ABCD.into())); + assert_eq!(ABCD.intersect_index(ABCD), Some(ABCD.into())); + assert_eq!(ABCD.intersect_index(FPR), None); + assert_eq!(FPR.intersect_index(FPR), Some(FPR.into())); + assert_eq!(FPR.intersect_index(GPR), None); + assert_eq!(FPR.intersect_index(ABCD), None); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs new file mode 100644 index 0000000000..2d3a3f6698 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/settings.rs @@ -0,0 +1,52 @@ +//! x86 Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/x86/settings.rs`. 
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs")); + +#[cfg(test)] +mod tests { + use super::{builder, Flags}; + use crate::settings::{self, Configurable}; + + #[test] + fn presets() { + let shared = settings::Flags::new(settings::builder()); + + // Nehalem has SSE4.1 but not BMI1. + let mut b0 = builder(); + b0.enable("nehalem").unwrap(); + let f0 = Flags::new(&shared, b0); + assert_eq!(f0.has_sse41(), true); + assert_eq!(f0.has_bmi1(), false); + + let mut b1 = builder(); + b1.enable("haswell").unwrap(); + let f1 = Flags::new(&shared, b1); + assert_eq!(f1.has_sse41(), true); + assert_eq!(f1.has_bmi1(), true); + } + #[test] + fn display_presets() { + // Spot check that the flags Display impl does not cause a panic + let shared = settings::Flags::new(settings::builder()); + + let b0 = builder(); + let f0 = Flags::new(&shared, b0); + let _ = format!("{}", f0); + + let mut b1 = builder(); + b1.enable("nehalem").unwrap(); + let f1 = Flags::new(&shared, b1); + let _ = format!("{}", f1); + + let mut b2 = builder(); + b2.enable("haswell").unwrap(); + let f2 = Flags::new(&shared, b2); + let _ = format!("{}", f2); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs new file mode 100644 index 0000000000..2d6b29f04d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind.rs @@ -0,0 +1,535 @@ +//! Module for x86 unwind generation for supported ABIs. + +pub mod systemv; +pub mod winx64; + +use crate::ir::{Function, InstructionData, Opcode, ValueLoc}; +use crate::isa::x86::registers::{FPR, RU}; +use crate::isa::{RegUnit, TargetIsa}; +use crate::result::CodegenResult; +use alloc::vec::Vec; +use std::collections::HashMap; + +use crate::isa::unwind::input::{UnwindCode, UnwindInfo}; + +pub(crate) fn create_unwind_info( + func: &Function, + isa: &dyn TargetIsa, +) -> CodegenResult<Option<UnwindInfo<RegUnit>>> { + // Find last block based on max offset. + let last_block = func + .layout + .blocks() + .max_by_key(|b| func.offsets[*b]) + .expect("at least a block"); + // Find last instruction offset + size, and make it function size. + let function_size = func + .inst_offsets(last_block, &isa.encoding_info()) + .fold(0, |_, (offset, _, size)| offset + size); + + let entry_block = func.layout.entry_block().expect("missing entry block"); + let prologue_end = func.prologue_end.unwrap(); + let epilogues_start = func + .epilogues_start + .iter() + .map(|(i, b)| (*b, *i)) + .collect::<HashMap<_, _>>(); + + let word_size = isa.pointer_bytes(); + + let mut stack_size = None; + let mut prologue_size = 0; + let mut prologue_unwind_codes = Vec::new(); + let mut epilogues_unwind_codes = Vec::new(); + let mut frame_register: Option<RegUnit> = None; + + // Process only entry block and blocks with epilogues. 
+ let mut blocks = func + .epilogues_start + .iter() + .map(|(_, b)| *b) + .collect::<Vec<_>>(); + if !blocks.contains(&entry_block) { + blocks.push(entry_block); + } + blocks.sort_by_key(|b| func.offsets[*b]); + + for block in blocks.iter() { + let mut in_prologue = block == &entry_block; + let mut in_epilogue = false; + let mut epilogue_pop_offsets = Vec::new(); + + let epilogue_start = epilogues_start.get(block); + let is_last_block = block == &last_block; + + for (offset, inst, size) in func.inst_offsets(*block, &isa.encoding_info()) { + let offset = offset + size; + + let unwind_codes; + if in_prologue { + // Check for prologue end (inclusive) + if prologue_end == inst { + in_prologue = false; + } + prologue_size += size; + unwind_codes = &mut prologue_unwind_codes; + } else if !in_epilogue && epilogue_start == Some(&inst) { + // Now in an epilogue, emit a remember state instruction if not last block + in_epilogue = true; + + epilogues_unwind_codes.push(Vec::new()); + unwind_codes = epilogues_unwind_codes.last_mut().unwrap(); + + if !is_last_block { + unwind_codes.push((offset, UnwindCode::RememberState)); + } + } else if in_epilogue { + unwind_codes = epilogues_unwind_codes.last_mut().unwrap(); + } else { + // Ignore normal instructions + continue; + } + + match func.dfg[inst] { + InstructionData::Unary { opcode, arg } => { + match opcode { + Opcode::X86Push => { + let reg = func.locations[arg].unwrap_reg(); + unwind_codes.push(( + offset, + UnwindCode::StackAlloc { + size: word_size.into(), + }, + )); + unwind_codes.push(( + offset, + UnwindCode::SaveRegister { + reg, + stack_offset: 0, + }, + )); + } + Opcode::AdjustSpDown => { + let stack_size = + stack_size.expect("expected a previous stack size instruction"); + + // This is used when calling a stack check function + // We need to track the assignment to RAX which has the size of the stack + unwind_codes + .push((offset, UnwindCode::StackAlloc { size: stack_size })); + } + _ => {} + } + } + InstructionData::UnaryImm { opcode, imm } => { + match opcode { + Opcode::Iconst => { + let imm: i64 = imm.into(); + assert!(imm <= core::u32::MAX as i64); + assert!(stack_size.is_none()); + + // This instruction should only appear in a prologue to pass an + // argument of the stack size to a stack check function. + // Record the stack size so we know what it is when we encounter the adjustment + // instruction (which will adjust via the register assigned to this instruction). + stack_size = Some(imm as u32); + } + Opcode::AdjustSpDownImm => { + let imm: i64 = imm.into(); + assert!(imm <= core::u32::MAX as i64); + + stack_size = Some(imm as u32); + + unwind_codes + .push((offset, UnwindCode::StackAlloc { size: imm as u32 })); + } + Opcode::AdjustSpUpImm => { + let imm: i64 = imm.into(); + assert!(imm <= core::u32::MAX as i64); + + stack_size = Some(imm as u32); + + unwind_codes + .push((offset, UnwindCode::StackDealloc { size: imm as u32 })); + } + _ => {} + } + } + InstructionData::Store { + opcode: Opcode::Store, + args: [arg1, arg2], + offset: stack_offset, + .. 
+ } => { + if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) = + (func.locations[arg1], func.locations[arg2]) + { + // If this is a save of an FPR, record an unwind operation + // Note: the stack_offset here is relative to an adjusted SP + if dst == (RU::rsp as RegUnit) && FPR.contains(src) { + let stack_offset: i32 = stack_offset.into(); + unwind_codes.push(( + offset, + UnwindCode::SaveRegister { + reg: src, + stack_offset: stack_offset as u32, + }, + )); + } + } + } + InstructionData::CopySpecial { src, dst, .. } if frame_register.is_none() => { + // Check for change in CFA register (RSP is always the starting CFA) + if src == (RU::rsp as RegUnit) { + unwind_codes.push((offset, UnwindCode::SetFramePointer { reg: dst })); + frame_register = Some(dst); + } + } + InstructionData::NullAry { opcode } => match opcode { + Opcode::X86Pop => { + epilogue_pop_offsets.push(offset); + } + _ => {} + }, + InstructionData::MultiAry { opcode, .. } if in_epilogue => match opcode { + Opcode::Return => { + let args = func.dfg.inst_args(inst); + for (i, arg) in args.iter().rev().enumerate() { + // Only walk back the args for the pop instructions encountered + if i >= epilogue_pop_offsets.len() { + break; + } + + let offset = epilogue_pop_offsets[i]; + + let reg = func.locations[*arg].unwrap_reg(); + unwind_codes.push((offset, UnwindCode::RestoreRegister { reg })); + unwind_codes.push(( + offset, + UnwindCode::StackDealloc { + size: word_size.into(), + }, + )); + + if Some(reg) == frame_register { + unwind_codes.push((offset, UnwindCode::RestoreFramePointer)); + // Keep frame_register assigned for next epilogue. + } + } + epilogue_pop_offsets.clear(); + + // TODO ensure unwind codes sorted by offsets ? + + if !is_last_block { + unwind_codes.push((offset, UnwindCode::RestoreState)); + } + + in_epilogue = false; + } + _ => {} + }, + _ => {} + }; + } + } + + Ok(Some(UnwindInfo { + prologue_size, + prologue_unwind_codes, + epilogues_unwind_codes, + function_size, + word_size, + initial_sp_offset: word_size, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ + types, AbiParam, ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_small_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + prologue_size: 9, + prologue_unwind_codes: vec![ + (2, UnwindCode::StackAlloc { size: 8 }), + ( + 2, + UnwindCode::SaveRegister { + reg: RU::rbp.into(), + stack_offset: 0, + } + ), + ( + 5, + UnwindCode::SetFramePointer { + reg: RU::rbp.into(), + } + ), + (9, UnwindCode::StackAlloc { size: 64 }) + ], + epilogues_unwind_codes: vec![vec![ + (13, UnwindCode::StackDealloc { size: 64 }), + ( + 15, + UnwindCode::RestoreRegister { + reg: RU::rbp.into() + } + ), + (15, UnwindCode::StackDealloc { size: 8 }), + (15, UnwindCode::RestoreFramePointer) + ]], + function_size: 16, + word_size: 
8, + initial_sp_offset: 8, + } + ); + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_medium_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 10000)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + prologue_size: 27, + prologue_unwind_codes: vec![ + (2, UnwindCode::StackAlloc { size: 8 }), + ( + 2, + UnwindCode::SaveRegister { + reg: RU::rbp.into(), + stack_offset: 0, + } + ), + ( + 5, + UnwindCode::SetFramePointer { + reg: RU::rbp.into(), + } + ), + (27, UnwindCode::StackAlloc { size: 10000 }) + ], + epilogues_unwind_codes: vec![vec![ + (34, UnwindCode::StackDealloc { size: 10000 }), + ( + 36, + UnwindCode::RestoreRegister { + reg: RU::rbp.into() + } + ), + (36, UnwindCode::StackDealloc { size: 8 }), + (36, UnwindCode::RestoreFramePointer) + ]], + function_size: 37, + word_size: 8, + initial_sp_offset: 8, + } + ); + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_large_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 1000000)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + prologue_size: 27, + prologue_unwind_codes: vec![ + (2, UnwindCode::StackAlloc { size: 8 }), + ( + 2, + UnwindCode::SaveRegister { + reg: RU::rbp.into(), + stack_offset: 0, + } + ), + ( + 5, + UnwindCode::SetFramePointer { + reg: RU::rbp.into(), + } + ), + (27, UnwindCode::StackAlloc { size: 1000000 }) + ], + epilogues_unwind_codes: vec![vec![ + (34, UnwindCode::StackDealloc { size: 1000000 }), + ( + 36, + UnwindCode::RestoreRegister { + reg: RU::rbp.into() + } + ), + (36, UnwindCode::StackDealloc { size: 8 }), + (36, UnwindCode::RestoreFramePointer) + ]], + function_size: 37, + word_size: 8, + initial_sp_offset: 8, + } + ); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_multi_return_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + prologue_size: 5, + prologue_unwind_codes: vec![ + (2, UnwindCode::StackAlloc { size: 8 }), + ( + 2, + 
UnwindCode::SaveRegister { + reg: RU::rbp.into(), + stack_offset: 0, + } + ), + ( + 5, + UnwindCode::SetFramePointer { + reg: RU::rbp.into() + } + ) + ], + epilogues_unwind_codes: vec![ + vec![ + (12, UnwindCode::RememberState), + ( + 12, + UnwindCode::RestoreRegister { + reg: RU::rbp.into() + } + ), + (12, UnwindCode::StackDealloc { size: 8 }), + (12, UnwindCode::RestoreFramePointer), + (13, UnwindCode::RestoreState) + ], + vec![ + ( + 15, + UnwindCode::RestoreRegister { + reg: RU::rbp.into() + } + ), + (15, UnwindCode::StackDealloc { size: 8 }), + (15, UnwindCode::RestoreFramePointer) + ] + ], + function_size: 16, + word_size: 8, + initial_sp_offset: 8, + } + ); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = func.dfg.make_block(); + let block2 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().brnz(v0, block2, &[]); + pos.ins().jump(block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs new file mode 100644 index 0000000000..f6333f5afb --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/systemv.rs @@ -0,0 +1,234 @@ +//! Unwind information for System V ABI (x86-64). + +use crate::ir::Function; +use crate::isa::{ + unwind::systemv::{RegisterMappingError, UnwindInfo}, + CallConv, RegUnit, TargetIsa, +}; +use crate::result::CodegenResult; +use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64}; + +/// Creates a new x86-64 common information entry (CIE). +pub fn create_cie() -> CommonInformationEntry { + use gimli::write::CallFrameInstruction; + + let mut entry = CommonInformationEntry::new( + Encoding { + address_size: 8, + format: Format::Dwarf32, + version: 1, + }, + 1, // Code alignment factor + -8, // Data alignment factor + X86_64::RA, + ); + + // Every frame will start with the call frame address (CFA) at RSP+8 + // It is +8 to account for the push of the return address by the call instruction + entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8)); + + // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP) + entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8)); + + entry +} + +/// Map Cranelift registers to their corresponding Gimli registers. 
+pub fn map_reg(isa: &dyn TargetIsa, reg: RegUnit) -> Result<Register, RegisterMappingError> { + if isa.name() != "x86" || isa.pointer_bits() != 64 { + return Err(RegisterMappingError::UnsupportedArchitecture); + } + + // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow + const X86_GP_REG_MAP: [gimli::Register; 16] = [ + X86_64::RAX, + X86_64::RCX, + X86_64::RDX, + X86_64::RBX, + X86_64::RSP, + X86_64::RBP, + X86_64::RSI, + X86_64::RDI, + X86_64::R8, + X86_64::R9, + X86_64::R10, + X86_64::R11, + X86_64::R12, + X86_64::R13, + X86_64::R14, + X86_64::R15, + ]; + const X86_XMM_REG_MAP: [gimli::Register; 16] = [ + X86_64::XMM0, + X86_64::XMM1, + X86_64::XMM2, + X86_64::XMM3, + X86_64::XMM4, + X86_64::XMM5, + X86_64::XMM6, + X86_64::XMM7, + X86_64::XMM8, + X86_64::XMM9, + X86_64::XMM10, + X86_64::XMM11, + X86_64::XMM12, + X86_64::XMM13, + X86_64::XMM14, + X86_64::XMM15, + ]; + + let reg_info = isa.register_info(); + let bank = reg_info + .bank_containing_regunit(reg) + .ok_or_else(|| RegisterMappingError::MissingBank)?; + match bank.name { + "IntRegs" => { + // x86 GP registers have a weird mapping to DWARF registers, so we use a + // lookup table. + Ok(X86_GP_REG_MAP[(reg - bank.first_unit) as usize]) + } + "FloatRegs" => Ok(X86_XMM_REG_MAP[(reg - bank.first_unit) as usize]), + _ => Err(RegisterMappingError::UnsupportedRegisterBank(bank.name)), + } +} + +pub(crate) fn create_unwind_info( + func: &Function, + isa: &dyn TargetIsa, +) -> CodegenResult<Option<UnwindInfo>> { + // Only System V-like calling conventions are supported + match func.signature.call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => {} + _ => return Ok(None), + } + + if func.prologue_end.is_none() || isa.name() != "x86" || isa.pointer_bits() != 64 { + return Ok(None); + } + + let unwind = match super::create_unwind_info(func, isa)? 
{ + Some(u) => u, + None => { + return Ok(None); + } + }; + + struct RegisterMapper<'a, 'b>(&'a (dyn TargetIsa + 'b)); + impl<'a, 'b> crate::isa::unwind::systemv::RegisterMapper<RegUnit> for RegisterMapper<'a, 'b> { + fn map(&self, reg: RegUnit) -> Result<u16, RegisterMappingError> { + Ok(map_reg(self.0, reg)?.0) + } + fn sp(&self) -> u16 { + X86_64::RSP.0 + } + } + let map = RegisterMapper(isa); + + Ok(Some(UnwindInfo::build(unwind, &map)?)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ + types, AbiParam, ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use gimli::write::Address; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_simple_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match isa + .create_unwind_info(&context.func) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(1234)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 16, lsda: None, instructions: [(2, CfaOffset(16)), (2, Offset(Register(6), -16)), (5, CfaRegister(Register(6))), (15, SameValue(Register(6))), (15, Cfa(Register(7), 8))] }"); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_multi_return_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match isa + .create_unwind_info(&context.func) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(4321)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 16, lsda: None, instructions: [(2, CfaOffset(16)), (2, Offset(Register(6), -16)), (5, CfaRegister(Register(6))), (12, RememberState), (12, SameValue(Register(6))), (12, Cfa(Register(7), 8)), (13, RestoreState), (15, SameValue(Register(6))), (15, Cfa(Register(7), 8))] }"); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = 
func.dfg.make_block(); + let block2 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().brnz(v0, block2, &[]); + pos.ins().jump(block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs new file mode 100644 index 0000000000..ed046f9a87 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x86/unwind/winx64.rs @@ -0,0 +1,268 @@ +//! Unwind information for Windows x64 ABI. + +use crate::ir::Function; +use crate::isa::x86::registers::{FPR, GPR}; +use crate::isa::{unwind::winx64::UnwindInfo, CallConv, RegUnit, TargetIsa}; +use crate::result::CodegenResult; + +pub(crate) fn create_unwind_info( + func: &Function, + isa: &dyn TargetIsa, +) -> CodegenResult<Option<UnwindInfo>> { + // Only Windows fastcall is supported for unwind information + if func.signature.call_conv != CallConv::WindowsFastcall || func.prologue_end.is_none() { + return Ok(None); + } + + let unwind = match super::create_unwind_info(func, isa)? { + Some(u) => u, + None => { + return Ok(None); + } + }; + + Ok(Some(UnwindInfo::build::<RegisterMapper>(unwind)?)) +} + +struct RegisterMapper; + +impl crate::isa::unwind::winx64::RegisterMapper for RegisterMapper { + fn map(reg: RegUnit) -> crate::isa::unwind::winx64::MappedRegister { + use crate::isa::unwind::winx64::MappedRegister; + if GPR.contains(reg) { + MappedRegister::Int(GPR.index_of(reg) as u8) + } else if FPR.contains(reg) { + MappedRegister::Xmm(reg as u8) + } else { + panic!() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ExternalName, InstBuilder, Signature, StackSlotData, StackSlotKind}; + use crate::isa::unwind::winx64::UnwindCode; + use crate::isa::x86::registers::RU; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + fn test_wrong_calling_convention() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function(CallConv::SystemV, None)); + + context.compile(&*isa).expect("expected compilation"); + + assert_eq!( + create_unwind_info(&context.func, &*isa).expect("can create unwind info"), + None + ); + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_small_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + flags: 0, + prologue_size: 9, + frame_register: None, + frame_register_offset: 0, + unwind_codes: vec![ + UnwindCode::PushRegister { + offset: 2, + reg: GPR.index_of(RU::rbp.into()) as u8 + }, + UnwindCode::StackAlloc { + offset: 9, + size: 64 + } + ] + } + ); + + assert_eq!(unwind.emit_size(), 8); + + let mut buf = [0u8; 8]; + unwind.emit(&mut buf); + + assert_eq!( + buf, + [ + 0x01, // Version and flags (version 1, no 
flags) + 0x09, // Prologue size + 0x02, // Unwind code count (1 for stack alloc, 1 for push reg) + 0x00, // Frame register + offset (no frame register) + 0x09, // Prolog offset + 0x72, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0x7 * 8) + 8 = 64 bytes) + 0x02, // Prolog offset + 0x50, // Operation 0 (save nonvolatile register), reg = 5 (RBP) + ] + ); + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_medium_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 10000)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + flags: 0, + prologue_size: 27, + frame_register: None, + frame_register_offset: 0, + unwind_codes: vec![ + UnwindCode::PushRegister { + offset: 2, + reg: GPR.index_of(RU::rbp.into()) as u8 + }, + UnwindCode::StackAlloc { + offset: 27, + size: 10000 + } + ] + } + ); + + assert_eq!(unwind.emit_size(), 12); + + let mut buf = [0u8; 12]; + unwind.emit(&mut buf); + + assert_eq!( + buf, + [ + 0x01, // Version and flags (version 1, no flags) + 0x1B, // Prologue size + 0x03, // Unwind code count (2 for stack alloc, 1 for push reg) + 0x00, // Frame register + offset (no frame register) + 0x1B, // Prolog offset + 0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0) + 0xE2, // Low size byte + 0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes) + 0x02, // Prolog offset + 0x50, // Operation 0 (push nonvolatile register), reg = 5 (RBP) + 0x00, // Padding + 0x00, // Padding + ] + ); + } + + #[test] + #[cfg_attr(feature = "x64", should_panic)] // TODO #2079 + fn test_large_alloc() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::WindowsFastcall, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 1000000)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let unwind = create_unwind_info(&context.func, &*isa) + .expect("can create unwind info") + .expect("expected unwind info"); + + assert_eq!( + unwind, + UnwindInfo { + flags: 0, + prologue_size: 27, + frame_register: None, + frame_register_offset: 0, + unwind_codes: vec![ + UnwindCode::PushRegister { + offset: 2, + reg: GPR.index_of(RU::rbp.into()) as u8 + }, + UnwindCode::StackAlloc { + offset: 27, + size: 1000000 + } + ] + } + ); + + assert_eq!(unwind.emit_size(), 12); + + let mut buf = [0u8; 12]; + unwind.emit(&mut buf); + + assert_eq!( + buf, + [ + 0x01, // Version and flags (version 1, no flags) + 0x1B, // Prologue size + 0x04, // Unwind code count (3 for stack alloc, 1 for push reg) + 0x00, // Frame register + offset (no frame register) + 0x1B, // Prolog offset + 0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1) + 0x40, // Byte 1 of size + 0x42, // Byte 2 of size + 0x0F, // Byte 3 of size + 0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes) + 0x02, // Prolog offset + 0x50, // Operation 0 (push nonvolatile register), reg = 5 (RBP) + ] + ); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + 
Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } +} |
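
Note on the winx64 test expectations above: each Windows x64 UNWIND_CODE entry is a prologue-offset byte followed by an operation byte whose low nibble selects the operation and whose high nibble carries the operation info, so 0x72 is UWOP_ALLOC_SMALL with info 0x7 ((0x7 * 8) + 8 = 64 bytes) and 0x50 is UWOP_PUSH_NONVOL of register 5 (RBP). The sketch below is a hypothetical, stand-alone decoder for just the three operations those tests exercise; the function name and output strings are invented for illustration and are not Cranelift or Windows SDK API, and the caller is assumed to pass only the code bytes that follow the 4-byte UNWIND_INFO header, with any trailing alignment padding trimmed.

// Hypothetical decoder for the UNWIND_CODE byte pairs asserted in the winx64 tests
// above. Illustrative only: names and output format are made up here.
fn decode_unwind_codes(codes: &[u8]) -> Vec<String> {
    let mut out = Vec::new();
    let mut i = 0;
    while i + 1 < codes.len() {
        let prolog_offset = codes[i];
        let op = codes[i + 1] & 0x0f; // low nibble: operation code
        let info = codes[i + 1] >> 4; // high nibble: operation info
        match op {
            // UWOP_PUSH_NONVOL: `info` is the pushed integer register (5 = RBP).
            0 => {
                out.push(format!("offset {}: push reg {}", prolog_offset, info));
                i += 2;
            }
            // UWOP_ALLOC_LARGE, info = 0: the next 16-bit slot holds size / 8.
            1 if info == 0 => {
                let scaled = u16::from_le_bytes([codes[i + 2], codes[i + 3]]) as u32;
                out.push(format!("offset {}: alloc {} bytes", prolog_offset, scaled * 8));
                i += 4;
            }
            // UWOP_ALLOC_LARGE, info = 1: the next two slots hold an unscaled 32-bit size.
            1 => {
                let size =
                    u32::from_le_bytes([codes[i + 2], codes[i + 3], codes[i + 4], codes[i + 5]]);
                out.push(format!("offset {}: alloc {} bytes", prolog_offset, size));
                i += 6;
            }
            // UWOP_ALLOC_SMALL: size = info * 8 + 8, i.e. 8..=128 bytes.
            2 => {
                out.push(format!(
                    "offset {}: alloc {} bytes",
                    prolog_offset,
                    u32::from(info) * 8 + 8
                ));
                i += 2;
            }
            _ => unimplemented!("operation {} is not emitted by these tests", op),
        }
    }
    out
}

fn main() {
    // Code bytes from test_small_alloc: 0x72 = ALLOC_SMALL, info 0x7 -> (0x7 * 8) + 8 = 64.
    assert_eq!(
        decode_unwind_codes(&[0x09, 0x72, 0x02, 0x50]),
        ["offset 9: alloc 64 bytes", "offset 2: push reg 5"]
    );
    // Code bytes from test_medium_alloc: scaled 16-bit size 0x04E2 * 8 = 10000.
    assert_eq!(
        decode_unwind_codes(&[0x1b, 0x01, 0xe2, 0x04, 0x02, 0x50]),
        ["offset 27: alloc 10000 bytes", "offset 2: push reg 5"]
    );
    // Code bytes from test_large_alloc: unscaled 32-bit size 0x000F4240 = 1000000.
    assert_eq!(
        decode_unwind_codes(&[0x1b, 0x11, 0x40, 0x42, 0x0f, 0x00, 0x02, 0x50]),
        ["offset 27: alloc 1000000 bytes", "offset 2: push reg 5"]
    );
}

The codes decode in reverse prologue order (allocation first, then the earlier push), which is why the tests assert the stack-alloc entry ahead of the RBP push even though the push is emitted first in the prologue.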