author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen/src/isa/x64
parent | Initial commit.
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen/src/isa/x64')
11 files changed, 15701 insertions, 0 deletions
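
For orientation before the diff itself: the new `abi.rs` assigns integer arguments to rdi, rsi, rdx, rcx, r8 and r9, float/vector arguments to xmm0 through xmm7, and spills the remainder to 8-byte stack slots, rounding the outgoing-argument area up to 16 bytes. The standalone sketch below mirrors only that assignment order; the `ArgLoc` type and `assign_sysv_args` name are illustrative, not part of the cranelift-codegen patch, and the type/extension/Baldrdash handling of the real `compute_arg_locs` is omitted.

```rust
// Illustrative sketch only: mirrors the argument-assignment order used by
// compute_arg_locs in the abi.rs added below. ArgLoc and assign_sysv_args
// are hypothetical names, not the crate's API.
#[derive(Debug)]
enum ArgLoc {
    IntReg(&'static str),
    VecReg(&'static str),
    Stack(u64), // byte offset within the outgoing-argument area
}

fn assign_sysv_args(arg_is_int: &[bool]) -> (Vec<ArgLoc>, u64) {
    const INT_REGS: [&str; 6] = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"];
    const VEC_REGS: [&str; 8] = [
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    ];
    let (mut next_gpr, mut next_vec, mut next_stack) = (0usize, 0usize, 0u64);
    let mut locs = Vec::new();
    for &is_int in arg_is_int {
        locs.push(if is_int && next_gpr < INT_REGS.len() {
            next_gpr += 1;
            ArgLoc::IntReg(INT_REGS[next_gpr - 1])
        } else if !is_int && next_vec < VEC_REGS.len() {
            next_vec += 1;
            ArgLoc::VecReg(VEC_REGS[next_vec - 1])
        } else {
            // Every stack argument occupies at least an 8-byte slot.
            let off = next_stack;
            next_stack += 8;
            ArgLoc::Stack(off)
        });
    }
    // The outgoing-argument area is rounded up to 16 bytes, as in the patch.
    (locs, (next_stack + 15) & !15)
}

fn main() {
    // Seven integer args: six land in registers, the seventh at stack offset 0.
    let (locs, stack_bytes) = assign_sysv_args(&[true; 7]);
    println!("{:?} ({} stack bytes)", locs, stack_bytes);
}
```
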
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs new file mode 100644 index 0000000000..f4c7624f36 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/abi.rs @@ -0,0 +1,794 @@ +//! Implementation of the standard x64 ABI. + +use crate::ir::types::*; +use crate::ir::{self, types, MemFlags, TrapCode, Type}; +use crate::isa; +use crate::isa::{x64::inst::*, CallConv}; +use crate::machinst::abi_impl::*; +use crate::machinst::*; +use crate::settings; +use crate::{CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use args::*; +use regalloc::{RealReg, Reg, RegClass, Set, Writable}; +use smallvec::{smallvec, SmallVec}; +use std::convert::TryFrom; + +/// This is the limit for the size of argument and return-value areas on the +/// stack. We place a reasonable limit here to avoid integer overflow issues +/// with 32-bit arithmetic: for now, 128 MB. +static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024; + +/// Offset in stack-arg area to callee-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLEE_TLS_OFFSET: i64 = 0; +/// Offset in stack-arg area to caller-TLS slot in Baldrdash-2020 calling convention. +static BALDRDASH_CALLER_TLS_OFFSET: i64 = 8; + +/// Try to fill a Baldrdash register, returning it if it was found. +fn try_fill_baldrdash_reg(call_conv: CallConv, param: &ir::AbiParam) -> Option<ABIArg> { + if call_conv.extends_baldrdash() { + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext => { + // This is SpiderMonkey's `WasmTlsReg`. + Some(ABIArg::Reg( + regs::r14().to_real_reg(), + types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::SignatureId => { + // This is SpiderMonkey's `WasmTableCallSigReg`. + Some(ABIArg::Reg( + regs::r10().to_real_reg(), + types::I64, + param.extension, + param.purpose, + )) + } + &ir::ArgumentPurpose::CalleeTLS => { + // This is SpiderMonkey's callee TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLEE_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + &ir::ArgumentPurpose::CallerTLS => { + // This is SpiderMonkey's caller TLS slot in the extended frame of Wasm's ABI-2020. + assert!(call_conv == isa::CallConv::Baldrdash2020); + Some(ABIArg::Stack( + BALDRDASH_CALLER_TLS_OFFSET, + ir::types::I64, + ir::ArgumentExtension::None, + param.purpose, + )) + } + _ => None, + } + } else { + None + } +} + +/// Support for the x64 ABI from the callee side (within a function body). +pub(crate) type X64ABICallee = ABICalleeImpl<X64ABIMachineSpec>; + +/// Support for the x64 ABI from the caller side (at a callsite). +pub(crate) type X64ABICaller = ABICallerImpl<X64ABIMachineSpec>; + +/// Implementation of ABI primitives for x64. +pub(crate) struct X64ABIMachineSpec; + +impl ABIMachineSpec for X64ABIMachineSpec { + type I = Inst; + + fn word_bits() -> u32 { + 64 + } + + /// Return required stack alignment in bytes. 
+ fn stack_align(_call_conv: isa::CallConv) -> u32 { + 16 + } + + fn compute_arg_locs( + call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> { + let is_baldrdash = call_conv.extends_baldrdash(); + let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020; + + let mut next_gpr = 0; + let mut next_vreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + + if args_or_rets == ArgsOrRets::Args && has_baldrdash_tls { + // Baldrdash ABI-2020 always has two stack-arg slots reserved, for the callee and + // caller TLS-register values, respectively. + next_stack = 16; + } + + for i in 0..params.len() { + // Process returns backward, according to the SpiderMonkey ABI (which we + // adopt internally if `is_baldrdash` is set). + let param = match (args_or_rets, is_baldrdash) { + (ArgsOrRets::Args, _) => ¶ms[i], + (ArgsOrRets::Rets, false) => ¶ms[i], + (ArgsOrRets::Rets, true) => ¶ms[params.len() - 1 - i], + }; + + // Validate "purpose". + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::StackLimit + | &ir::ArgumentPurpose::SignatureId + | &ir::ArgumentPurpose::CalleeTLS + | &ir::ArgumentPurpose::CallerTLS => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + + let intreg = in_int_reg(param.value_type); + let vecreg = in_vec_reg(param.value_type); + debug_assert!(intreg || vecreg); + debug_assert!(!(intreg && vecreg)); + + let (next_reg, candidate) = if intreg { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr), + ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i), + }; + debug_assert!(candidate + .map(|r| r.get_class() == RegClass::I64) + .unwrap_or(true)); + (&mut next_gpr, candidate) + } else { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg), + ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i), + }; + debug_assert!(candidate + .map(|r| r.get_class() == RegClass::V128) + .unwrap_or(true)); + (&mut next_vreg, candidate) + }; + + if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { + assert!(intreg); + ret.push(param); + } else if let Some(reg) = candidate { + ret.push(ABIArg::Reg( + reg.to_real_reg(), + param.value_type, + param.extension, + param.purpose, + )); + *next_reg += 1; + } else { + // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte + // stack alignment happens separately after all args.) + let size = (param.value_type.bits() / 8) as u64; + let size = std::cmp::max(size, 8); + // Align. 
+ debug_assert!(size.is_power_of_two()); + next_stack = (next_stack + size - 1) & !(size - 1); + ret.push(ABIArg::Stack( + next_stack as i64, + param.value_type, + param.extension, + param.purpose, + )); + next_stack += size; + } + } + + if args_or_rets == ArgsOrRets::Rets && is_baldrdash { + ret.reverse(); + } + + let extra_arg = if add_ret_area_ptr { + debug_assert!(args_or_rets == ArgsOrRets::Args); + if let Some(reg) = get_intreg_for_arg_systemv(&call_conv, next_gpr) { + ret.push(ABIArg::Reg( + reg.to_real_reg(), + types::I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + } else { + ret.push(ABIArg::Stack( + next_stack as i64, + types::I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + next_stack += 8; + } + Some(ret.len() - 1) + } else { + None + }; + + next_stack = (next_stack + 15) & !15; + + // To avoid overflow issues, limit the arg/return size to something reasonable. + if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((ret, next_stack as i64, extra_arg)) + } + + fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64 { + if call_conv.extends_baldrdash() { + let num_words = flags.baldrdash_prologue_words() as i64; + debug_assert!(num_words > 0, "baldrdash must set baldrdash_prologue_words"); + num_words * 8 + } else { + 16 // frame pointer + return address. + } + } + + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I { + let ext_kind = match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 => ExtKind::SignExtend, + types::B64 | types::I64 | types::R64 | types::F32 | types::F64 => ExtKind::None, + _ if ty.bytes() == 16 => ExtKind::None, + _ => panic!("load_stack({})", ty), + }; + Inst::load(ty, mem, into_reg, ext_kind) + } + + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I { + Inst::store(ty, from_reg, mem) + } + + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I { + Inst::gen_move(to_reg, from_reg, ty) + } + + /// Generate an integer-extend operation. + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + is_signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Self::I { + let ext_mode = ExtMode::new(from_bits as u16, to_bits as u16) + .expect(&format!("invalid extension: {} -> {}", from_bits, to_bits)); + if is_signed { + Inst::movsx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg) + } else { + Inst::movzx_rm_r(ext_mode, RegMem::reg(from_reg), to_reg) + } + } + + fn gen_ret() -> Self::I { + Inst::ret() + } + + fn gen_epilogue_placeholder() -> Self::I { + Inst::epilogue_placeholder() + } + + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]> { + let mut ret = SmallVec::new(); + if from_reg != into_reg.to_reg() { + ret.push(Inst::gen_move(into_reg, from_reg, I64)); + } + ret.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(imm), + into_reg, + )); + ret + } + + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]> { + smallvec![ + Inst::cmp_rmi_r(/* bytes = */ 8, RegMemImm::reg(regs::rsp()), limit_reg), + Inst::TrapIf { + // NBE == "> unsigned"; args above are reversed; this tests limit_reg > rsp. 
+ cc: CC::NBE, + trap_code: TrapCode::StackOverflow, + }, + ] + } + + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Self::I { + let mem: SyntheticAmode = mem.into(); + Inst::lea(mem, into_reg) + } + + fn get_stacklimit_reg() -> Reg { + debug_assert!( + !is_callee_save_systemv(regs::r10().to_real_reg()) + && !is_callee_save_baldrdash(regs::r10().to_real_reg()) + ); + + // As per comment on trait definition, we must return a caller-save + // register here. + regs::r10() + } + + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I { + // Only ever used for I64s; if that changes, see if the ExtKind below needs to be changed. + assert_eq!(ty, I64); + let simm32 = offset as u32; + let mem = Amode::imm_reg(simm32, base); + Inst::load(ty, mem, into_reg, ExtKind::None) + } + + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I { + let simm32 = offset as u32; + let mem = Amode::imm_reg(simm32, base); + Inst::store(ty, from_reg, mem) + } + + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]> { + let (alu_op, amount) = if amount >= 0 { + (AluRmiROpcode::Add, amount) + } else { + (AluRmiROpcode::Sub, -amount) + }; + + let amount = amount as u32; + + smallvec![Inst::alu_rmi_r( + true, + alu_op, + RegMemImm::imm(amount), + Writable::from_reg(regs::rsp()), + )] + } + + fn gen_nominal_sp_adj(offset: i32) -> Self::I { + Inst::VirtualSPOffsetAdj { + offset: offset as i64, + } + } + + fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]> { + let r_rsp = regs::rsp(); + let r_rbp = regs::rbp(); + let w_rbp = Writable::from_reg(r_rbp); + let mut insts = SmallVec::new(); + // RSP before the call will be 0 % 16. So here, it is 8 % 16. + insts.push(Inst::push64(RegMemImm::reg(r_rbp))); + // RSP is now 0 % 16 + insts.push(Inst::mov_r_r(true, r_rsp, w_rbp)); + insts + } + + fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]> { + let mut insts = SmallVec::new(); + insts.push(Inst::mov_r_r( + true, + regs::rbp(), + Writable::from_reg(regs::rsp()), + )); + insts.push(Inst::pop64(Writable::from_reg(regs::rbp()))); + insts + } + + fn gen_clobber_save( + call_conv: isa::CallConv, + _: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> (u64, SmallVec<[Self::I; 16]>) { + let mut insts = SmallVec::new(); + // Find all clobbered registers that are callee-save. These are only I64 + // registers (all XMM registers are caller-save) so we can compute the + // total size of the needed stack space easily. + let clobbered = get_callee_saves(&call_conv, clobbers); + let clobbered_size = 8 * clobbered.len() as u32; + let stack_size = clobbered_size + fixed_frame_storage_size; + // Align to 16 bytes. + let stack_size = (stack_size + 15) & !15; + // Adjust the stack pointer downward with one `sub rsp, IMM` + // instruction. + if stack_size > 0 { + insts.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::imm(stack_size), + Writable::from_reg(regs::rsp()), + )); + } + // Store each clobbered register in order at offsets from RSP. + let mut cur_offset = 0; + for reg in &clobbered { + let r_reg = reg.to_reg(); + match r_reg.get_class() { + RegClass::I64 => { + insts.push(Inst::mov_r_m( + /* bytes = */ 8, + r_reg.to_reg(), + Amode::imm_reg(cur_offset, regs::rsp()), + )); + cur_offset += 8; + } + // No XMM regs are callee-save, so we do not need to implement + // this. 
+ _ => unimplemented!(), + } + } + + (clobbered_size as u64, insts) + } + + fn gen_clobber_restore( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + _fixed_frame_storage_size: u32, + _outgoing_args_size: u32, + ) -> SmallVec<[Self::I; 16]> { + let mut insts = SmallVec::new(); + + let clobbered = get_callee_saves(&call_conv, clobbers); + let stack_size = 8 * clobbered.len() as u32; + let stack_size = (stack_size + 15) & !15; + + // Restore regs by loading from offsets of RSP. + let mut cur_offset = 0; + for reg in &clobbered { + let rreg = reg.to_reg(); + match rreg.get_class() { + RegClass::I64 => { + insts.push(Inst::mov64_m_r( + Amode::imm_reg(cur_offset, regs::rsp()), + Writable::from_reg(rreg.to_reg()), + )); + cur_offset += 8; + } + _ => unimplemented!(), + } + } + // Adjust RSP back upward. + if stack_size > 0 { + insts.push(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(stack_size), + Writable::from_reg(regs::rsp()), + )); + } + + // If this is Baldrdash-2020, restore the callee (i.e., our) TLS + // register. We may have allocated it for something else and clobbered + // it, but the ABI expects us to leave the TLS register unchanged. + if call_conv == isa::CallConv::Baldrdash2020 { + let off = BALDRDASH_CALLEE_TLS_OFFSET + Self::fp_to_arg_offset(call_conv, flags); + insts.push(Inst::mov64_m_r( + Amode::imm_reg(off as u32, regs::rbp()), + Writable::from_reg(regs::r14()), + )); + } + + insts + } + + /// Generate a call instruction/sequence. + fn gen_call( + dest: &CallDest, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: ir::Opcode, + tmp: Writable<Reg>, + _callee_conv: isa::CallConv, + _caller_conv: isa::CallConv, + ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]> { + let mut insts = SmallVec::new(); + match dest { + &CallDest::ExtName(ref name, RelocDistance::Near) => { + insts.push(( + InstIsSafepoint::Yes, + Inst::call_known(name.clone(), uses, defs, opcode), + )); + } + &CallDest::ExtName(ref name, RelocDistance::Far) => { + insts.push(( + InstIsSafepoint::No, + Inst::LoadExtName { + dst: tmp, + name: Box::new(name.clone()), + offset: 0, + }, + )); + insts.push(( + InstIsSafepoint::Yes, + Inst::call_unknown(RegMem::reg(tmp.to_reg()), uses, defs, opcode), + )); + } + &CallDest::Reg(reg) => { + insts.push(( + InstIsSafepoint::Yes, + Inst::call_unknown(RegMem::reg(reg), uses, defs, opcode), + )); + } + } + insts + } + + fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 { + // We allocate in terms of 8-byte slots. + match (rc, ty) { + (RegClass::I64, _) => 1, + (RegClass::V128, types::F32) | (RegClass::V128, types::F64) => 1, + (RegClass::V128, _) => 2, + _ => panic!("Unexpected register class!"), + } + } + + fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64 { + s.virtual_sp_offset + } + + fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64 { + s.nominal_sp_to_fp + } + + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> { + let mut caller_saved = vec![ + // Systemv calling convention: + // - GPR: all except RBX, RBP, R12 to R15 (which are callee-saved). + Writable::from_reg(regs::rsi()), + Writable::from_reg(regs::rdi()), + Writable::from_reg(regs::rax()), + Writable::from_reg(regs::rcx()), + Writable::from_reg(regs::rdx()), + Writable::from_reg(regs::r8()), + Writable::from_reg(regs::r9()), + Writable::from_reg(regs::r10()), + Writable::from_reg(regs::r11()), + // - XMM: all the registers! 
+ Writable::from_reg(regs::xmm0()), + Writable::from_reg(regs::xmm1()), + Writable::from_reg(regs::xmm2()), + Writable::from_reg(regs::xmm3()), + Writable::from_reg(regs::xmm4()), + Writable::from_reg(regs::xmm5()), + Writable::from_reg(regs::xmm6()), + Writable::from_reg(regs::xmm7()), + Writable::from_reg(regs::xmm8()), + Writable::from_reg(regs::xmm9()), + Writable::from_reg(regs::xmm10()), + Writable::from_reg(regs::xmm11()), + Writable::from_reg(regs::xmm12()), + Writable::from_reg(regs::xmm13()), + Writable::from_reg(regs::xmm14()), + Writable::from_reg(regs::xmm15()), + ]; + + if call_conv_of_callee.extends_baldrdash() { + caller_saved.push(Writable::from_reg(regs::r12())); + caller_saved.push(Writable::from_reg(regs::r13())); + // Not r14; implicitly preserved in the entry. + caller_saved.push(Writable::from_reg(regs::r15())); + caller_saved.push(Writable::from_reg(regs::rbx())); + } + + caller_saved + } +} + +impl From<StackAMode> for SyntheticAmode { + fn from(amode: StackAMode) -> Self { + // We enforce a 128 MB stack-frame size limit above, so these + // `expect()`s should never fail. + match amode { + StackAMode::FPOffset(off, _ty) => { + let off = i32::try_from(off) + .expect("Offset in FPOffset is greater than 2GB; should hit impl limit first"); + let simm32 = off as u32; + SyntheticAmode::Real(Amode::ImmReg { + simm32, + base: regs::rbp(), + flags: MemFlags::trusted(), + }) + } + StackAMode::NominalSPOffset(off, _ty) => { + let off = i32::try_from(off).expect( + "Offset in NominalSPOffset is greater than 2GB; should hit impl limit first", + ); + let simm32 = off as u32; + SyntheticAmode::nominal_sp_offset(simm32) + } + StackAMode::SPOffset(off, _ty) => { + let off = i32::try_from(off) + .expect("Offset in SPOffset is greater than 2GB; should hit impl limit first"); + let simm32 = off as u32; + SyntheticAmode::Real(Amode::ImmReg { + simm32, + base: regs::rsp(), + flags: MemFlags::trusted(), + }) + } + } + } +} + +fn in_int_reg(ty: types::Type) -> bool { + match ty { + types::I8 + | types::I16 + | types::I32 + | types::I64 + | types::B1 + | types::B8 + | types::B16 + | types::B32 + | types::B64 + | types::R64 => true, + types::R32 => panic!("unexpected 32-bits refs on x64!"), + _ => false, + } +} + +fn in_vec_reg(ty: types::Type) -> bool { + match ty { + types::F32 | types::F64 => true, + _ if ty.is_vector() => true, + _ => false, + } +} + +fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> { + match call_conv { + CallConv::Fast + | CallConv::Cold + | CallConv::SystemV + | CallConv::BaldrdashSystemV + | CallConv::Baldrdash2020 => {} + _ => panic!("int args only supported for SysV calling convention"), + }; + match idx { + 0 => Some(regs::rdi()), + 1 => Some(regs::rsi()), + 2 => Some(regs::rdx()), + 3 => Some(regs::rcx()), + 4 => Some(regs::r8()), + 5 => Some(regs::r9()), + _ => None, + } +} + +fn get_fltreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> { + match call_conv { + CallConv::Fast + | CallConv::Cold + | CallConv::SystemV + | CallConv::BaldrdashSystemV + | CallConv::Baldrdash2020 => {} + _ => panic!("float args only supported for SysV calling convention"), + }; + match idx { + 0 => Some(regs::xmm0()), + 1 => Some(regs::xmm1()), + 2 => Some(regs::xmm2()), + 3 => Some(regs::xmm3()), + 4 => Some(regs::xmm4()), + 5 => Some(regs::xmm5()), + 6 => Some(regs::xmm6()), + 7 => Some(regs::xmm7()), + _ => None, + } +} + +fn get_intreg_for_retval_systemv( + call_conv: &CallConv, + intreg_idx: usize, + retval_idx: usize, +) -> 
Option<Reg> { + match call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx { + 0 => Some(regs::rax()), + 1 => Some(regs::rdx()), + _ => None, + }, + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => { + if intreg_idx == 0 && retval_idx == 0 { + Some(regs::rax()) + } else { + None + } + } + CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(), + } +} + +fn get_fltreg_for_retval_systemv( + call_conv: &CallConv, + fltreg_idx: usize, + retval_idx: usize, +) -> Option<Reg> { + match call_conv { + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx { + 0 => Some(regs::xmm0()), + 1 => Some(regs::xmm1()), + _ => None, + }, + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => { + if fltreg_idx == 0 && retval_idx == 0 { + Some(regs::xmm0()) + } else { + None + } + } + CallConv::WindowsFastcall | CallConv::BaldrdashWindows | CallConv::Probestack => todo!(), + } +} + +fn is_callee_save_systemv(r: RealReg) -> bool { + use regs::*; + match r.get_class() { + RegClass::I64 => match r.get_hw_encoding() as u8 { + ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true, + _ => false, + }, + RegClass::V128 => false, + _ => unimplemented!(), + } +} + +fn is_callee_save_baldrdash(r: RealReg) -> bool { + use regs::*; + match r.get_class() { + RegClass::I64 => { + if r.get_hw_encoding() as u8 == ENC_R14 { + // r14 is the WasmTlsReg and is preserved implicitly. + false + } else { + // Defer to native for the other ones. + is_callee_save_systemv(r) + } + } + RegClass::V128 => false, + _ => unimplemented!(), + } +} + +fn get_callee_saves(call_conv: &CallConv, regs: &Set<Writable<RealReg>>) -> Vec<Writable<RealReg>> { + let mut regs: Vec<Writable<RealReg>> = match call_conv { + CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => regs + .iter() + .cloned() + .filter(|r| is_callee_save_baldrdash(r.to_reg())) + .collect(), + CallConv::BaldrdashWindows => { + todo!("baldrdash windows"); + } + CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs + .iter() + .cloned() + .filter(|r| is_callee_save_systemv(r.to_reg())) + .collect(), + CallConv::WindowsFastcall => todo!("windows fastcall"), + CallConv::Probestack => todo!("probestack?"), + }; + // Sort registers for deterministic code output. We can do an unstable sort because the + // registers will be unique (there are no dups). + regs.sort_unstable_by_key(|r| r.to_reg().get_index()); + regs +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs new file mode 100644 index 0000000000..6a8f65feb3 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/args.rs @@ -0,0 +1,1215 @@ +//! Instruction operand sub-components (aka "parts"): definitions and printing. + +use super::regs::{self, show_ireg_sized}; +use super::EmitState; +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::MemFlags; +use crate::machinst::*; +use regalloc::{ + PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector, + RegUsageMapper, Writable, +}; +use std::fmt; +use std::string::String; + +/// A possible addressing mode (amode) that can be used in instructions. +/// These denote a 64-bit value only. +#[derive(Clone, Debug)] +pub enum Amode { + /// Immediate sign-extended and a Register. 
+ ImmReg { + simm32: u32, + base: Reg, + flags: MemFlags, + }, + + /// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift) + ImmRegRegShift { + simm32: u32, + base: Reg, + index: Reg, + shift: u8, /* 0 .. 3 only */ + flags: MemFlags, + }, + + /// sign-extend-32-to-64(Immediate) + RIP (instruction pointer). + /// To wit: not supported in 32-bits mode. + RipRelative { target: MachLabel }, +} + +impl Amode { + pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self { + debug_assert!(base.get_class() == RegClass::I64); + Self::ImmReg { + simm32, + base, + flags: MemFlags::trusted(), + } + } + + pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self { + debug_assert!(base.get_class() == RegClass::I64); + debug_assert!(index.get_class() == RegClass::I64); + debug_assert!(shift <= 3); + Self::ImmRegRegShift { + simm32, + base, + index, + shift, + flags: MemFlags::trusted(), + } + } + + pub(crate) fn rip_relative(target: MachLabel) -> Self { + Self::RipRelative { target } + } + + pub(crate) fn with_flags(&self, flags: MemFlags) -> Self { + match self { + &Self::ImmReg { simm32, base, .. } => Self::ImmReg { + simm32, + base, + flags, + }, + &Self::ImmRegRegShift { + simm32, + base, + index, + shift, + .. + } => Self::ImmRegRegShift { + simm32, + base, + index, + shift, + flags, + }, + _ => panic!("Amode {:?} cannot take memflags", self), + } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + Amode::ImmReg { base, .. } => { + collector.add_use(*base); + } + Amode::ImmRegRegShift { base, index, .. } => { + collector.add_use(*base); + collector.add_use(*index); + } + Amode::RipRelative { .. } => { + // RIP isn't involved in regalloc. + } + } + } + + pub(crate) fn get_flags(&self) -> MemFlags { + match self { + Amode::ImmReg { flags, .. } => *flags, + Amode::ImmRegRegShift { flags, .. } => *flags, + Amode::RipRelative { .. } => MemFlags::trusted(), + } + } + + pub(crate) fn can_trap(&self) -> bool { + !self.get_flags().notrap() + } +} + +impl PrettyPrint for Amode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + Amode::ImmReg { simm32, base, .. } => { + format!("{}({})", *simm32 as i32, base.show_rru(mb_rru)) + } + Amode::ImmRegRegShift { + simm32, + base, + index, + shift, + .. + } => format!( + "{}({},{},{})", + *simm32 as i32, + base.show_rru(mb_rru), + index.show_rru(mb_rru), + 1 << shift + ), + Amode::RipRelative { ref target } => format!("label{}(%rip)", target.get()), + } + } +} + +/// A Memory Address. These denote a 64-bit value only. +/// Used for usual addressing modes as well as addressing modes used during compilation, when the +/// moving SP offset is not known. +#[derive(Clone)] +pub enum SyntheticAmode { + /// A real amode. + Real(Amode), + + /// A (virtual) offset to the "nominal SP" value, which will be recomputed as we push and pop + /// within the function. + NominalSPOffset { simm32: u32 }, +} + +impl SyntheticAmode { + pub(crate) fn nominal_sp_offset(simm32: u32) -> Self { + SyntheticAmode::NominalSPOffset { simm32 } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + SyntheticAmode::Real(addr) => addr.get_regs_as_uses(collector), + SyntheticAmode::NominalSPOffset { .. } => { + // Nothing to do; the base is SP and isn't involved in regalloc. 
+ } + } + } + + pub(crate) fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + SyntheticAmode::Real(addr) => addr.map_uses(map), + SyntheticAmode::NominalSPOffset { .. } => { + // Nothing to do. + } + } + } + + pub(crate) fn finalize(&self, state: &mut EmitState) -> Amode { + match self { + SyntheticAmode::Real(addr) => addr.clone(), + SyntheticAmode::NominalSPOffset { simm32 } => { + let off = *simm32 as i64 + state.virtual_sp_offset; + // TODO will require a sequence of add etc. + assert!( + off <= u32::max_value() as i64, + "amode finalize: add sequence NYI" + ); + Amode::imm_reg(off as u32, regs::rsp()) + } + } + } +} + +impl Into<SyntheticAmode> for Amode { + fn into(self) -> SyntheticAmode { + SyntheticAmode::Real(self) + } +} + +impl PrettyPrint for SyntheticAmode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + SyntheticAmode::Real(addr) => addr.show_rru(mb_rru), + SyntheticAmode::NominalSPOffset { simm32 } => { + format!("rsp({} + virtual offset)", *simm32 as i32) + } + } + } +} + +/// An operand which is either an integer Register, a value in Memory or an Immediate. This can +/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only +/// the lower 8 or 16 bits of `simm32` is relevant. In the 64-bit case, the value denoted by +/// `simm32` is its sign-extension out to 64 bits. +#[derive(Clone)] +pub enum RegMemImm { + Reg { reg: Reg }, + Mem { addr: SyntheticAmode }, + Imm { simm32: u32 }, +} + +impl RegMemImm { + pub(crate) fn reg(reg: Reg) -> Self { + debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128); + Self::Reg { reg } + } + pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self { + Self::Mem { addr: addr.into() } + } + pub(crate) fn imm(simm32: u32) -> Self { + Self::Imm { simm32 } + } + + /// Asserts that in register mode, the reg class is the one that's expected. + pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) { + if let Self::Reg { reg } = self { + debug_assert_eq!(reg.get_class(), expected_reg_class); + } + } + + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + Self::Reg { reg } => collector.add_use(*reg), + Self::Mem { addr } => addr.get_regs_as_uses(collector), + Self::Imm { .. } => {} + } + } + + pub(crate) fn to_reg(&self) -> Option<Reg> { + match self { + Self::Reg { reg } => Some(*reg), + _ => None, + } + } +} + +impl PrettyPrint for RegMemImm { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.show_rru_sized(mb_rru, 8) + } +} + +impl PrettyPrintSized for RegMemImm { + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + match self { + Self::Reg { reg } => show_ireg_sized(*reg, mb_rru, size), + Self::Mem { addr } => addr.show_rru(mb_rru), + Self::Imm { simm32 } => format!("${}", *simm32 as i32), + } + } +} + +/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16, +/// 32, 64, or 128 bit value. 
+#[derive(Clone)] +pub enum RegMem { + Reg { reg: Reg }, + Mem { addr: SyntheticAmode }, +} + +impl RegMem { + pub(crate) fn reg(reg: Reg) -> Self { + debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128); + Self::Reg { reg } + } + pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self { + Self::Mem { addr: addr.into() } + } + /// Asserts that in register mode, the reg class is the one that's expected. + pub(crate) fn assert_regclass_is(&self, expected_reg_class: RegClass) { + if let Self::Reg { reg } = self { + debug_assert_eq!(reg.get_class(), expected_reg_class); + } + } + /// Add the regs mentioned by `self` to `collector`. + pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) { + match self { + RegMem::Reg { reg } => collector.add_use(*reg), + RegMem::Mem { addr, .. } => addr.get_regs_as_uses(collector), + } + } + pub(crate) fn to_reg(&self) -> Option<Reg> { + match self { + RegMem::Reg { reg } => Some(*reg), + _ => None, + } + } +} + +impl From<Writable<Reg>> for RegMem { + fn from(r: Writable<Reg>) -> Self { + RegMem::reg(r.to_reg()) + } +} + +impl PrettyPrint for RegMem { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.show_rru_sized(mb_rru, 8) + } +} + +impl PrettyPrintSized for RegMem { + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + match self { + RegMem::Reg { reg } => show_ireg_sized(*reg, mb_rru, size), + RegMem::Mem { addr, .. } => addr.show_rru(mb_rru), + } + } +} + +/// Some basic ALU operations. TODO: maybe add Adc, Sbb. +#[derive(Copy, Clone, PartialEq)] +pub enum AluRmiROpcode { + Add, + Sub, + And, + Or, + Xor, + /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. + Mul, +} + +impl fmt::Debug for AluRmiROpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + AluRmiROpcode::Add => "add", + AluRmiROpcode::Sub => "sub", + AluRmiROpcode::And => "and", + AluRmiROpcode::Or => "or", + AluRmiROpcode::Xor => "xor", + AluRmiROpcode::Mul => "imul", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for AluRmiROpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +#[derive(Clone, PartialEq)] +pub enum UnaryRmROpcode { + /// Bit-scan reverse. + Bsr, + /// Bit-scan forward. + Bsf, +} + +impl fmt::Debug for UnaryRmROpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + UnaryRmROpcode::Bsr => write!(fmt, "bsr"), + UnaryRmROpcode::Bsf => write!(fmt, "bsf"), + } + } +} + +impl fmt::Display for UnaryRmROpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +pub(crate) enum InstructionSet { + SSE, + SSE2, + SSSE3, + SSE41, + SSE42, +} + +/// Some SSE operations requiring 2 operands r/m and r. 
+#[derive(Clone, Copy, PartialEq)] +pub enum SseOpcode { + Addps, + Addpd, + Addss, + Addsd, + Andps, + Andpd, + Andnps, + Andnpd, + Comiss, + Comisd, + Cmpps, + Cmppd, + Cmpss, + Cmpsd, + Cvtdq2ps, + Cvtsd2ss, + Cvtsd2si, + Cvtsi2ss, + Cvtsi2sd, + Cvtss2si, + Cvtss2sd, + Cvttps2dq, + Cvttss2si, + Cvttsd2si, + Divps, + Divpd, + Divss, + Divsd, + Insertps, + Maxps, + Maxpd, + Maxss, + Maxsd, + Minps, + Minpd, + Minss, + Minsd, + Movaps, + Movapd, + Movd, + Movdqa, + Movdqu, + Movlhps, + Movmskps, + Movmskpd, + Movq, + Movss, + Movsd, + Movups, + Movupd, + Mulps, + Mulpd, + Mulss, + Mulsd, + Orps, + Orpd, + Pabsb, + Pabsw, + Pabsd, + Packsswb, + Paddb, + Paddd, + Paddq, + Paddw, + Paddsb, + Paddsw, + Paddusb, + Paddusw, + Pand, + Pandn, + Pavgb, + Pavgw, + Pcmpeqb, + Pcmpeqw, + Pcmpeqd, + Pcmpeqq, + Pcmpgtb, + Pcmpgtw, + Pcmpgtd, + Pcmpgtq, + Pextrb, + Pextrw, + Pextrd, + Pinsrb, + Pinsrw, + Pinsrd, + Pmaxsb, + Pmaxsw, + Pmaxsd, + Pmaxub, + Pmaxuw, + Pmaxud, + Pminsb, + Pminsw, + Pminsd, + Pminub, + Pminuw, + Pminud, + Pmovmskb, + Pmulld, + Pmullw, + Pmuludq, + Por, + Pshufb, + Pshufd, + Psllw, + Pslld, + Psllq, + Psraw, + Psrad, + Psrlw, + Psrld, + Psrlq, + Psubb, + Psubd, + Psubq, + Psubw, + Psubsb, + Psubsw, + Psubusb, + Psubusw, + Ptest, + Pxor, + Rcpss, + Roundss, + Roundsd, + Rsqrtss, + Sqrtps, + Sqrtpd, + Sqrtss, + Sqrtsd, + Subps, + Subpd, + Subss, + Subsd, + Ucomiss, + Ucomisd, + Xorps, + Xorpd, +} + +impl SseOpcode { + /// Which `InstructionSet` is the first supporting this opcode? + pub(crate) fn available_from(&self) -> InstructionSet { + use InstructionSet::*; + match self { + SseOpcode::Addps + | SseOpcode::Addss + | SseOpcode::Andps + | SseOpcode::Andnps + | SseOpcode::Comiss + | SseOpcode::Cmpps + | SseOpcode::Cmpss + | SseOpcode::Cvtsi2ss + | SseOpcode::Cvtss2si + | SseOpcode::Cvttss2si + | SseOpcode::Divps + | SseOpcode::Divss + | SseOpcode::Maxps + | SseOpcode::Maxss + | SseOpcode::Minps + | SseOpcode::Minss + | SseOpcode::Movaps + | SseOpcode::Movlhps + | SseOpcode::Movmskps + | SseOpcode::Movss + | SseOpcode::Movups + | SseOpcode::Mulps + | SseOpcode::Mulss + | SseOpcode::Orps + | SseOpcode::Rcpss + | SseOpcode::Rsqrtss + | SseOpcode::Sqrtps + | SseOpcode::Sqrtss + | SseOpcode::Subps + | SseOpcode::Subss + | SseOpcode::Ucomiss + | SseOpcode::Xorps => SSE, + + SseOpcode::Addpd + | SseOpcode::Addsd + | SseOpcode::Andpd + | SseOpcode::Andnpd + | SseOpcode::Cmppd + | SseOpcode::Cmpsd + | SseOpcode::Comisd + | SseOpcode::Cvtdq2ps + | SseOpcode::Cvtsd2ss + | SseOpcode::Cvtsd2si + | SseOpcode::Cvtsi2sd + | SseOpcode::Cvtss2sd + | SseOpcode::Cvttps2dq + | SseOpcode::Cvttsd2si + | SseOpcode::Divpd + | SseOpcode::Divsd + | SseOpcode::Maxpd + | SseOpcode::Maxsd + | SseOpcode::Minpd + | SseOpcode::Minsd + | SseOpcode::Movapd + | SseOpcode::Movd + | SseOpcode::Movmskpd + | SseOpcode::Movq + | SseOpcode::Movsd + | SseOpcode::Movupd + | SseOpcode::Movdqa + | SseOpcode::Movdqu + | SseOpcode::Mulpd + | SseOpcode::Mulsd + | SseOpcode::Orpd + | SseOpcode::Packsswb + | SseOpcode::Paddb + | SseOpcode::Paddd + | SseOpcode::Paddq + | SseOpcode::Paddw + | SseOpcode::Paddsb + | SseOpcode::Paddsw + | SseOpcode::Paddusb + | SseOpcode::Paddusw + | SseOpcode::Pand + | SseOpcode::Pandn + | SseOpcode::Pavgb + | SseOpcode::Pavgw + | SseOpcode::Pcmpeqb + | SseOpcode::Pcmpeqw + | SseOpcode::Pcmpeqd + | SseOpcode::Pcmpgtb + | SseOpcode::Pcmpgtw + | SseOpcode::Pcmpgtd + | SseOpcode::Pextrw + | SseOpcode::Pinsrw + | SseOpcode::Pmaxsw + | SseOpcode::Pmaxub + | SseOpcode::Pminsw + | SseOpcode::Pminub + | 
SseOpcode::Pmovmskb + | SseOpcode::Pmullw + | SseOpcode::Pmuludq + | SseOpcode::Por + | SseOpcode::Pshufd + | SseOpcode::Psllw + | SseOpcode::Pslld + | SseOpcode::Psllq + | SseOpcode::Psraw + | SseOpcode::Psrad + | SseOpcode::Psrlw + | SseOpcode::Psrld + | SseOpcode::Psrlq + | SseOpcode::Psubb + | SseOpcode::Psubd + | SseOpcode::Psubq + | SseOpcode::Psubw + | SseOpcode::Psubsb + | SseOpcode::Psubsw + | SseOpcode::Psubusb + | SseOpcode::Psubusw + | SseOpcode::Pxor + | SseOpcode::Sqrtpd + | SseOpcode::Sqrtsd + | SseOpcode::Subpd + | SseOpcode::Subsd + | SseOpcode::Ucomisd + | SseOpcode::Xorpd => SSE2, + + SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3, + + SseOpcode::Insertps + | SseOpcode::Pcmpeqq + | SseOpcode::Pextrb + | SseOpcode::Pextrd + | SseOpcode::Pinsrb + | SseOpcode::Pinsrd + | SseOpcode::Pmaxsb + | SseOpcode::Pmaxsd + | SseOpcode::Pmaxuw + | SseOpcode::Pmaxud + | SseOpcode::Pminsb + | SseOpcode::Pminsd + | SseOpcode::Pminuw + | SseOpcode::Pminud + | SseOpcode::Pmulld + | SseOpcode::Ptest + | SseOpcode::Roundss + | SseOpcode::Roundsd => SSE41, + + SseOpcode::Pcmpgtq => SSE42, + } + } + + /// Returns the src operand size for an instruction. + pub(crate) fn src_size(&self) -> u8 { + match self { + SseOpcode::Movd => 4, + _ => 8, + } + } +} + +impl fmt::Debug for SseOpcode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + SseOpcode::Addps => "addps", + SseOpcode::Addpd => "addpd", + SseOpcode::Addss => "addss", + SseOpcode::Addsd => "addsd", + SseOpcode::Andpd => "andpd", + SseOpcode::Andps => "andps", + SseOpcode::Andnps => "andnps", + SseOpcode::Andnpd => "andnpd", + SseOpcode::Cmpps => "cmpps", + SseOpcode::Cmppd => "cmppd", + SseOpcode::Cmpss => "cmpss", + SseOpcode::Cmpsd => "cmpsd", + SseOpcode::Comiss => "comiss", + SseOpcode::Comisd => "comisd", + SseOpcode::Cvtdq2ps => "cvtdq2ps", + SseOpcode::Cvtsd2ss => "cvtsd2ss", + SseOpcode::Cvtsd2si => "cvtsd2si", + SseOpcode::Cvtsi2ss => "cvtsi2ss", + SseOpcode::Cvtsi2sd => "cvtsi2sd", + SseOpcode::Cvtss2si => "cvtss2si", + SseOpcode::Cvtss2sd => "cvtss2sd", + SseOpcode::Cvttps2dq => "cvttps2dq", + SseOpcode::Cvttss2si => "cvttss2si", + SseOpcode::Cvttsd2si => "cvttsd2si", + SseOpcode::Divps => "divps", + SseOpcode::Divpd => "divpd", + SseOpcode::Divss => "divss", + SseOpcode::Divsd => "divsd", + SseOpcode::Insertps => "insertps", + SseOpcode::Maxps => "maxps", + SseOpcode::Maxpd => "maxpd", + SseOpcode::Maxss => "maxss", + SseOpcode::Maxsd => "maxsd", + SseOpcode::Minps => "minps", + SseOpcode::Minpd => "minpd", + SseOpcode::Minss => "minss", + SseOpcode::Minsd => "minsd", + SseOpcode::Movaps => "movaps", + SseOpcode::Movapd => "movapd", + SseOpcode::Movd => "movd", + SseOpcode::Movdqa => "movdqa", + SseOpcode::Movdqu => "movdqu", + SseOpcode::Movlhps => "movlhps", + SseOpcode::Movmskps => "movmskps", + SseOpcode::Movmskpd => "movmskpd", + SseOpcode::Movq => "movq", + SseOpcode::Movss => "movss", + SseOpcode::Movsd => "movsd", + SseOpcode::Movups => "movups", + SseOpcode::Movupd => "movupd", + SseOpcode::Mulps => "mulps", + SseOpcode::Mulpd => "mulpd", + SseOpcode::Mulss => "mulss", + SseOpcode::Mulsd => "mulsd", + SseOpcode::Orpd => "orpd", + SseOpcode::Orps => "orps", + SseOpcode::Pabsb => "pabsb", + SseOpcode::Pabsw => "pabsw", + SseOpcode::Pabsd => "pabsd", + SseOpcode::Packsswb => "packsswb", + SseOpcode::Paddb => "paddb", + SseOpcode::Paddd => "paddd", + SseOpcode::Paddq => "paddq", + SseOpcode::Paddw => "paddw", + SseOpcode::Paddsb => "paddsb", + 
SseOpcode::Paddsw => "paddsw", + SseOpcode::Paddusb => "paddusb", + SseOpcode::Paddusw => "paddusw", + SseOpcode::Pand => "pand", + SseOpcode::Pandn => "pandn", + SseOpcode::Pavgb => "pavgb", + SseOpcode::Pavgw => "pavgw", + SseOpcode::Pcmpeqb => "pcmpeqb", + SseOpcode::Pcmpeqw => "pcmpeqw", + SseOpcode::Pcmpeqd => "pcmpeqd", + SseOpcode::Pcmpeqq => "pcmpeqq", + SseOpcode::Pcmpgtb => "pcmpgtb", + SseOpcode::Pcmpgtw => "pcmpgtw", + SseOpcode::Pcmpgtd => "pcmpgtd", + SseOpcode::Pcmpgtq => "pcmpgtq", + SseOpcode::Pextrb => "pextrb", + SseOpcode::Pextrw => "pextrw", + SseOpcode::Pextrd => "pextrd", + SseOpcode::Pinsrb => "pinsrb", + SseOpcode::Pinsrw => "pinsrw", + SseOpcode::Pinsrd => "pinsrd", + SseOpcode::Pmaxsb => "pmaxsb", + SseOpcode::Pmaxsw => "pmaxsw", + SseOpcode::Pmaxsd => "pmaxsd", + SseOpcode::Pmaxub => "pmaxub", + SseOpcode::Pmaxuw => "pmaxuw", + SseOpcode::Pmaxud => "pmaxud", + SseOpcode::Pminsb => "pminsb", + SseOpcode::Pminsw => "pminsw", + SseOpcode::Pminsd => "pminsd", + SseOpcode::Pminub => "pminub", + SseOpcode::Pminuw => "pminuw", + SseOpcode::Pminud => "pminud", + SseOpcode::Pmovmskb => "pmovmskb", + SseOpcode::Pmulld => "pmulld", + SseOpcode::Pmullw => "pmullw", + SseOpcode::Pmuludq => "pmuludq", + SseOpcode::Por => "por", + SseOpcode::Pshufb => "pshufb", + SseOpcode::Pshufd => "pshufd", + SseOpcode::Psllw => "psllw", + SseOpcode::Pslld => "pslld", + SseOpcode::Psllq => "psllq", + SseOpcode::Psraw => "psraw", + SseOpcode::Psrad => "psrad", + SseOpcode::Psrlw => "psrlw", + SseOpcode::Psrld => "psrld", + SseOpcode::Psrlq => "psrlq", + SseOpcode::Psubb => "psubb", + SseOpcode::Psubd => "psubd", + SseOpcode::Psubq => "psubq", + SseOpcode::Psubw => "psubw", + SseOpcode::Psubsb => "psubsb", + SseOpcode::Psubsw => "psubsw", + SseOpcode::Psubusb => "psubusb", + SseOpcode::Psubusw => "psubusw", + SseOpcode::Ptest => "ptest", + SseOpcode::Pxor => "pxor", + SseOpcode::Rcpss => "rcpss", + SseOpcode::Roundss => "roundss", + SseOpcode::Roundsd => "roundsd", + SseOpcode::Rsqrtss => "rsqrtss", + SseOpcode::Sqrtps => "sqrtps", + SseOpcode::Sqrtpd => "sqrtpd", + SseOpcode::Sqrtss => "sqrtss", + SseOpcode::Sqrtsd => "sqrtsd", + SseOpcode::Subps => "subps", + SseOpcode::Subpd => "subpd", + SseOpcode::Subss => "subss", + SseOpcode::Subsd => "subsd", + SseOpcode::Ucomiss => "ucomiss", + SseOpcode::Ucomisd => "ucomisd", + SseOpcode::Xorps => "xorps", + SseOpcode::Xorpd => "xorpd", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for SseOpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// This defines the ways a value can be extended: either signed- or zero-extension, or none for +/// types that are not extended. Contrast with [ExtMode], which defines the widths from and to which +/// values can be extended. +#[derive(Clone, PartialEq)] +pub enum ExtKind { + None, + SignExtend, + ZeroExtend, +} + +/// These indicate ways of extending (widening) a value, using the Intel +/// naming: B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64 +#[derive(Clone, PartialEq)] +pub enum ExtMode { + /// Byte -> Longword. + BL, + /// Byte -> Quadword. + BQ, + /// Word -> Longword. + WL, + /// Word -> Quadword. + WQ, + /// Longword -> Quadword. + LQ, +} + +impl ExtMode { + /// Calculate the `ExtMode` from passed bit lengths of the from/to types. 
+ pub(crate) fn new(from_bits: u16, to_bits: u16) -> Option<ExtMode> { + match (from_bits, to_bits) { + (1, 8) | (1, 16) | (1, 32) | (8, 16) | (8, 32) => Some(ExtMode::BL), + (1, 64) | (8, 64) => Some(ExtMode::BQ), + (16, 32) => Some(ExtMode::WL), + (16, 64) => Some(ExtMode::WQ), + (32, 64) => Some(ExtMode::LQ), + _ => None, + } + } + + /// Return the source register size in bytes. + pub(crate) fn src_size(&self) -> u8 { + match self { + ExtMode::BL | ExtMode::BQ => 1, + ExtMode::WL | ExtMode::WQ => 2, + ExtMode::LQ => 4, + } + } + + /// Return the destination register size in bytes. + pub(crate) fn dst_size(&self) -> u8 { + match self { + ExtMode::BL | ExtMode::WL => 4, + ExtMode::BQ | ExtMode::WQ | ExtMode::LQ => 8, + } + } +} + +impl fmt::Debug for ExtMode { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + ExtMode::BL => "bl", + ExtMode::BQ => "bq", + ExtMode::WL => "wl", + ExtMode::WQ => "wq", + ExtMode::LQ => "lq", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for ExtMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right. +#[derive(Clone)] +pub enum ShiftKind { + ShiftLeft, + /// Inserts zeros in the most significant bits. + ShiftRightLogical, + /// Replicates the sign bit in the most significant bits. + ShiftRightArithmetic, + RotateLeft, + RotateRight, +} + +impl fmt::Debug for ShiftKind { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + ShiftKind::ShiftLeft => "shl", + ShiftKind::ShiftRightLogical => "shr", + ShiftKind::ShiftRightArithmetic => "sar", + ShiftKind::RotateLeft => "rol", + ShiftKind::RotateRight => "ror", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for ShiftKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// What kind of division or remainer instruction this is? +#[derive(Clone)] +pub enum DivOrRemKind { + SignedDiv, + UnsignedDiv, + SignedRem, + UnsignedRem, +} + +impl DivOrRemKind { + pub(crate) fn is_signed(&self) -> bool { + match self { + DivOrRemKind::SignedDiv | DivOrRemKind::SignedRem => true, + _ => false, + } + } + + pub(crate) fn is_div(&self) -> bool { + match self { + DivOrRemKind::SignedDiv | DivOrRemKind::UnsignedDiv => true, + _ => false, + } + } +} + +/// These indicate condition code tests. Not all are represented since not all are useful in +/// compiler-generated code. 
+#[derive(Copy, Clone)] +#[repr(u8)] +pub enum CC { + /// overflow + O = 0, + /// no overflow + NO = 1, + + /// < unsigned + B = 2, + /// >= unsigned + NB = 3, + + /// zero + Z = 4, + /// not-zero + NZ = 5, + + /// <= unsigned + BE = 6, + /// > unsigned + NBE = 7, + + /// negative + S = 8, + /// not-negative + NS = 9, + + /// < signed + L = 12, + /// >= signed + NL = 13, + + /// <= signed + LE = 14, + /// > signed + NLE = 15, + + /// parity + P = 10, + + /// not parity + NP = 11, +} + +impl CC { + pub(crate) fn from_intcc(intcc: IntCC) -> Self { + match intcc { + IntCC::Equal => CC::Z, + IntCC::NotEqual => CC::NZ, + IntCC::SignedGreaterThanOrEqual => CC::NL, + IntCC::SignedGreaterThan => CC::NLE, + IntCC::SignedLessThanOrEqual => CC::LE, + IntCC::SignedLessThan => CC::L, + IntCC::UnsignedGreaterThanOrEqual => CC::NB, + IntCC::UnsignedGreaterThan => CC::NBE, + IntCC::UnsignedLessThanOrEqual => CC::BE, + IntCC::UnsignedLessThan => CC::B, + IntCC::Overflow => CC::O, + IntCC::NotOverflow => CC::NO, + } + } + + pub(crate) fn invert(&self) -> Self { + match self { + CC::O => CC::NO, + CC::NO => CC::O, + + CC::B => CC::NB, + CC::NB => CC::B, + + CC::Z => CC::NZ, + CC::NZ => CC::Z, + + CC::BE => CC::NBE, + CC::NBE => CC::BE, + + CC::S => CC::NS, + CC::NS => CC::S, + + CC::L => CC::NL, + CC::NL => CC::L, + + CC::LE => CC::NLE, + CC::NLE => CC::LE, + + CC::P => CC::NP, + CC::NP => CC::P, + } + } + + pub(crate) fn from_floatcc(floatcc: FloatCC) -> Self { + match floatcc { + FloatCC::Ordered => CC::NP, + FloatCC::Unordered => CC::P, + // Alias for NE + FloatCC::OrderedNotEqual => CC::NZ, + // Alias for E + FloatCC::UnorderedOrEqual => CC::Z, + // Alias for A + FloatCC::GreaterThan => CC::NBE, + // Alias for AE + FloatCC::GreaterThanOrEqual => CC::NB, + FloatCC::UnorderedOrLessThan => CC::B, + FloatCC::UnorderedOrLessThanOrEqual => CC::BE, + FloatCC::Equal + | FloatCC::NotEqual + | FloatCC::LessThan + | FloatCC::LessThanOrEqual + | FloatCC::UnorderedOrGreaterThan + | FloatCC::UnorderedOrGreaterThanOrEqual => panic!( + "{:?} can't be lowered to a CC code; treat as special case.", + floatcc + ), + } + } + + pub(crate) fn get_enc(self) -> u8 { + self as u8 + } +} + +impl fmt::Debug for CC { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let name = match self { + CC::O => "o", + CC::NO => "no", + CC::B => "b", + CC::NB => "nb", + CC::Z => "z", + CC::NZ => "nz", + CC::BE => "be", + CC::NBE => "nbe", + CC::S => "s", + CC::NS => "ns", + CC::L => "l", + CC::NL => "nl", + CC::LE => "le", + CC::NLE => "nle", + CC::P => "p", + CC::NP => "np", + }; + write!(fmt, "{}", name) + } +} + +impl fmt::Display for CC { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(self, f) + } +} + +/// Encode the ways that floats can be compared. This is used in float comparisons such as `cmpps`, +/// e.g.; it is distinguished from other float comparisons (e.g. `ucomiss`) in that those use EFLAGS +/// whereas [FcmpImm] is used as an immediate. 
+pub(crate) enum FcmpImm { + Equal = 0x00, + LessThan = 0x01, + LessThanOrEqual = 0x02, + Unordered = 0x03, + NotEqual = 0x04, + UnorderedOrGreaterThanOrEqual = 0x05, + UnorderedOrGreaterThan = 0x06, + Ordered = 0x07, +} + +impl FcmpImm { + pub(crate) fn encode(self) -> u8 { + self as u8 + } +} + +impl From<FloatCC> for FcmpImm { + fn from(cond: FloatCC) -> Self { + match cond { + FloatCC::Equal => FcmpImm::Equal, + FloatCC::LessThan => FcmpImm::LessThan, + FloatCC::LessThanOrEqual => FcmpImm::LessThanOrEqual, + FloatCC::Unordered => FcmpImm::Unordered, + FloatCC::NotEqual => FcmpImm::NotEqual, + FloatCC::UnorderedOrGreaterThanOrEqual => FcmpImm::UnorderedOrGreaterThanOrEqual, + FloatCC::UnorderedOrGreaterThan => FcmpImm::UnorderedOrGreaterThan, + FloatCC::Ordered => FcmpImm::Ordered, + _ => panic!("unable to create comparison predicate for {}", cond), + } + } +} + +/// An operand's size in bits. +#[derive(Clone, Copy, PartialEq)] +pub enum OperandSize { + Size32, + Size64, +} + +impl OperandSize { + pub(crate) fn from_bytes(num_bytes: u32) -> Self { + match num_bytes { + 1 | 2 | 4 => OperandSize::Size32, + 8 => OperandSize::Size64, + _ => unreachable!(), + } + } + + pub(crate) fn to_bytes(&self) -> u8 { + match self { + Self::Size32 => 4, + Self::Size64 => 8, + } + } + + pub(crate) fn to_bits(&self) -> u8 { + match self { + Self::Size32 => 32, + Self::Size64 => 64, + } + } +} + +/// An x64 memory fence kind. +#[derive(Clone)] +#[allow(dead_code)] +pub enum FenceKind { + /// `mfence` instruction ("Memory Fence") + MFence, + /// `lfence` instruction ("Load Fence") + LFence, + /// `sfence` instruction ("Store Fence") + SFence, +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs new file mode 100644 index 0000000000..dd4125a2da --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit.rs @@ -0,0 +1,2819 @@ +use crate::binemit::{Addend, Reloc}; +use crate::ir::immediates::{Ieee32, Ieee64}; +use crate::ir::TrapCode; +use crate::isa::x64::inst::args::*; +use crate::isa::x64::inst::*; +use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel}; +use core::convert::TryInto; +use log::debug; +use regalloc::{Reg, RegClass, Writable}; + +fn low8_will_sign_extend_to_64(x: u32) -> bool { + let xs = (x as i32) as i64; + xs == ((xs << 56) >> 56) +} + +fn low8_will_sign_extend_to_32(x: u32) -> bool { + let xs = x as i32; + xs == ((xs << 24) >> 24) +} + +//============================================================================= +// Instructions and subcomponents: emission + +// For all of the routines that take both a memory-or-reg operand (sometimes +// called "E" in the Intel documentation) and a reg-only operand ("G" in +// Intelese), the order is always G first, then E. +// +// "enc" in the following means "hardware register encoding number". + +#[inline(always)] +fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 { + debug_assert!(m0d < 4); + debug_assert!(enc_reg_g < 8); + debug_assert!(rm_e < 8); + ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7) +} + +#[inline(always)] +fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 { + debug_assert!(shift < 4); + debug_assert!(enc_index < 8); + debug_assert!(enc_base < 8); + ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7) +} + +/// Get the encoding number of a GPR. 
+#[inline(always)] +fn int_reg_enc(reg: Reg) -> u8 { + debug_assert!(reg.is_real()); + debug_assert_eq!(reg.get_class(), RegClass::I64); + reg.get_hw_encoding() +} + +/// Get the encoding number of any register. +#[inline(always)] +fn reg_enc(reg: Reg) -> u8 { + debug_assert!(reg.is_real()); + reg.get_hw_encoding() +} + +/// A small bit field to record a REX prefix specification: +/// - bit 0 set to 1 indicates REX.W must be 0 (cleared). +/// - bit 1 set to 1 indicates the REX prefix must always be emitted. +#[repr(transparent)] +#[derive(Clone, Copy)] +struct RexFlags(u8); + +impl RexFlags { + /// By default, set the W field, and don't always emit. + #[inline(always)] + fn set_w() -> Self { + Self(0) + } + /// Creates a new RexPrefix for which the REX.W bit will be cleared. + #[inline(always)] + fn clear_w() -> Self { + Self(1) + } + + #[inline(always)] + fn always_emit(&mut self) -> &mut Self { + self.0 = self.0 | 2; + self + } + + #[inline(always)] + fn must_clear_w(&self) -> bool { + (self.0 & 1) != 0 + } + #[inline(always)] + fn must_always_emit(&self) -> bool { + (self.0 & 2) != 0 + } + + #[inline(always)] + fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) { + let w = if self.must_clear_w() { 0 } else { 1 }; + let r = (enc_g >> 3) & 1; + let x = 0; + let b = (enc_e >> 3) & 1; + let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b; + if rex != 0x40 || self.must_always_emit() { + sink.put1(rex); + } + } + + #[inline(always)] + fn emit_three_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_index: u8, enc_base: u8) { + let w = if self.must_clear_w() { 0 } else { 1 }; + let r = (enc_g >> 3) & 1; + let x = (enc_index >> 3) & 1; + let b = (enc_base >> 3) & 1; + let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b; + if rex != 0x40 || self.must_always_emit() { + sink.put1(rex); + } + } +} + +/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum +/// covers only the small set of possibilities that we actually need. +enum LegacyPrefixes { + /// No prefix bytes + None, + /// Operand Size Override -- here, denoting "16-bit operation" + _66, + /// The Lock prefix + _F0, + /// Operand size override and Lock + _66F0, + /// REPNE, but no specific meaning here -- is just an opcode extension + _F2, + /// REP/REPE, but no specific meaning here -- is just an opcode extension + _F3, +} + +impl LegacyPrefixes { + #[inline(always)] + fn emit(&self, sink: &mut MachBuffer<Inst>) { + match self { + LegacyPrefixes::_66 => sink.put1(0x66), + LegacyPrefixes::_F0 => sink.put1(0xF0), + LegacyPrefixes::_66F0 => { + // I don't think the order matters, but in any case, this is the same order that + // the GNU assembler uses. + sink.put1(0x66); + sink.put1(0xF0); + } + LegacyPrefixes::_F2 => sink.put1(0xF2), + LegacyPrefixes::_F3 => sink.put1(0xF3), + LegacyPrefixes::None => (), + } + } +} + +/// This is the core 'emit' function for instructions that reference memory. +/// +/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`, +/// create and emit: +/// - first the legacy prefixes, if any +/// - then the REX prefix, if needed +/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`), +/// - then the MOD/RM byte, +/// - then optionally, a SIB byte, +/// - and finally optionally an immediate that will be derived from the `mem_e` operand. 
+/// +/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is +/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed +/// instructions will require their own emitter functions. +/// +/// This will also work for 32-bits x86 instructions, assuming no REX prefix is provided. +/// +/// The opcodes are written bigendianly for the convenience of callers. For example, if the opcode +/// bytes to be emitted are, in this order, F3 0F 27, then the caller should pass `opcodes` == +/// 0xF3_0F_27 and `num_opcodes` == 3. +/// +/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`. +/// `rex` can specify special handling for the REX prefix. By default, the REX prefix will +/// indicate a 64-bit operation and will be deleted if it is redundant (0x40). Note that for a +/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to +/// indicate a 64-bit operation. +fn emit_std_enc_mem( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + prefixes: LegacyPrefixes, + opcodes: u32, + mut num_opcodes: usize, + enc_g: u8, + mem_e: &Amode, + rex: RexFlags, +) { + // General comment for this function: the registers in `mem_e` must be + // 64-bit integer registers, because they are part of an address + // expression. But `enc_g` can be derived from a register of any class. + + let srcloc = state.cur_srcloc(); + if srcloc != SourceLoc::default() && mem_e.can_trap() { + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + prefixes.emit(sink); + + match mem_e { + Amode::ImmReg { simm32, base, .. } => { + // First, the REX byte. + let enc_e = int_reg_enc(*base); + rex.emit_two_op(sink, enc_g, enc_e); + + // Now the opcode(s). These include any other prefixes the caller + // hands to us. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // Now the mod/rm and associated immediates. This is + // significantly complicated due to the multiple special cases. + if *simm32 == 0 + && enc_e != regs::ENC_RSP + && enc_e != regs::ENC_RBP + && enc_e != regs::ENC_R12 + && enc_e != regs::ENC_R13 + { + // FIXME JRS 2020Feb11: those four tests can surely be + // replaced by a single mask-and-compare check. We should do + // that because this routine is likely to be hot. + sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7)); + } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) { + sink.put1(encode_modrm(0, enc_g & 7, 4)); + sink.put1(0x24); + } else if low8_will_sign_extend_to_32(*simm32) + && enc_e != regs::ENC_RSP + && enc_e != regs::ENC_R12 + { + sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7)); + sink.put1((simm32 & 0xFF) as u8); + } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 { + sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7)); + sink.put4(*simm32); + } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) + && low8_will_sign_extend_to_32(*simm32) + { + // REX.B distinguishes RSP from R12 + sink.put1(encode_modrm(1, enc_g & 7, 4)); + sink.put1(0x24); + sink.put1((simm32 & 0xFF) as u8); + } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP { + //.. wait for test case for RSP case + // REX.B distinguishes RSP from R12 + sink.put1(encode_modrm(2, enc_g & 7, 4)); + sink.put1(0x24); + sink.put4(*simm32); + } else { + unreachable!("ImmReg"); + } + } + + Amode::ImmRegRegShift { + simm32, + base: reg_base, + index: reg_index, + shift, + .. 
+ } => { + let enc_base = int_reg_enc(*reg_base); + let enc_index = int_reg_enc(*reg_index); + + // The rex byte. + rex.emit_three_op(sink, enc_g, enc_index, enc_base); + + // All other prefixes and opcodes. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // modrm, SIB, immediates. + if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP { + sink.put1(encode_modrm(1, enc_g & 7, 4)); + sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7)); + sink.put1(*simm32 as u8); + } else if enc_index != regs::ENC_RSP { + sink.put1(encode_modrm(2, enc_g & 7, 4)); + sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7)); + sink.put4(*simm32); + } else { + panic!("ImmRegRegShift"); + } + } + + Amode::RipRelative { ref target } => { + // First, the REX byte, with REX.B = 0. + rex.emit_two_op(sink, enc_g, 0); + + // Now the opcode(s). These include any other prefixes the caller + // hands to us. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // RIP-relative is mod=00, rm=101. + sink.put1(encode_modrm(0, enc_g & 7, 0b101)); + + let offset = sink.cur_offset(); + sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32); + sink.put4(0); + } + } +} + +/// This is the core 'emit' function for instructions that do not reference memory. +/// +/// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E +/// operand is a register rather than memory. Hence it is much simpler. +fn emit_std_enc_enc( + sink: &mut MachBuffer<Inst>, + prefixes: LegacyPrefixes, + opcodes: u32, + mut num_opcodes: usize, + enc_g: u8, + enc_e: u8, + rex: RexFlags, +) { + // EncG and EncE can be derived from registers of any class, and they + // don't even have to be from the same class. For example, for an + // integer-to-FP conversion insn, one might be RegClass::I64 and the other + // RegClass::V128. + + // The legacy prefixes. + prefixes.emit(sink); + + // The rex byte. + rex.emit_two_op(sink, enc_g, enc_e); + + // All other prefixes and opcodes. + while num_opcodes > 0 { + num_opcodes -= 1; + sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8); + } + + // Now the mod/rm byte. The instruction we're generating doesn't access + // memory, so there is no SIB byte or immediate -- we're done. + sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7)); +} + +// These are merely wrappers for the above two functions that facilitate passing +// actual `Reg`s rather than their encodings. + +fn emit_std_reg_mem( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + prefixes: LegacyPrefixes, + opcodes: u32, + num_opcodes: usize, + reg_g: Reg, + mem_e: &Amode, + rex: RexFlags, +) { + let enc_g = reg_enc(reg_g); + emit_std_enc_mem( + sink, + state, + prefixes, + opcodes, + num_opcodes, + enc_g, + mem_e, + rex, + ); +} + +fn emit_std_reg_reg( + sink: &mut MachBuffer<Inst>, + prefixes: LegacyPrefixes, + opcodes: u32, + num_opcodes: usize, + reg_g: Reg, + reg_e: Reg, + rex: RexFlags, +) { + let enc_g = reg_enc(reg_g); + let enc_e = reg_enc(reg_e); + emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex); +} + +/// Write a suitable number of bits from an imm64 to the sink. +fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) { + match size { + 8 | 4 => sink.put4(simm32), + 2 => sink.put2(simm32 as u16), + 1 => sink.put1(simm32 as u8), + _ => unreachable!(), + } +} + +/// A small helper to generate a signed conversion instruction. 
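+///
+/// Concretely, this selects `cvtsi2ss` or `cvtsi2sd` with a 64-bit GPR source; for
+/// instance `cvtsi2sd %rax, %xmm0` is emitted as F2 48 0F 2A C0 (F2 prefix, REX.W,
+/// opcode 0F 2A, ModRM 0xC0).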
+fn emit_signed_cvt( + sink: &mut MachBuffer<Inst>, + info: &EmitInfo, + state: &mut EmitState, + src: Reg, + dst: Writable<Reg>, + to_f64: bool, +) { + // Handle an unsigned int, which is the "easy" case: a signed conversion will do the + // right thing. + let op = if to_f64 { + SseOpcode::Cvtsi2sd + } else { + SseOpcode::Cvtsi2ss + }; + let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst); + inst.emit(sink, info, state); +} + +/// Emits a one way conditional jump if CC is set (true). +fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) { + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32); + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + sink.put4(0x0); +} + +/// Emits a relocation, attaching the current source location as well. +fn emit_reloc( + sink: &mut MachBuffer<Inst>, + state: &EmitState, + kind: Reloc, + name: &ExternalName, + addend: Addend, +) { + let srcloc = state.cur_srcloc(); + sink.add_reloc(srcloc, kind, name, addend); +} + +/// The top-level emit function. +/// +/// Important! Do not add improved (shortened) encoding cases to existing +/// instructions without also adding tests for those improved encodings. That +/// is a dangerous game that leads to hard-to-track-down errors in the emitted +/// code. +/// +/// For all instructions, make sure to have test coverage for all of the +/// following situations. Do this by creating the cross product resulting from +/// applying the following rules to each operand: +/// +/// (1) for any insn that mentions a register: one test using a register from +/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one +/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15]. +/// This helps detect incorrect REX prefix construction. +/// +/// (2) for any insn that mentions a byte register: one test for each of the +/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil], +/// [r8b .. r11b] and [r12b .. r15b]. This checks that +/// apparently-redundant REX prefixes are retained when required. +/// +/// (3) for any insn that contains an immediate field, check the following +/// cases: field is zero, field is in simm8 range (-128 .. 127), field is +/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some +/// instructions that require a 32-bit immediate have a short-form encoding +/// when the imm is in simm8 range. +/// +/// Rules (1), (2) and (3) don't apply for registers within address expressions +/// (`Addr`s). Those are already pretty well tested, and the registers in them +/// don't have any effect on the containing instruction (apart from possibly +/// require REX prefix bits). +/// +/// When choosing registers for a test, avoid using registers with the same +/// offset within a given group. For example, don't use rax and r8, since they +/// both have the lowest 3 bits as 000, and so the test won't detect errors +/// where those 3-bit register sub-fields are confused by the emitter. Instead +/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl +/// and bpl since they have the same offset in their group; use instead (eg) cl +/// and sil. +/// +/// For all instructions, also add a test that uses only low-half registers +/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX +/// prefixes are correctly omitted. 
This low-half restriction must apply to +/// _all_ registers in the insn, even those in address expressions. +/// +/// Following these rules creates large numbers of test cases, but it's the +/// only way to make the emitter reliable. +/// +/// Known possible improvements: +/// +/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we +/// care?) +pub(crate) fn emit( + inst: &Inst, + sink: &mut MachBuffer<Inst>, + info: &EmitInfo, + state: &mut EmitState, +) { + if let Some(iset_requirement) = inst.isa_requirement() { + match iset_requirement { + // Cranelift assumes SSE2 at least. + InstructionSet::SSE | InstructionSet::SSE2 => {} + InstructionSet::SSSE3 => assert!(info.isa_flags.has_ssse3()), + InstructionSet::SSE41 => assert!(info.isa_flags.has_sse41()), + InstructionSet::SSE42 => assert!(info.isa_flags.has_sse42()), + } + } + + match inst { + Inst::AluRmiR { + is_64, + op, + src, + dst: reg_g, + } => { + let rex = if *is_64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + + if *op == AluRmiROpcode::Mul { + // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so + // we have to special-case it. + match src { + RegMemImm::Reg { reg: reg_e } => { + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + 0x0FAF, + 2, + reg_g.to_reg(), + *reg_e, + rex, + ); + } + + RegMemImm::Mem { addr } => { + let amode = addr.finalize(state); + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x0FAF, + 2, + reg_g.to_reg(), + &amode, + rex, + ); + } + + RegMemImm::Imm { simm32 } => { + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + let opcode = if use_imm8 { 0x6B } else { 0x69 }; + // Yes, really, reg_g twice. + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcode, + 1, + reg_g.to_reg(), + reg_g.to_reg(), + rex, + ); + emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32); + } + } + } else { + let (opcode_r, opcode_m, subopcode_i) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0), + AluRmiROpcode::Sub => (0x29, 0x2B, 5), + AluRmiROpcode::And => (0x21, 0x23, 4), + AluRmiROpcode::Or => (0x09, 0x0B, 1), + AluRmiROpcode::Xor => (0x31, 0x33, 6), + AluRmiROpcode::Mul => panic!("unreachable"), + }; + + match src { + RegMemImm::Reg { reg: reg_e } => { + // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R + // duality). Do this too, so as to be able to compare generated machine + // code easily. + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcode_r, + 1, + *reg_e, + reg_g.to_reg(), + rex, + ); + // NB: if this is ever extended to handle byte size ops, be sure to retain + // redundant REX prefixes. + } + + RegMemImm::Mem { addr } => { + // Here we revert to the "normal" G-E ordering. + let amode = addr.finalize(state); + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcode_m, + 1, + reg_g.to_reg(), + &amode, + rex, + ); + } + + RegMemImm::Imm { simm32 } => { + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + let opcode = if use_imm8 { 0x83 } else { 0x81 }; + // And also here we use the "normal" G-E ordering. 
+ let enc_g = int_reg_enc(reg_g.to_reg()); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + opcode, + 1, + subopcode_i, + enc_g, + rex, + ); + emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32); + } + } + } + } + + Inst::UnaryRmR { size, op, src, dst } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!(), + }; + + let (opcode, num_opcodes) = match op { + UnaryRmROpcode::Bsr => (0x0fbd, 2), + UnaryRmROpcode::Bsf => (0x0fbc, 2), + }; + + match src { + RegMem::Reg { reg: src } => emit_std_reg_reg( + sink, + prefix, + opcode, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ), + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + num_opcodes, + dst.to_reg(), + &amode, + rex_flags, + ); + } + } + } + + Inst::Not { size, src } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let subopcode = 2; + let src = int_reg_enc(src.to_reg()); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + + Inst::Neg { size, src } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let subopcode = 3; + let src = int_reg_enc(src.to_reg()); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + + Inst::Div { + size, + signed, + divisor, + } => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + let loc = state.cur_srcloc(); + sink.add_trap(loc, TrapCode::IntegerDivisionByZero); + + let subopcode = if *signed { 7 } else { 6 }; + match divisor { + RegMem::Reg { reg } => { + let src = int_reg_enc(*reg); + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) + } + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_enc_mem(sink, state, prefix, opcode, 1, subopcode, &amode, rex_flags); + } + } + } + + Inst::MulHi { size, signed, rhs } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!(), + }; + + let subopcode = if *signed { 5 } else { 4 }; + match rhs { + RegMem::Reg { reg } => { + let src = int_reg_enc(*reg); + emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags) + } + RegMem::Mem { addr: src } => { + let amode = src.finalize(state); + emit_std_enc_mem(sink, state, prefix, 0xF7, 1, subopcode, &amode, rex_flags); + } + } + } + + Inst::SignExtendData { size } => match size { + 1 => { + sink.put1(0x66); + sink.put1(0x98); + } + 2 => { + sink.put1(0x66); + sink.put1(0x99); + } + 4 => sink.put1(0x99), + 8 => { 
+ sink.put1(0x48); + sink.put1(0x99); + } + _ => unreachable!(), + }, + + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + tmp, + } => { + // Generates the following code sequence: + // + // ;; check divide by zero: + // cmp 0 %divisor + // jnz $after_trap + // ud2 + // $after_trap: + // + // ;; for signed modulo/div: + // cmp -1 %divisor + // jnz $do_op + // ;; for signed modulo, result is 0 + // mov #0, %rdx + // j $done + // ;; for signed div, check for integer overflow against INT_MIN of the right size + // cmp INT_MIN, %rax + // jnz $do_op + // ud2 + // + // $do_op: + // ;; if signed + // cdq ;; sign-extend from rax into rdx + // ;; else + // mov #0, %rdx + // idiv %divisor + // + // $done: + debug_assert!(info.flags().avoid_div_traps()); + + // Check if the divisor is zero, first. + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0), divisor.to_reg()); + inst.emit(sink, info, state); + + let inst = Inst::trap_if(CC::Z, TrapCode::IntegerDivisionByZero); + inst.emit(sink, info, state); + + let (do_op, done_label) = if kind.is_signed() { + // Now check if the divisor is -1. + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0xffffffff), divisor.to_reg()); + inst.emit(sink, info, state); + + let do_op = sink.get_label(); + + // If not equal, jump to do-op. + one_way_jmp(sink, CC::NZ, do_op); + + // Here, divisor == -1. + if !kind.is_div() { + // x % -1 = 0; put the result into the destination, $rdx. + let done_label = sink.get_label(); + + let inst = Inst::imm( + OperandSize::from_bytes(*size as u32), + 0, + Writable::from_reg(regs::rdx()), + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done_label); + inst.emit(sink, info, state); + + (Some(do_op), Some(done_label)) + } else { + // Check for integer overflow. + if *size == 8 { + let tmp = tmp.expect("temporary for i64 sdiv"); + + let inst = Inst::imm(OperandSize::Size64, 0x8000000000000000, tmp); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(8, RegMemImm::reg(tmp.to_reg()), regs::rax()); + inst.emit(sink, info, state); + } else { + let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0x80000000), regs::rax()); + inst.emit(sink, info, state); + } + + // If not equal, jump over the trap. + let inst = Inst::trap_if(CC::Z, TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + + (Some(do_op), None) + } + } else { + (None, None) + }; + + if let Some(do_op) = do_op { + sink.bind_label(do_op); + } + + assert!( + *size > 1, + "CheckedDivOrRemSeq for i8 is not yet implemented" + ); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of rax into rdx, for signed opcodes. + let inst = Inst::sign_extend_data(*size); + inst.emit(sink, info, state); + } else { + // zero for unsigned opcodes. + let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx())); + inst.emit(sink, info, state); + } + + let inst = Inst::div(*size, kind.is_signed(), RegMem::reg(divisor.to_reg())); + inst.emit(sink, info, state); + + // Lowering takes care of moving the result back into the right register, see comment + // there. + + if let Some(done) = done_label { + sink.bind_label(done); + } + } + + Inst::Imm { + dst_is_64, + simm64, + dst, + } => { + let enc_dst = int_reg_enc(dst.to_reg()); + if *dst_is_64 { + if low32_will_sign_extend_to_64(*simm64) { + // Sign-extended move imm32. 
+ emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xC7, + 1, + /* subopcode */ 0, + enc_dst, + RexFlags::set_w(), + ); + sink.put4(*simm64 as u32); + } else { + sink.put1(0x48 | ((enc_dst >> 3) & 1)); + sink.put1(0xB8 | (enc_dst & 7)); + sink.put8(*simm64); + } + } else { + if ((enc_dst >> 3) & 1) == 1 { + sink.put1(0x41); + } + sink.put1(0xB8 | (enc_dst & 7)); + sink.put4(*simm64 as u32); + } + } + + Inst::MovRR { is_64, src, dst } => { + let rex = if *is_64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex); + } + + Inst::MovzxRmR { ext_mode, src, dst } => { + let (opcodes, num_opcodes, mut rex_flags) = match ext_mode { + ExtMode::BL => { + // MOVZBL is (REX.W==0) 0F B6 /r + (0x0FB6, 2, RexFlags::clear_w()) + } + ExtMode::BQ => { + // MOVZBQ is (REX.W==1) 0F B6 /r + // I'm not sure why the Intel manual offers different + // encodings for MOVZBQ than for MOVZBL. AIUI they should + // achieve the same, since MOVZBL is just going to zero out + // the upper half of the destination anyway. + (0x0FB6, 2, RexFlags::set_w()) + } + ExtMode::WL => { + // MOVZWL is (REX.W==0) 0F B7 /r + (0x0FB7, 2, RexFlags::clear_w()) + } + ExtMode::WQ => { + // MOVZWQ is (REX.W==1) 0F B7 /r + (0x0FB7, 2, RexFlags::set_w()) + } + ExtMode::LQ => { + // This is just a standard 32 bit load, and we rely on the + // default zero-extension rule to perform the extension. + // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we + // don't do here, since it's the same encoding size. + // MOV r/m32, r32 is (REX.W==0) 8B /r + (0x8B, 1, RexFlags::clear_w()) + } + }; + + match src { + RegMem::Reg { reg: src } => { + match ext_mode { + ExtMode::BL | ExtMode::BQ => { + // A redundant REX prefix must be emitted for certain register inputs. + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + } + _ => {} + } + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ) + } + + RegMem::Mem { addr: src } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + src, + rex_flags, + ) + } + } + } + + Inst::Mov64MR { src, dst } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x8B, + 1, + dst.to_reg(), + src, + RexFlags::set_w(), + ) + } + + Inst::LoadEffectiveAddress { addr, dst } => { + let amode = addr.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x8D, + 1, + dst.to_reg(), + &amode, + RexFlags::set_w(), + ); + } + + Inst::MovsxRmR { ext_mode, src, dst } => { + let (opcodes, num_opcodes, mut rex_flags) = match ext_mode { + ExtMode::BL => { + // MOVSBL is (REX.W==0) 0F BE /r + (0x0FBE, 2, RexFlags::clear_w()) + } + ExtMode::BQ => { + // MOVSBQ is (REX.W==1) 0F BE /r + (0x0FBE, 2, RexFlags::set_w()) + } + ExtMode::WL => { + // MOVSWL is (REX.W==0) 0F BF /r + (0x0FBF, 2, RexFlags::clear_w()) + } + ExtMode::WQ => { + // MOVSWQ is (REX.W==1) 0F BF /r + (0x0FBF, 2, RexFlags::set_w()) + } + ExtMode::LQ => { + // MOVSLQ is (REX.W==1) 63 /r + (0x63, 1, RexFlags::set_w()) + } + }; + + match src { + RegMem::Reg { reg: src } => { + match ext_mode { + ExtMode::BL | ExtMode::BQ => { + // A redundant REX prefix must be emitted for certain register inputs. 
+ let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + } + _ => {} + } + emit_std_reg_reg( + sink, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + *src, + rex_flags, + ) + } + + RegMem::Mem { addr: src } => { + let src = &src.finalize(state); + + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + opcodes, + num_opcodes, + dst.to_reg(), + src, + rex_flags, + ) + } + } + } + + Inst::MovRM { size, src, dst } => { + let dst = &dst.finalize(state); + + match size { + 1 => { + // This is one of the few places where the presence of a + // redundant REX prefix changes the meaning of the + // instruction. + let mut rex = RexFlags::clear_w(); + + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex.always_emit(); + }; + + // MOV r8, r/m8 is (REX.W==0) 88 /r + emit_std_reg_mem(sink, state, LegacyPrefixes::None, 0x88, 1, *src, dst, rex) + } + + 2 => { + // MOV r16, r/m16 is 66 (REX.W==0) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::_66, + 0x89, + 1, + *src, + dst, + RexFlags::clear_w(), + ) + } + + 4 => { + // MOV r32, r/m32 is (REX.W==0) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x89, + 1, + *src, + dst, + RexFlags::clear_w(), + ) + } + + 8 => { + // MOV r64, r/m64 is (REX.W==1) 89 /r + emit_std_reg_mem( + sink, + state, + LegacyPrefixes::None, + 0x89, + 1, + *src, + dst, + RexFlags::set_w(), + ) + } + + _ => panic!("x64::Inst::Mov_R_M::emit: unreachable"), + } + } + + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } => { + let enc_dst = int_reg_enc(dst.to_reg()); + let subopcode = match kind { + ShiftKind::RotateLeft => 0, + ShiftKind::RotateRight => 1, + ShiftKind::ShiftLeft => 4, + ShiftKind::ShiftRightLogical => 5, + ShiftKind::ShiftRightArithmetic => 7, + }; + + match num_bits { + None => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode + // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode + // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode + // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode + emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags); + } + + Some(num_bits) => { + let (opcode, prefix, rex_flags) = match size { + 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()), + 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("{}", size), + }; + + // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode + // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode + // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib + // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib + // When the shift amount is 1, there's an even shorter encoding, but we don't + // bother with that nicety here. 
+ emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags); + sink.put1(*num_bits); + } + } + } + + Inst::XmmRmiReg { opcode, src, dst } => { + let rex = RexFlags::clear_w(); + let prefix = LegacyPrefixes::_66; + if let RegMemImm::Imm { simm32 } = src { + let (opcode_bytes, reg_digit) = match opcode { + SseOpcode::Psllw => (0x0F71, 6), + SseOpcode::Pslld => (0x0F72, 6), + SseOpcode::Psllq => (0x0F73, 6), + SseOpcode::Psraw => (0x0F71, 4), + SseOpcode::Psrad => (0x0F72, 4), + SseOpcode::Psrlw => (0x0F71, 2), + SseOpcode::Psrld => (0x0F72, 2), + SseOpcode::Psrlq => (0x0F73, 2), + _ => panic!("invalid opcode: {}", opcode), + }; + let dst_enc = reg_enc(dst.to_reg()); + emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex); + let imm = (*simm32) + .try_into() + .expect("the immediate must be convertible to a u8"); + sink.put1(imm); + } else { + let opcode_bytes = match opcode { + SseOpcode::Psllw => 0x0FF1, + SseOpcode::Pslld => 0x0FF2, + SseOpcode::Psllq => 0x0FF3, + SseOpcode::Psraw => 0x0FE1, + SseOpcode::Psrad => 0x0FE2, + SseOpcode::Psrlw => 0x0FD1, + SseOpcode::Psrld => 0x0FD2, + SseOpcode::Psrlq => 0x0FD3, + _ => panic!("invalid opcode: {}", opcode), + }; + + match src { + RegMemImm::Reg { reg } => { + emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst.to_reg(), *reg, rex); + } + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode_bytes, + 2, + dst.to_reg(), + addr, + rex, + ); + } + RegMemImm::Imm { .. } => unreachable!(), + } + }; + } + + Inst::CmpRmiR { + size, + src: src_e, + dst: reg_g, + } => { + let mut prefix = LegacyPrefixes::None; + if *size == 2 { + prefix = LegacyPrefixes::_66; + } + + let mut rex = match size { + 8 => RexFlags::set_w(), + 4 | 2 => RexFlags::clear_w(), + 1 => { + let mut rex = RexFlags::clear_w(); + // Here, a redundant REX prefix changes the meaning of the instruction. + let enc_g = int_reg_enc(*reg_g); + if enc_g >= 4 && enc_g <= 7 { + rex.always_emit(); + } + rex + } + _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"), + }; + + match src_e { + RegMemImm::Reg { reg: reg_e } => { + if *size == 1 { + // Check whether the E register forces the use of a redundant REX. + let enc_e = int_reg_enc(*reg_e); + if enc_e >= 4 && enc_e <= 7 { + rex.always_emit(); + } + } + + // Use the swapped operands encoding, to stay consistent with the output of + // gcc/llvm. + let opcode = if *size == 1 { 0x38 } else { 0x39 }; + emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex); + } + + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + // Whereas here we revert to the "normal" G-E ordering. + let opcode = if *size == 1 { 0x3A } else { 0x3B }; + emit_std_reg_mem(sink, state, prefix, opcode, 1, *reg_g, addr, rex); + } + + RegMemImm::Imm { simm32 } => { + // FIXME JRS 2020Feb11: there are shorter encodings for + // cmp $imm, rax/eax/ax/al. + let use_imm8 = low8_will_sign_extend_to_32(*simm32); + + // And also here we use the "normal" G-E ordering. 
+ let opcode = if *size == 1 { + 0x80 + } else if use_imm8 { + 0x83 + } else { + 0x81 + }; + + let enc_g = int_reg_enc(*reg_g); + emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex); + emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32); + } + } + } + + Inst::Setcc { cc, dst } => { + let opcode = 0x0f90 + cc.get_enc() as u32; + let mut rex_flags = RexFlags::clear_w(); + rex_flags.always_emit(); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + opcode, + 2, + 0, + reg_enc(dst.to_reg()), + rex_flags, + ); + } + + Inst::Cmove { + size, + cc, + src, + dst: reg_g, + } => { + let (prefix, rex_flags) = match size { + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), + _ => unreachable!("invalid size spec for cmove"), + }; + let opcode = 0x0F40 + cc.get_enc() as u32; + match src { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex_flags); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + 2, + reg_g.to_reg(), + addr, + rex_flags, + ); + } + } + } + + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } => { + // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that + // this doesn't clobber flags. Make sure to not do so here. + let next = sink.get_label(); + + // Jump if cc is *not* set. + one_way_jmp(sink, cc.invert(), next); + + let op = if *is_64 { + SseOpcode::Movsd + } else { + SseOpcode::Movss + }; + let inst = Inst::xmm_unary_rm_r(op, src.clone(), *dst); + inst.emit(sink, info, state); + + sink.bind_label(next); + } + + Inst::Push64 { src } => { + match src { + RegMemImm::Reg { reg } => { + let enc_reg = int_reg_enc(*reg); + let rex = 0x40 | ((enc_reg >> 3) & 1); + if rex != 0x40 { + sink.put1(rex); + } + sink.put1(0x50 | (enc_reg & 7)); + } + + RegMemImm::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 6, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + + RegMemImm::Imm { simm32 } => { + if low8_will_sign_extend_to_64(*simm32) { + sink.put1(0x6A); + sink.put1(*simm32 as u8); + } else { + sink.put1(0x68); + sink.put4(*simm32); + } + } + } + } + + Inst::Pop64 { dst } => { + let enc_dst = int_reg_enc(dst.to_reg()); + if enc_dst >= 8 { + // 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant here. + sink.put1(0x41); + } + sink.put1(0x58 + (enc_dst & 7)); + } + + Inst::CallKnown { dest, opcode, .. } => { + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s); + } + sink.put1(0xE8); + // The addend adjusts for the difference between the end of the instruction and the + // beginning of the immediate field. + emit_reloc(sink, state, Reloc::X86CallPCRel4, &dest, -4); + sink.put4(0); + if opcode.is_call() { + let loc = state.cur_srcloc(); + sink.add_call_site(loc, *opcode); + } + } + + Inst::CallUnknown { dest, opcode, .. 
} => { + let start_offset = sink.cur_offset(); + match dest { + RegMem::Reg { reg } => { + let reg_enc = int_reg_enc(*reg); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xFF, + 1, + 2, /*subopcode*/ + reg_enc, + RexFlags::clear_w(), + ); + } + + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 2, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + } + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s); + } + if opcode.is_call() { + let loc = state.cur_srcloc(); + sink.add_call_site(loc, *opcode); + } + } + + Inst::Ret {} => sink.put1(0xC3), + + Inst::JmpKnown { dst } => { + let br_start = sink.cur_offset(); + let br_disp_off = br_start + 1; + let br_end = br_start + 5; + + sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32); + sink.add_uncond_branch(br_start, br_end, *dst); + + sink.put1(0xE9); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpIf { cc, taken } => { + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + + sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32); + // Since this is not a terminator, don't enroll in the branch inversion mechanism. + + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpCond { + cc, + taken, + not_taken, + } => { + // If taken. + let cond_start = sink.cur_offset(); + let cond_disp_off = cond_start + 2; + let cond_end = cond_start + 6; + + sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32); + let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00]; + sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]); + + sink.put1(0x0F); + sink.put1(0x80 + cc.get_enc()); + // Placeholder for the label value. + sink.put4(0x0); + + // If not taken. + let uncond_start = sink.cur_offset(); + let uncond_disp_off = uncond_start + 1; + let uncond_end = uncond_start + 5; + + sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32); + sink.add_uncond_branch(uncond_start, uncond_end, *not_taken); + + sink.put1(0xE9); + // Placeholder for the label value. + sink.put4(0x0); + } + + Inst::JmpUnknown { target } => { + match target { + RegMem::Reg { reg } => { + let reg_enc = int_reg_enc(*reg); + emit_std_enc_enc( + sink, + LegacyPrefixes::None, + 0xFF, + 1, + 4, /*subopcode*/ + reg_enc, + RexFlags::clear_w(), + ); + } + + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_enc_mem( + sink, + state, + LegacyPrefixes::None, + 0xFF, + 1, + 4, /*subopcode*/ + addr, + RexFlags::clear_w(), + ); + } + } + } + + Inst::JmpTableSeq { + idx, + tmp1, + tmp2, + ref targets, + default_target, + .. + } => { + // This sequence is *one* instruction in the vcode, and is expanded only here at + // emission time, because we cannot allow the regalloc to insert spills/reloads in + // the middle; we depend on hardcoded PC-rel addressing below. + // + // We don't have to worry about emitting islands, because the only label-use type has a + // maximum range of 2 GB. If we later consider using shorter-range label references, + // this will need to be revisited. + + // Save index in a tmp (the live range of ridx only goes to start of this + // sequence; rtmp1 or rtmp2 may overwrite it). 
+ + // We generate the following sequence: + // ;; generated by lowering: cmp #jmp_table_size, %idx + // jnb $default_target + // movl %idx, %tmp2 + // lea start_of_jump_table_offset(%rip), %tmp1 + // movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4 + // addq %tmp2, %tmp1 + // j *%tmp1 + // $start_of_jump_table: + // -- jump table entries + one_way_jmp(sink, CC::NB, *default_target); // idx unsigned >= jmp table size + + // Copy the index (and make sure to clear the high 32-bits lane of tmp2). + let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(*idx), *tmp2); + inst.emit(sink, info, state); + + // Load base address of jump table. + let start_of_jumptable = sink.get_label(); + let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), *tmp1); + inst.emit(sink, info, state); + + // Load value out of the jump table. It's a relative offset to the target block, so it + // might be negative; use a sign-extension. + let inst = Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg_reg_shift(0, tmp1.to_reg(), tmp2.to_reg(), 2)), + *tmp2, + ); + inst.emit(sink, info, state); + + // Add base of jump table to jump-table-sourced block offset. + let inst = Inst::alu_rmi_r( + true, /* is_64 */ + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + *tmp1, + ); + inst.emit(sink, info, state); + + // Branch to computed address. + let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg())); + inst.emit(sink, info, state); + + // Emit jump table (table of 32-bit offsets). + sink.bind_label(start_of_jumptable); + let jt_off = sink.cur_offset(); + for &target in targets.iter() { + let word_off = sink.cur_offset(); + // off_into_table is an addend here embedded in the label to be later patched at + // the end of codegen. The offset is initially relative to this jump table entry; + // with the extra addend, it'll be relative to the jump table's start, after + // patching. + let off_into_table = word_off - jt_off; + sink.use_label_at_offset(word_off, target, LabelUse::PCRel32); + sink.put4(off_into_table); + } + } + + Inst::TrapIf { cc, trap_code } => { + let else_label = sink.get_label(); + + // Jump over if the invert of CC is set (i.e. CC is not set). + one_way_jmp(sink, cc.invert(), else_label); + + // Trap! 
+ let inst = Inst::trap(*trap_code); + inst.emit(sink, info, state); + + sink.bind_label(else_label); + } + + Inst::XmmUnaryRmR { + op, + src: src_e, + dst: reg_g, + } => { + let rex = RexFlags::clear_w(); + + let (prefix, opcode, num_opcodes) = match op { + SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2), + SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2), + SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3), + SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3), + SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3), + SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2), + SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2), + SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2), + SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg( + sink, + prefix, + opcode, + num_opcodes, + reg_g.to_reg(), + *reg_e, + rex, + ); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + num_opcodes, + reg_g.to_reg(), + addr, + rex, + ); + } + }; + } + + Inst::XmmRmR { + op, + src: src_e, + dst: reg_g, + } => { + let rex = RexFlags::clear_w(); + let (prefix, opcode, length) = match op { + SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2), + SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2), + SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2), + SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2), + SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2), + SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), + SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), + SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2), + SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2), + SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), + SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), + SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2), + SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2), + SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2), + SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2), + SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2), + SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2), + SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), + SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2), + SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2), + SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2), + SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2), + SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), + SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), + SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2), + 
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), + SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), + SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), + SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2), + SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2), + SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2), + SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2), + SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), + SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2), + SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), + SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), + SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2), + SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2), + SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2), + SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2), + SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3), + SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2), + SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2), + SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2), + SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3), + SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3), + SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2), + SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3), + SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2), + SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3), + SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3), + SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3), + SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2), + SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3), + SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2), + SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3), + SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3), + SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3), + SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2), + SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2), + SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2), + SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3), + SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2), + SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2), + SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2), + SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2), + SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2), + SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2), + SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2), + SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2), + SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), + SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), + SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), + SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2), + SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2), + SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2), + SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + length, + reg_g.to_reg(), + addr, + rex, + ); + } + } + } + + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } => { + // Generates the following sequence: + // cmpss/cmpsd %lhs, %rhs_dst + // jnz do_min_max + // jp propagate_nan + // + // ;; 
ordered and equal: propagate the sign bit (for -0 vs 0): + // {and,or}{ss,sd} %lhs, %rhs_dst + // j done + // + // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the + // ;; NaN value is returned), we add both inputs. + // propagate_nan: + // add{ss,sd} %lhs, %rhs_dst + // j done + // + // do_min_max: + // {min,max}{ss,sd} %lhs, %rhs_dst + // + // done: + let done = sink.get_label(); + let propagate_nan = sink.get_label(); + let do_min_max = sink.get_label(); + + let (add_op, cmp_op, and_op, or_op, min_max_op) = match size { + OperandSize::Size32 => ( + SseOpcode::Addss, + SseOpcode::Ucomiss, + SseOpcode::Andps, + SseOpcode::Orps, + if *is_min { + SseOpcode::Minss + } else { + SseOpcode::Maxss + }, + ), + OperandSize::Size64 => ( + SseOpcode::Addsd, + SseOpcode::Ucomisd, + SseOpcode::Andpd, + SseOpcode::Orpd, + if *is_min { + SseOpcode::Minsd + } else { + SseOpcode::Maxsd + }, + ), + }; + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(*lhs), rhs_dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NZ, do_min_max); + one_way_jmp(sink, CC::P, propagate_nan); + + // Ordered and equal. The operands are bit-identical unless they are zero + // and negative zero. These instructions merge the sign bits in that + // case, and are no-ops otherwise. + let op = if *is_min { or_op } else { and_op }; + let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + // x86's min/max are not symmetric; if either operand is a NaN, they return the + // read-only operand: perform an addition between the two operands, which has the + // desired NaN propagation effects. + sink.bind_label(propagate_nan); + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::P, done); + + sink.bind_label(do_min_max); + let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, info, state); + + sink.bind_label(done); + } + + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64, + } => { + let (prefix, opcode, len) = match op { + SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), + SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2), + SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2), + SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2), + SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3), + SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), + SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), + SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), + SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3), + SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2), + SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3), + SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + let rex = if *is64 { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + let regs_swapped = match *op { + // These opcodes (and not the SSE2 version of PEXTRW) flip the operand + // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field. + SseOpcode::Pextrb | SseOpcode::Pextrd => true, + // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg, + // `src` in ModRM's r/m field. 
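+ // E.g. `pshufd $0, %xmm1, %xmm2` is 66 0F 70 D1 00 (dst %xmm2 in
+ // ModRM.reg, src %xmm1 in ModRM.r/m), whereas `pextrd $1, %xmm2, %eax`
+ // is 66 0F 3A 16 D0 01, with the operands the other way around.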
+ _ => false, + }; + match src { + RegMem::Reg { reg } => { + if regs_swapped { + emit_std_reg_reg(sink, prefix, opcode, len, *reg, dst.to_reg(), rex); + } else { + emit_std_reg_reg(sink, prefix, opcode, len, dst.to_reg(), *reg, rex); + } + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + assert!( + !regs_swapped, + "No existing way to encode a mem argument in the ModRM r/m field." + ); + emit_std_reg_mem(sink, state, prefix, opcode, len, dst.to_reg(), addr, rex); + } + } + sink.put1(*imm); + } + + Inst::XmmLoadConst { src, dst, ty } => { + let load_offset = Amode::rip_relative(sink.get_label_for_constant(*src)); + let load = Inst::load(*ty, load_offset, *dst, ExtKind::None); + load.emit(sink, info, state); + } + + Inst::XmmUninitializedValue { .. } => { + // This instruction format only exists to declare a register as a `def`; no code is + // emitted. + } + + Inst::XmmMovRM { op, src, dst } => { + let (prefix, opcode) = match op { + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + let dst = &dst.finalize(state); + emit_std_reg_mem( + sink, + state, + prefix, + opcode, + 2, + *src, + dst, + RexFlags::clear_w(), + ); + } + + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } => { + let (prefix, opcode, dst_first) = match op { + SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), + SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), + // Movd and movq use the same opcode; the presence of the REX prefix (set below) + // actually determines which is used. + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), + SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true), + SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true), + SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = match dst_size { + OperandSize::Size32 => RexFlags::clear_w(), + OperandSize::Size64 => RexFlags::set_w(), + }; + + let (src, dst) = if dst_first { + (dst.to_reg(), *src) + } else { + (*src, dst.to_reg()) + }; + + emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex); + } + + Inst::GprToXmm { + op, + src: src_e, + dst: reg_g, + src_size, + } => { + let (prefix, opcode) = match op { + // Movd and movq use the same opcode; the presence of the REX prefix (set below) + // actually determines which is used. 
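+ // E.g. `movd %eax, %xmm0` is 66 0F 6E C0, while `movq %rax, %xmm0`
+ // differs only by the REX.W byte: 66 48 0F 6E C0.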
+ SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E), + SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), + SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = match *src_size { + OperandSize::Size32 => RexFlags::clear_w(), + OperandSize::Size64 => RexFlags::set_w(), + }; + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, 2, reg_g.to_reg(), *reg_e, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcode, 2, reg_g.to_reg(), addr, rex); + } + } + } + + Inst::XmmCmpRmR { op, src, dst } => { + let rex = RexFlags::clear_w(); + let (prefix, opcode, len) = match op { + SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3), + SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2), + SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2), + _ => unimplemented!("Emit xmm cmp rm r"), + }; + + match src { + RegMem::Reg { reg } => { + emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcode, len, *dst, addr, rex); + } + } + } + + Inst::CvtUint64ToFloatSeq { + to_f64, + src, + dst, + tmp_gpr1, + tmp_gpr2, + } => { + // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a + // different sequence. + // + // Emit the following sequence: + // + // cmp 0, %src + // jl handle_negative + // + // ;; handle positive, which can't overflow + // cvtsi2sd/cvtsi2ss %src, %dst + // j done + // + // ;; handle negative: see below for an explanation of what it's doing. + // handle_negative: + // mov %src, %tmp_gpr1 + // shr $1, %tmp_gpr1 + // mov %src, %tmp_gpr2 + // and $1, %tmp_gpr2 + // or %tmp_gpr1, %tmp_gpr2 + // cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst + // addsd/addss %dst, %dst + // + // done: + + assert_ne!(src, tmp_gpr1); + assert_ne!(src, tmp_gpr2); + assert_ne!(tmp_gpr1, tmp_gpr2); + + let handle_negative = sink.get_label(); + let done = sink.get_label(); + + // If x seen as a signed int64 is not negative, a signed-conversion will do the right + // thing. + // TODO use tst src, src here. + let inst = Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::L, handle_negative); + + // Handle a positive int64, which is the "easy" case: a signed conversion will do the + // right thing. + emit_signed_cvt(sink, info, state, src.to_reg(), *dst, *to_f64); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + sink.bind_label(handle_negative); + + // Divide x by two to get it in range for the signed conversion, keep the LSB, and + // scale it back up on the FP side. 
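+ // In other words, compute tmp_gpr2 = (src >> 1) | (src & 1) -- a
+ // "round to odd" halving that preserves enough information for correct
+ // rounding -- convert that as a signed value, then double the result with
+ // the final add below.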
+ let inst = Inst::gen_move(*tmp_gpr1, src.to_reg(), types::I64); + inst.emit(sink, info, state); + + // tmp_gpr1 := src >> 1 + let inst = Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(1), *tmp_gpr1); + inst.emit(sink, info, state); + + let inst = Inst::gen_move(*tmp_gpr2, src.to_reg(), types::I64); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, /* 64bits */ + AluRmiROpcode::And, + RegMemImm::imm(1), + *tmp_gpr2, + ); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, /* 64bits */ + AluRmiROpcode::Or, + RegMemImm::reg(tmp_gpr1.to_reg()), + *tmp_gpr2, + ); + inst.emit(sink, info, state); + + emit_signed_cvt(sink, info, state, tmp_gpr2.to_reg(), *dst, *to_f64); + + let add_op = if *to_f64 { + SseOpcode::Addsd + } else { + SseOpcode::Addss + }; + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst); + inst.emit(sink, info, state); + + sink.bind_label(done); + } + + Inst::CvtFloatToSintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } => { + // Emits the following common sequence: + // + // cvttss2si/cvttsd2si %src, %dst + // cmp %dst, 1 + // jno done + // + // Then, for saturating conversions: + // + // ;; check for NaN + // cmpss/cmpsd %src, %src + // jnp not_nan + // xor %dst, %dst + // + // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is + // ;; already in %dst. + // xorpd %tmp_xmm, %tmp_xmm + // cmpss/cmpsd %src, %tmp_xmm + // jnb done + // mov/movaps $INT_MAX, %dst + // + // done: + // + // Then, for non-saturating conversions: + // + // ;; check for NaN + // cmpss/cmpsd %src, %src + // jnp not_nan + // ud2 trap BadConversionToInteger + // + // ;; check if INT_MIN was the correct result, against a magic constant: + // not_nan: + // movaps/mov $magic, %tmp_gpr + // movq/movd %tmp_gpr, %tmp_xmm + // cmpss/cmpsd %tmp_xmm, %src + // jnb/jnbe $check_positive + // ud2 trap IntegerOverflow + // + // ;; if positive, it was a real overflow + // check_positive: + // xorpd %tmp_xmm, %tmp_xmm + // cmpss/cmpsd %src, %tmp_xmm + // jnb done + // ud2 trap IntegerOverflow + // + // done: + + let src = src.to_reg(); + + let (cast_op, cmp_op, trunc_op) = match src_size { + OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si), + OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si), + }; + + let done = sink.get_label(); + let not_nan = sink.get_label(); + + // The truncation. + let inst = Inst::xmm_to_gpr(trunc_op, src, *dst, *dst_size); + inst.emit(sink, info, state); + + // Compare against 1, in case of overflow the dst operand was INT_MIN. + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(1), dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NO, done); // no overflow => done + + // Check for NaN. + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN + + if *is_saturating { + // For NaN, emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + + sink.bind_label(not_nan); + + // If the input was positive, saturate to INT_MAX. + + // Zero out tmp_xmm. 
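+ // xorpd-ing a register with itself yields +0.0, so the ucomiss/ucomisd
+ // below effectively compares the input against zero.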
+ let inst = + Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, info, state); + + // Jump if >= to done. + one_way_jmp(sink, CC::NB, done); + + // Otherwise, put INT_MAX. + if *dst_size == OperandSize::Size64 { + let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, *dst); + inst.emit(sink, info, state); + } else { + let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, *dst); + inst.emit(sink, info, state); + } + } else { + let check_positive = sink.get_label(); + + let inst = Inst::trap(TrapCode::BadConversionToInteger); + inst.emit(sink, info, state); + + // Check if INT_MIN was the correct result: determine the smallest floating point + // number that would convert to INT_MIN, put it in a temporary register, and compare + // against the src register. + // If the src register is less (or in some cases, less-or-equal) than the threshold, + // trap! + + sink.bind_label(not_nan); + + let mut no_overflow_cc = CC::NB; // >= + let output_bits = dst_size.to_bits(); + match *src_size { + OperandSize::Size32 => { + let cst = Ieee32::pow2(output_bits - 1).neg().bits(); + let inst = Inst::imm(OperandSize::Size32, cst as u64, *tmp_gpr); + inst.emit(sink, info, state); + } + OperandSize::Size64 => { + // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, + // so there are values less than -2^(N-1) that convert correctly to INT_MIN. + let cst = if output_bits < 64 { + no_overflow_cc = CC::NBE; // > + Ieee64::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee64::pow2(output_bits - 1).neg() + }; + let inst = Inst::imm(OperandSize::Size64, cst.bits(), *tmp_gpr); + inst.emit(sink, info, state); + } + } + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src); + inst.emit(sink, info, state); + + // jump over trap if src >= or > threshold + one_way_jmp(sink, no_overflow_cc, check_positive); + + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + + // If positive, it was a real overflow. + + sink.bind_label(check_positive); + + // Zero out the tmp_xmm register. + let inst = + Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src + + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + sink.bind_label(done); + } + + Inst::CvtFloatToUintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } => { + // The only difference in behavior between saturating and non-saturating is how we + // handle errors. 
Emits the following sequence: + // + // movaps/mov 2**(int_width - 1), %tmp_gpr + // movq/movd %tmp_gpr, %tmp_xmm + // cmpss/cmpsd %tmp_xmm, %src + // jnb is_large + // + // ;; check for NaN inputs + // jnp not_nan + // -- non-saturating: ud2 trap BadConversionToInteger + // -- saturating: xor %dst, %dst; j done + // + // not_nan: + // cvttss2si/cvttsd2si %src, %dst + // cmp 0, %dst + // jnl done + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: xor %dst, %dst; j done + // + // is_large: + // subss/subsd %tmp_xmm, %src ; <-- we clobber %src here + // cvttss2si/cvttss2sd %tmp_x, %dst + // cmp 0, %dst + // jnl next_is_large + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: movaps $UINT_MAX, %dst; j done + // + // next_is_large: + // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers + // + // done: + + assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!"); + + let (sub_op, cast_op, cmp_op, trunc_op) = if *src_size == OperandSize::Size64 { + ( + SseOpcode::Subsd, + SseOpcode::Movq, + SseOpcode::Ucomisd, + SseOpcode::Cvttsd2si, + ) + } else { + ( + SseOpcode::Subss, + SseOpcode::Movd, + SseOpcode::Ucomiss, + SseOpcode::Cvttss2si, + ) + }; + + let done = sink.get_label(); + + let cst = if *src_size == OperandSize::Size64 { + Ieee64::pow2(dst_size.to_bits() - 1).bits() + } else { + Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64 + }; + + let inst = Inst::imm(*src_size, cst, *tmp_gpr); + inst.emit(sink, info, state); + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, info, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src.to_reg()); + inst.emit(sink, info, state); + + let handle_large = sink.get_label(); + one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold + + let not_nan = sink.get_label(); + one_way_jmp(sink, CC::NP, not_nan); // jump over trap if not NaN + + if *is_saturating { + // Emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + // Trap. + let inst = Inst::trap(TrapCode::BadConversionToInteger); + inst.emit(sink, info, state); + } + + sink.bind_label(not_nan); + + // Actual truncation for small inputs: if the result is not positive, then we had an + // overflow. + + let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg()); + inst.emit(sink, info, state); + + one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done + + if *is_saturating { + // The input was "small" (< 2**(width -1)), so the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + // Trap. + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + // Now handle large inputs. 
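+ // (The cvttss2si/cvttsd2si instructions only produce signed results, so inputs that are
+ // >= 2^(dst_width - 1) are first shifted down into signed range by subtracting
+ // 2^(dst_width - 1) (held in tmp_xmm), truncated, and then 2^(dst_width - 1) is added
+ // back in the integer domain.)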
+ + sink.bind_label(handle_large); + + let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src); + inst.emit(sink, info, state); + + let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size); + inst.emit(sink, info, state); + + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(0), dst.to_reg()); + inst.emit(sink, info, state); + + let next_is_large = sink.get_label(); + one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large + + if *is_saturating { + // The input was "large" (>= 2**(width -1)), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + let inst = Inst::imm( + OperandSize::Size64, + if *dst_size == OperandSize::Size64 { + u64::max_value() + } else { + u32::max_value() as u64 + }, + *dst, + ); + inst.emit(sink, info, state); + + let inst = Inst::jmp_known(done); + inst.emit(sink, info, state); + } else { + let inst = Inst::trap(TrapCode::IntegerOverflow); + inst.emit(sink, info, state); + } + + sink.bind_label(next_is_large); + + if *dst_size == OperandSize::Size64 { + let inst = Inst::imm(OperandSize::Size64, 1 << 63, *tmp_gpr); + inst.emit(sink, info, state); + + let inst = Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp_gpr.to_reg()), + *dst, + ); + inst.emit(sink, info, state); + } else { + let inst = + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(1 << 31), *dst); + inst.emit(sink, info, state); + } + + sink.bind_label(done); + } + + Inst::LoadExtName { dst, name, offset } => { + // The full address can be encoded in the register, with a relocation. + // Generates: movabsq $name, %dst + let enc_dst = int_reg_enc(dst.to_reg()); + sink.put1(0x48 | ((enc_dst >> 3) & 1)); + sink.put1(0xB8 | (enc_dst & 7)); + emit_reloc(sink, state, Reloc::Abs8, name, *offset); + if info.flags().emit_all_ones_funcaddrs() { + sink.put8(u64::max_value()); + } else { + sink.put8(0); + } + } + + Inst::LockCmpxchg { ty, src, dst } => { + // lock cmpxchg{b,w,l,q} %src, (dst) + // Note that 0xF0 is the Lock prefix. + let (prefix, rex, opcodes) = match *ty { + types::I8 => { + let mut rex_flags = RexFlags::clear_w(); + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + (LegacyPrefixes::_F0, rex_flags, 0x0FB0) + } + types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), + types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1), + types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1), + _ => unreachable!(), + }; + let amode = dst.finalize(state); + emit_std_reg_mem(sink, state, prefix, opcodes, 2, *src, &amode, rex); + } + + Inst::AtomicRmwSeq { ty, op } => { + // Emit this: + // + // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value + // again: + // movq %rax, %r11 // rax = old value, r11 = old value + // `op`q %r10, %r11 // rax = old value, r11 = new value + // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value + // jnz again // If this is taken, rax will have a "revised" old value + // + // Operand conventions: + // IN: %r9 (addr), %r10 (2nd arg for `op`) + // OUT: %rax (old value), %r11 (trashed), %rflags (trashed) + // + // In the case where the operation is 'xchg', the "`op`q" instruction is instead + // movq %r10, %r11 + // so that we simply write in the destination, the "2nd arg for `op`". 
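+ // (On failure, `lock cmpxchg` also loads the current memory value into %rax, so the
+ // loop below can jump straight back to `again` without an explicit reload.)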
+ let rax = regs::rax(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let rax_w = Writable::from_reg(rax); + let r11_w = Writable::from_reg(r11); + let amode = Amode::imm_reg(0, r9); + let again_label = sink.get_label(); + + // mov{zbq,zwq,zlq,q} (%r9), %rax + // No need to call `add_trap` here, since the `i1` emit will do that. + let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend); + i1.emit(sink, info, state); + + // again: + sink.bind_label(again_label); + + // movq %rax, %r11 + let i2 = Inst::mov_r_r(true, rax, r11_w); + i2.emit(sink, info, state); + + // opq %r10, %r11 + let r10_rmi = RegMemImm::reg(r10); + let i3 = if *op == inst_common::AtomicRmwOp::Xchg { + Inst::mov_r_r(true, r10, r11_w) + } else { + let alu_op = match op { + inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add, + inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub, + inst_common::AtomicRmwOp::And => AluRmiROpcode::And, + inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or, + inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor, + inst_common::AtomicRmwOp::Xchg => unreachable!(), + }; + Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w) + }; + i3.emit(sink, info, state); + + // lock cmpxchg{b,w,l,q} %r11, (%r9) + // No need to call `add_trap` here, since the `i4` emit will do that. + let i4 = Inst::LockCmpxchg { + ty: *ty, + src: r11, + dst: amode.into(), + }; + i4.emit(sink, info, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + + Inst::Fence { kind } => { + sink.put1(0x0F); + sink.put1(0xAE); + match kind { + FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0 + FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8 + FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8 + } + } + + Inst::Hlt => { + sink.put1(0xcc); + } + + Inst::Ud2 { trap_code } => { + let cur_srcloc = state.cur_srcloc(); + sink.add_trap(cur_srcloc, *trap_code); + if let Some(s) = state.take_stack_map() { + sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s); + } + sink.put1(0x0f); + sink.put1(0x0b); + } + + Inst::VirtualSPOffsetAdj { offset } => { + debug!( + "virtual sp offset adjusted by {} -> {}", + offset, + state.virtual_sp_offset + offset + ); + state.virtual_sp_offset += offset; + } + + Inst::Nop { len } => { + // These encodings can all be found in Intel's architecture manual, at the NOP + // instruction description. + let mut len = *len; + while len != 0 { + let emitted = u8::min(len, 9); + match emitted { + 0 => {} + 1 => sink.put1(0x90), // NOP + 2 => { + // 66 NOP + sink.put1(0x66); + sink.put1(0x90); + } + 3 => { + // NOP [EAX] + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x00); + } + 4 => { + // NOP 0(EAX), with 0 a 1-byte immediate. + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x40); + sink.put1(0x00); + } + 5 => { + // NOP [EAX, EAX, 1] + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x44); + sink.put1(0x00); + sink.put1(0x00); + } + 6 => { + // 66 NOP [EAX, EAX, 1] + sink.put1(0x66); + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x44); + sink.put1(0x00); + sink.put1(0x00); + } + 7 => { + // NOP 0[EAX], but 0 is a 4 bytes immediate. + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x80); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + 8 => { + // NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate. 
+ sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x84); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + 9 => { + // 66 NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate. + sink.put1(0x66); + sink.put1(0x0F); + sink.put1(0x1F); + sink.put1(0x84); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + sink.put1(0x00); + } + _ => unreachable!(), + } + len -= emitted; + } + } + + Inst::EpiloguePlaceholder => { + // Generate no code. + } + } + + state.clear_post_insn(); +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs new file mode 100644 index 0000000000..06092d498a --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/emit_tests.rs @@ -0,0 +1,3593 @@ +//! Tests for the emitter +//! +//! See comments at the top of `fn x64_emit` for advice on how to create reliable test cases. +//! +//! to see stdout: cargo test -- --nocapture +//! +//! for this specific case, as of 24 Aug 2020: +//! +//! cd to the top of your wasmtime tree, then: +//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \ +//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \ +//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \ +//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit + +use super::*; +use crate::isa::test_utils; +use crate::isa::x64; +use alloc::vec::Vec; + +#[test] +fn test_x64_emit() { + let rax = regs::rax(); + let rbx = regs::rbx(); + let rcx = regs::rcx(); + let rdx = regs::rdx(); + let rsi = regs::rsi(); + let rdi = regs::rdi(); + let rsp = regs::rsp(); + let rbp = regs::rbp(); + let r8 = regs::r8(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let r12 = regs::r12(); + let r13 = regs::r13(); + let r14 = regs::r14(); + let r15 = regs::r15(); + + let xmm0 = regs::xmm0(); + let xmm1 = regs::xmm1(); + let xmm2 = regs::xmm2(); + let xmm3 = regs::xmm3(); + let xmm4 = regs::xmm4(); + let xmm5 = regs::xmm5(); + let xmm6 = regs::xmm6(); + let xmm7 = regs::xmm7(); + let xmm8 = regs::xmm8(); + let xmm9 = regs::xmm9(); + let xmm10 = regs::xmm10(); + let xmm11 = regs::xmm11(); + let xmm12 = regs::xmm12(); + let xmm13 = regs::xmm13(); + let xmm14 = regs::xmm14(); + let xmm15 = regs::xmm15(); + + // And Writable<> versions of the same: + let w_rax = Writable::<Reg>::from_reg(rax); + let w_rbx = Writable::<Reg>::from_reg(rbx); + let w_rcx = Writable::<Reg>::from_reg(rcx); + let w_rdx = Writable::<Reg>::from_reg(rdx); + let w_rsi = Writable::<Reg>::from_reg(rsi); + let w_rdi = Writable::<Reg>::from_reg(rdi); + let _w_rsp = Writable::<Reg>::from_reg(rsp); + let _w_rbp = Writable::<Reg>::from_reg(rbp); + let w_r8 = Writable::<Reg>::from_reg(r8); + let w_r9 = Writable::<Reg>::from_reg(r9); + let _w_r10 = Writable::<Reg>::from_reg(r10); + let w_r11 = Writable::<Reg>::from_reg(r11); + let w_r12 = Writable::<Reg>::from_reg(r12); + let w_r13 = Writable::<Reg>::from_reg(r13); + let w_r14 = Writable::<Reg>::from_reg(r14); + let w_r15 = Writable::<Reg>::from_reg(r15); + + let w_xmm0 = Writable::<Reg>::from_reg(xmm0); + let w_xmm1 = Writable::<Reg>::from_reg(xmm1); + let w_xmm2 = Writable::<Reg>::from_reg(xmm2); + let w_xmm3 = Writable::<Reg>::from_reg(xmm3); + let w_xmm4 = Writable::<Reg>::from_reg(xmm4); + let w_xmm5 = Writable::<Reg>::from_reg(xmm5); + let w_xmm6 = Writable::<Reg>::from_reg(xmm6); + let w_xmm7 = 
Writable::<Reg>::from_reg(xmm7); + let w_xmm8 = Writable::<Reg>::from_reg(xmm8); + let w_xmm9 = Writable::<Reg>::from_reg(xmm9); + let w_xmm10 = Writable::<Reg>::from_reg(xmm10); + let w_xmm11 = Writable::<Reg>::from_reg(xmm11); + let w_xmm12 = Writable::<Reg>::from_reg(xmm12); + let w_xmm13 = Writable::<Reg>::from_reg(xmm13); + let w_xmm14 = Writable::<Reg>::from_reg(xmm14); + let w_xmm15 = Writable::<Reg>::from_reg(xmm15); + + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // ======================================================== + // Cases aimed at checking Addr-esses: IR (Imm + Reg) + // + // These are just a bunch of loads with all supported (by the emitter) + // permutations of address formats. + // + // Addr_IR, offset zero + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rax), w_rdi), + "488B38", + "movq 0(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rbx), w_rdi), + "488B3B", + "movq 0(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rcx), w_rdi), + "488B39", + "movq 0(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rdx), w_rdi), + "488B3A", + "movq 0(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rbp), w_rdi), + "488B7D00", + "movq 0(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rsp), w_rdi), + "488B3C24", + "movq 0(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rsi), w_rdi), + "488B3E", + "movq 0(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, rdi), w_rdi), + "488B3F", + "movq 0(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r8), w_rdi), + "498B38", + "movq 0(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r9), w_rdi), + "498B39", + "movq 0(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r10), w_rdi), + "498B3A", + "movq 0(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r11), w_rdi), + "498B3B", + "movq 0(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r12), w_rdi), + "498B3C24", + "movq 0(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r13), w_rdi), + "498B7D00", + "movq 0(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r14), w_rdi), + "498B3E", + "movq 0(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0, r15), w_rdi), + "498B3F", + "movq 0(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset max simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rax), w_rdi), + "488B787F", + "movq 127(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rbx), w_rdi), + "488B7B7F", + "movq 127(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rcx), w_rdi), + "488B797F", + "movq 127(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rdx), w_rdi), + "488B7A7F", + "movq 127(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rbp), w_rdi), + "488B7D7F", + "movq 127(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rsp), w_rdi), + "488B7C247F", + "movq 127(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rsi), w_rdi), + "488B7E7F", + "movq 127(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, rdi), w_rdi), + "488B7F7F", + "movq 127(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r8), w_rdi), + "498B787F", + "movq 
127(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r9), w_rdi), + "498B797F", + "movq 127(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r10), w_rdi), + "498B7A7F", + "movq 127(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r11), w_rdi), + "498B7B7F", + "movq 127(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r12), w_rdi), + "498B7C247F", + "movq 127(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r13), w_rdi), + "498B7D7F", + "movq 127(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r14), w_rdi), + "498B7E7F", + "movq 127(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(127, r15), w_rdi), + "498B7F7F", + "movq 127(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset min simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rax), w_rdi), + "488B7880", + "movq -128(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbx), w_rdi), + "488B7B80", + "movq -128(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rcx), w_rdi), + "488B7980", + "movq -128(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdx), w_rdi), + "488B7A80", + "movq -128(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rbp), w_rdi), + "488B7D80", + "movq -128(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsp), w_rdi), + "488B7C2480", + "movq -128(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rsi), w_rdi), + "488B7E80", + "movq -128(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, rdi), w_rdi), + "488B7F80", + "movq -128(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r8), w_rdi), + "498B7880", + "movq -128(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r9), w_rdi), + "498B7980", + "movq -128(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r10), w_rdi), + "498B7A80", + "movq -128(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r11), w_rdi), + "498B7B80", + "movq -128(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r12), w_rdi), + "498B7C2480", + "movq -128(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r13), w_rdi), + "498B7D80", + "movq -128(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r14), w_rdi), + "498B7E80", + "movq -128(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-128i32 as u32, r15), w_rdi), + "498B7F80", + "movq -128(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset smallest positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rax), w_rdi), + "488BB880000000", + "movq 128(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rbx), w_rdi), + "488BBB80000000", + "movq 128(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rcx), w_rdi), + "488BB980000000", + "movq 128(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rdx), w_rdi), + "488BBA80000000", + "movq 128(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rbp), 
w_rdi), + "488BBD80000000", + "movq 128(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rsp), w_rdi), + "488BBC2480000000", + "movq 128(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rsi), w_rdi), + "488BBE80000000", + "movq 128(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, rdi), w_rdi), + "488BBF80000000", + "movq 128(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r8), w_rdi), + "498BB880000000", + "movq 128(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r9), w_rdi), + "498BB980000000", + "movq 128(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r10), w_rdi), + "498BBA80000000", + "movq 128(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r11), w_rdi), + "498BBB80000000", + "movq 128(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r12), w_rdi), + "498BBC2480000000", + "movq 128(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r13), w_rdi), + "498BBD80000000", + "movq 128(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r14), w_rdi), + "498BBE80000000", + "movq 128(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(128, r15), w_rdi), + "498BBF80000000", + "movq 128(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset smallest negative simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rax), w_rdi), + "488BB87FFFFFFF", + "movq -129(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbx), w_rdi), + "488BBB7FFFFFFF", + "movq -129(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rcx), w_rdi), + "488BB97FFFFFFF", + "movq -129(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdx), w_rdi), + "488BBA7FFFFFFF", + "movq -129(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rbp), w_rdi), + "488BBD7FFFFFFF", + "movq -129(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsp), w_rdi), + "488BBC247FFFFFFF", + "movq -129(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rsi), w_rdi), + "488BBE7FFFFFFF", + "movq -129(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, rdi), w_rdi), + "488BBF7FFFFFFF", + "movq -129(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r8), w_rdi), + "498BB87FFFFFFF", + "movq -129(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r9), w_rdi), + "498BB97FFFFFFF", + "movq -129(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r10), w_rdi), + "498BBA7FFFFFFF", + "movq -129(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r11), w_rdi), + "498BBB7FFFFFFF", + "movq -129(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r12), w_rdi), + "498BBC247FFFFFFF", + "movq -129(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r13), w_rdi), + "498BBD7FFFFFFF", + "movq -129(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r14), w_rdi), + "498BBE7FFFFFFF", + "movq -129(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-129i32 as u32, r15), w_rdi), + "498BBF7FFFFFFF", + "movq 
-129(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset large positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rax), w_rdi), + "488BB877207317", + "movq 393420919(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbx), w_rdi), + "488BBB77207317", + "movq 393420919(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rcx), w_rdi), + "488BB977207317", + "movq 393420919(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdx), w_rdi), + "488BBA77207317", + "movq 393420919(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rbp), w_rdi), + "488BBD77207317", + "movq 393420919(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsp), w_rdi), + "488BBC2477207317", + "movq 393420919(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rsi), w_rdi), + "488BBE77207317", + "movq 393420919(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, rdi), w_rdi), + "488BBF77207317", + "movq 393420919(%rdi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r8), w_rdi), + "498BB877207317", + "movq 393420919(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r9), w_rdi), + "498BB977207317", + "movq 393420919(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r10), w_rdi), + "498BBA77207317", + "movq 393420919(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r11), w_rdi), + "498BBB77207317", + "movq 393420919(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r12), w_rdi), + "498BBC2477207317", + "movq 393420919(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r13), w_rdi), + "498BBD77207317", + "movq 393420919(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r14), w_rdi), + "498BBE77207317", + "movq 393420919(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(0x17732077, r15), w_rdi), + "498BBF77207317", + "movq 393420919(%r15), %rdi", + )); + + // ======================================================== + // Addr_IR, offset large negative simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rax), w_rdi), + "488BB8D9A6BECE", + "movq -826366247(%rax), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbx), w_rdi), + "488BBBD9A6BECE", + "movq -826366247(%rbx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rcx), w_rdi), + "488BB9D9A6BECE", + "movq -826366247(%rcx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdx), w_rdi), + "488BBAD9A6BECE", + "movq -826366247(%rdx), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rbp), w_rdi), + "488BBDD9A6BECE", + "movq -826366247(%rbp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsp), w_rdi), + "488BBC24D9A6BECE", + "movq -826366247(%rsp), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rsi), w_rdi), + "488BBED9A6BECE", + "movq -826366247(%rsi), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, rdi), w_rdi), + "488BBFD9A6BECE", + "movq -826366247(%rdi), %rdi", + )); + insns.push(( + 
Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r8), w_rdi), + "498BB8D9A6BECE", + "movq -826366247(%r8), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r9), w_rdi), + "498BB9D9A6BECE", + "movq -826366247(%r9), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r10), w_rdi), + "498BBAD9A6BECE", + "movq -826366247(%r10), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r11), w_rdi), + "498BBBD9A6BECE", + "movq -826366247(%r11), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r12), w_rdi), + "498BBC24D9A6BECE", + "movq -826366247(%r12), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r13), w_rdi), + "498BBDD9A6BECE", + "movq -826366247(%r13), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r14), w_rdi), + "498BBED9A6BECE", + "movq -826366247(%r14), %rdi", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg(-0x31415927i32 as u32, r15), w_rdi), + "498BBFD9A6BECE", + "movq -826366247(%r15), %rdi", + )); + + // ======================================================== + // Cases aimed at checking Addr-esses: IRRS (Imm + Reg + (Reg << Shift)) + // Note these don't check the case where the index reg is RSP, since we + // don't encode any of those. + // + // Addr_IRRS, offset max simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rax, 0), w_r11), + "4C8B5C007F", + "movq 127(%rax,%rax,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rax, 1), w_r11), + "4C8B5C477F", + "movq 127(%rdi,%rax,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rax, 2), w_r11), + "4D8B5C807F", + "movq 127(%r8,%rax,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rax, 3), w_r11), + "4D8B5CC77F", + "movq 127(%r15,%rax,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rax, rdi, 3), w_r11), + "4C8B5CF87F", + "movq 127(%rax,%rdi,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, rdi, rdi, 2), w_r11), + "4C8B5CBF7F", + "movq 127(%rdi,%rdi,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r8, rdi, 1), w_r11), + "4D8B5C787F", + "movq 127(%r8,%rdi,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(127, r15, rdi, 0), w_r11), + "4D8B5C3F7F", + "movq 127(%r15,%rdi,1), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset min simm8 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r8, 2), w_r11), + "4E8B5C8080", + "movq -128(%rax,%r8,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r8, 3), w_r11), + "4E8B5CC780", + "movq -128(%rdi,%r8,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r8, 0), w_r11), + "4F8B5C0080", + "movq -128(%r8,%r8,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r8, 1), w_r11), + "4F8B5C4780", + "movq -128(%r15,%r8,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rax, r15, 1), w_r11), + "4E8B5C7880", + "movq -128(%rax,%r15,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, rdi, r15, 0), w_r11), + "4E8B5C3F80", + "movq -128(%rdi,%r15,1), %r11", + )); + insns.push(( + 
Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r8, r15, 3), w_r11), + "4F8B5CF880", + "movq -128(%r8,%r15,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(-128i32 as u32, r15, r15, 2), w_r11), + "4F8B5CBF80", + "movq -128(%r15,%r15,4), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset large positive simm32 + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rax, 0), w_r11), + "4C8B9C00BE25664F", + "movq 1332094398(%rax,%rax,1), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rax, 1), w_r11), + "4C8B9C47BE25664F", + "movq 1332094398(%rdi,%rax,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rax, 2), w_r11), + "4D8B9C80BE25664F", + "movq 1332094398(%r8,%rax,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rax, 3), w_r11), + "4D8B9CC7BE25664F", + "movq 1332094398(%r15,%rax,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rax, rdi, 3), w_r11), + "4C8B9CF8BE25664F", + "movq 1332094398(%rax,%rdi,8), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, rdi, rdi, 2), w_r11), + "4C8B9CBFBE25664F", + "movq 1332094398(%rdi,%rdi,4), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r8, rdi, 1), w_r11), + "4D8B9C78BE25664F", + "movq 1332094398(%r8,%rdi,2), %r11", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(0x4f6625be, r15, rdi, 0), w_r11), + "4D8B9C3FBE25664F", + "movq 1332094398(%r15,%rdi,1), %r11", + )); + + // ======================================================== + // Addr_IRRS, offset large negative simm32 + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r8, 2), + w_r11, + ), + "4E8B9C8070E9B2D9", + "movq -642586256(%rax,%r8,4), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r8, 3), + w_r11, + ), + "4E8B9CC770E9B2D9", + "movq -642586256(%rdi,%r8,8), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r8, 0), + w_r11, + ), + "4F8B9C0070E9B2D9", + "movq -642586256(%r8,%r8,1), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r8, 1), + w_r11, + ), + "4F8B9C4770E9B2D9", + "movq -642586256(%r15,%r8,2), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rax, r15, 1), + w_r11, + ), + "4E8B9C7870E9B2D9", + "movq -642586256(%rax,%r15,2), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, rdi, r15, 0), + w_r11, + ), + "4E8B9C3F70E9B2D9", + "movq -642586256(%rdi,%r15,1), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r8, r15, 3), + w_r11, + ), + "4F8B9CF870E9B2D9", + "movq -642586256(%r8,%r15,8), %r11", + )); + insns.push(( + Inst::mov64_m_r( + Amode::imm_reg_reg_shift(-0x264d1690i32 as u32, r15, r15, 2), + w_r11, + ), + "4F8B9CBF70E9B2D9", + "movq -642586256(%r15,%r15,4), %r11", + )); + + // End of test cases for Addr + // ======================================================== + + // ======================================================== + // General tests for each insn. Don't forget to follow the + // guidelines commented just prior to `fn x64_emit`. 
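+ // Each case below is a triple of (instruction, expected encoding as a hex string,
+ // expected pretty-printed assembly).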
+ // + // Alu_RMI_R + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::reg(r15), w_rdx), + "4C01FA", + "addq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_r8), + "4101C8", + "addl %ecx, %r8d", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::reg(rcx), w_rsi), + "01CE", + "addl %ecx, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rdx, + ), + "48035763", + "addq 99(%rdi), %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_r8, + ), + "44034763", + "addl 99(%rdi), %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rsi, + ), + "037763", + "addl 99(%rdi), %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_rdx, + ), + "4883C281", + "addq $-127, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_rdx, + ), + "4881C27FFFFFFF", + "addq $-129, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rdx), + "4881C2EAF48F04", + "addq $76543210, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_r8, + ), + "4183C081", + "addl $-127, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_r8, + ), + "4181C07FFFFFFF", + "addl $-129, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-76543210i32 as u32), + w_r8, + ), + "4181C0160B70FB", + "addl $-76543210, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-127i32 as u32), + w_rsi, + ), + "83C681", + "addl $-127, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Add, + RegMemImm::imm(-129i32 as u32), + w_rsi, + ), + "81C67FFFFFFF", + "addl $-129, %esi", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Add, RegMemImm::imm(76543210), w_rsi), + "81C6EAF48F04", + "addl $76543210, %esi", + )); + // This is pretty feeble + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Sub, RegMemImm::reg(r15), w_rdx), + "4C29FA", + "subq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::And, RegMemImm::reg(r15), w_rdx), + "4C21FA", + "andq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Or, RegMemImm::reg(r15), w_rdx), + "4C09FA", + "orq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx), + "4C31FA", + "xorq %r15, %rdx", + )); + // Test all mul cases, though + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::reg(r15), w_rdx), + "490FAFD7", + "imulq %r15, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_r8), + "440FAFC1", + "imull %ecx, %r8d", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::reg(rcx), w_rsi), + "0FAFF1", + "imull %ecx, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rdx, + ), + "480FAF5763", + "imulq 99(%rdi), %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_r8, + ), + 
"440FAF4763", + "imull 99(%rdi), %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::mem(Amode::imm_reg(99, rdi)), + w_rsi, + ), + "0FAF7763", + "imull 99(%rdi), %esi", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_rdx, + ), + "486BD281", + "imulq $-127, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + true, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_rdx, + ), + "4869D27FFFFFFF", + "imulq $-129, %rdx", + )); + insns.push(( + Inst::alu_rmi_r(true, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rdx), + "4869D2EAF48F04", + "imulq $76543210, %rdx", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_r8, + ), + "456BC081", + "imull $-127, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_r8, + ), + "4569C07FFFFFFF", + "imull $-129, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-76543210i32 as u32), + w_r8, + ), + "4569C0160B70FB", + "imull $-76543210, %r8d", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-127i32 as u32), + w_rsi, + ), + "6BF681", + "imull $-127, %esi", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Mul, + RegMemImm::imm(-129i32 as u32), + w_rsi, + ), + "69F67FFFFFFF", + "imull $-129, %esi", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Mul, RegMemImm::imm(76543210), w_rsi), + "69F6EAF48F04", + "imull $76543210, %esi", + )); + + // ======================================================== + // UnaryRmR + + insns.push(( + Inst::unary_rm_r(4, UnaryRmROpcode::Bsr, RegMem::reg(rsi), w_rdi), + "0FBDFE", + "bsrl %esi, %edi", + )); + insns.push(( + Inst::unary_rm_r(8, UnaryRmROpcode::Bsr, RegMem::reg(r15), w_rax), + "490FBDC7", + "bsrq %r15, %rax", + )); + + // ======================================================== + // Not + insns.push(( + Inst::not(4, Writable::from_reg(regs::rsi())), + "F7D6", + "notl %esi", + )); + insns.push(( + Inst::not(8, Writable::from_reg(regs::r15())), + "49F7D7", + "notq %r15", + )); + insns.push(( + Inst::not(4, Writable::from_reg(regs::r14())), + "41F7D6", + "notl %r14d", + )); + insns.push(( + Inst::not(2, Writable::from_reg(regs::rdi())), + "66F7D7", + "notw %di", + )); + + // ======================================================== + // Neg + insns.push(( + Inst::neg(4, Writable::from_reg(regs::rsi())), + "F7DE", + "negl %esi", + )); + insns.push(( + Inst::neg(8, Writable::from_reg(regs::r15())), + "49F7DF", + "negq %r15", + )); + insns.push(( + Inst::neg(4, Writable::from_reg(regs::r14())), + "41F7DE", + "negl %r14d", + )); + insns.push(( + Inst::neg(2, Writable::from_reg(regs::rdi())), + "66F7DF", + "negw %di", + )); + + // ======================================================== + // Div + insns.push(( + Inst::div(4, true /*signed*/, RegMem::reg(regs::rsi())), + "F7FE", + "idiv %esi", + )); + insns.push(( + Inst::div(8, true /*signed*/, RegMem::reg(regs::r15())), + "49F7FF", + "idiv %r15", + )); + insns.push(( + Inst::div(4, false /*signed*/, RegMem::reg(regs::r14())), + "41F7F6", + "div %r14d", + )); + insns.push(( + Inst::div(8, false /*signed*/, RegMem::reg(regs::rdi())), + "48F7F7", + "div %rdi", + )); + + // ======================================================== + // MulHi + insns.push(( + Inst::mul_hi(4, true /*signed*/, RegMem::reg(regs::rsi())), + "F7EE", + "imul %esi", + )); + 
insns.push(( + Inst::mul_hi(8, true /*signed*/, RegMem::reg(regs::r15())), + "49F7EF", + "imul %r15", + )); + insns.push(( + Inst::mul_hi(4, false /*signed*/, RegMem::reg(regs::r14())), + "41F7E6", + "mul %r14d", + )); + insns.push(( + Inst::mul_hi(8, false /*signed*/, RegMem::reg(regs::rdi())), + "48F7E7", + "mul %rdi", + )); + + // ======================================================== + // cbw + insns.push((Inst::sign_extend_data(1), "6698", "cbw")); + + // ======================================================== + // cdq family: SignExtendRaxRdx + insns.push((Inst::sign_extend_data(2), "6699", "cwd")); + insns.push((Inst::sign_extend_data(4), "99", "cdq")); + insns.push((Inst::sign_extend_data(8), "4899", "cqo")); + + // ======================================================== + // Imm_R + // + insns.push(( + Inst::imm(OperandSize::Size32, 1234567, w_r14), + "41BE87D61200", + "movl $1234567, %r14d", + )); + insns.push(( + Inst::imm(OperandSize::Size32, -126i64 as u64, w_r14), + "41BE82FFFFFF", + "movl $-126, %r14d", + )); + insns.push(( + Inst::imm(OperandSize::Size64, 1234567898765, w_r14), + "49BE8D26FB711F010000", + "movabsq $1234567898765, %r14", + )); + insns.push(( + Inst::imm(OperandSize::Size64, -126i64 as u64, w_r14), + "49C7C682FFFFFF", + "movabsq $-126, %r14", + )); + insns.push(( + Inst::imm(OperandSize::Size32, 1234567, w_rcx), + "B987D61200", + "movl $1234567, %ecx", + )); + insns.push(( + Inst::imm(OperandSize::Size32, -126i64 as u64, w_rcx), + "B982FFFFFF", + "movl $-126, %ecx", + )); + insns.push(( + Inst::imm(OperandSize::Size64, 1234567898765, w_rsi), + "48BE8D26FB711F010000", + "movabsq $1234567898765, %rsi", + )); + insns.push(( + Inst::imm(OperandSize::Size64, -126i64 as u64, w_rbx), + "48C7C382FFFFFF", + "movabsq $-126, %rbx", + )); + + // ======================================================== + // Mov_R_R + insns.push(( + Inst::mov_r_r(false, rbx, w_rsi), + "89DE", + "movl %ebx, %esi", + )); + insns.push(( + Inst::mov_r_r(false, rbx, w_r9), + "4189D9", + "movl %ebx, %r9d", + )); + insns.push(( + Inst::mov_r_r(false, r11, w_rsi), + "4489DE", + "movl %r11d, %esi", + )); + insns.push(( + Inst::mov_r_r(false, r12, w_r9), + "4589E1", + "movl %r12d, %r9d", + )); + insns.push(( + Inst::mov_r_r(true, rbx, w_rsi), + "4889DE", + "movq %rbx, %rsi", + )); + insns.push(( + Inst::mov_r_r(true, rbx, w_r9), + "4989D9", + "movq %rbx, %r9", + )); + insns.push(( + Inst::mov_r_r(true, r11, w_rsi), + "4C89DE", + "movq %r11, %rsi", + )); + insns.push(( + Inst::mov_r_r(true, r12, w_r9), + "4D89E1", + "movq %r12, %r9", + )); + + // ======================================================== + // MovZX_RM_R + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi), + "400FB6FF", + "movzbl %dil, %edi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(rax), w_rsi), + "0FB6F0", + "movzbl %al, %esi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BL, RegMem::reg(r15), w_rsi), + "410FB6F7", + "movzbl %r15b, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FB671F9", + "movzbl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FB658F9", + "movzbl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FB64AF9", + "movzbl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BL, + 
RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FB653F9", + "movzbl -7(%r11), %edx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(rax), w_rsi), + "480FB6F0", + "movzbq %al, %rsi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::BQ, RegMem::reg(r10), w_rsi), + "490FB6F2", + "movzbq %r10b, %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FB671F9", + "movzbq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FB658F9", + "movzbq -7(%r8), %rbx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FB64AF9", + "movzbq -7(%r10), %r9", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FB653F9", + "movzbq -7(%r11), %rdx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi), + "0FB7F1", + "movzwl %cx, %esi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WL, RegMem::reg(r10), w_rsi), + "410FB7F2", + "movzwl %r10w, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FB771F9", + "movzwl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FB758F9", + "movzwl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FB74AF9", + "movzwl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FB753F9", + "movzwl -7(%r11), %edx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi), + "480FB7F1", + "movzwq %cx, %rsi", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::WQ, RegMem::reg(r11), w_rsi), + "490FB7F3", + "movzwq %r11w, %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FB771F9", + "movzwq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FB758F9", + "movzwq -7(%r8), %rbx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FB74AF9", + "movzwq -7(%r10), %r9", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FB753F9", + "movzwq -7(%r11), %rdx", + )); + insns.push(( + Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi), + "8BF1", + "movl %ecx, %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "8B71F9", + "movl -7(%rcx), %esi", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "418B58F9", + "movl -7(%r8), %ebx", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "458B4AF9", + "movl -7(%r10), %r9d", + )); + insns.push(( + Inst::movzx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "418B53F9", + "movl -7(%r11), %edx", + )); + + // ======================================================== + 
// Mov64_M_R + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_rcx), + "488B8C18B3000000", + "movq 179(%rax,%rbx,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, rbx, 0), w_r8), + "4C8B8418B3000000", + "movq 179(%rax,%rbx,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_rcx), + "4A8B8C08B3000000", + "movq 179(%rax,%r9,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, rax, r9, 0), w_r8), + "4E8B8408B3000000", + "movq 179(%rax,%r9,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_rcx), + "498B8C1AB3000000", + "movq 179(%r10,%rbx,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, rbx, 0), w_r8), + "4D8B841AB3000000", + "movq 179(%r10,%rbx,1), %r8", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_rcx), + "4B8B8C0AB3000000", + "movq 179(%r10,%r9,1), %rcx", + )); + insns.push(( + Inst::mov64_m_r(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8), + "4F8B840AB3000000", + "movq 179(%r10,%r9,1), %r8", + )); + + // ======================================================== + // LoadEffectiveAddress + insns.push(( + Inst::lea(Amode::imm_reg(42, r10), w_r8), + "4D8D422A", + "lea 42(%r10), %r8", + )); + insns.push(( + Inst::lea(Amode::imm_reg(42, r10), w_r15), + "4D8D7A2A", + "lea 42(%r10), %r15", + )); + insns.push(( + Inst::lea(Amode::imm_reg_reg_shift(179, r10, r9, 0), w_r8), + "4F8D840AB3000000", + "lea 179(%r10,%r9,1), %r8", + )); + insns.push(( + Inst::lea(Amode::rip_relative(MachLabel::from_block(0)), w_rdi), + "488D3D00000000", + "lea label0(%rip), %rdi", + )); + + // ======================================================== + // MovSX_RM_R + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rdi), w_rdi), + "400FBEFF", + "movsbl %dil, %edi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rcx), w_rsi), + "0FBEF1", + "movsbl %cl, %esi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(r14), w_rsi), + "410FBEF6", + "movsbl %r14b, %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FBE71F9", + "movsbl -7(%rcx), %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FBE58F9", + "movsbl -7(%r8), %ebx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FBE4AF9", + "movsbl -7(%r10), %r9d", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FBE53F9", + "movsbl -7(%r11), %edx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(rcx), w_rsi), + "480FBEF1", + "movsbq %cl, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::BQ, RegMem::reg(r15), w_rsi), + "490FBEF7", + "movsbq %r15b, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FBE71F9", + "movsbq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FBE58F9", + "movsbq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FBE4AF9", + "movsbq -7(%r10), %r9", + )); + insns.push(( + 
Inst::movsx_rm_r( + ExtMode::BQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FBE53F9", + "movsbq -7(%r11), %rdx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(rcx), w_rsi), + "0FBFF1", + "movswl %cx, %esi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WL, RegMem::reg(r14), w_rsi), + "410FBFF6", + "movswl %r14w, %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "0FBF71F9", + "movswl -7(%rcx), %esi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "410FBF58F9", + "movswl -7(%r8), %ebx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "450FBF4AF9", + "movswl -7(%r10), %r9d", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WL, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "410FBF53F9", + "movswl -7(%r11), %edx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(rcx), w_rsi), + "480FBFF1", + "movswq %cx, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::WQ, RegMem::reg(r13), w_rsi), + "490FBFF5", + "movswq %r13w, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "480FBF71F9", + "movswq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "490FBF58F9", + "movswq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D0FBF4AF9", + "movswq -7(%r10), %r9", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::WQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "490FBF53F9", + "movswq -7(%r11), %rdx", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(rcx), w_rsi), + "4863F1", + "movslq %ecx, %rsi", + )); + insns.push(( + Inst::movsx_rm_r(ExtMode::LQ, RegMem::reg(r15), w_rsi), + "4963F7", + "movslq %r15d, %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, rcx)), + w_rsi, + ), + "486371F9", + "movslq -7(%rcx), %rsi", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r8)), + w_rbx, + ), + "496358F9", + "movslq -7(%r8), %rbx", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r10)), + w_r9, + ), + "4D634AF9", + "movslq -7(%r10), %r9", + )); + insns.push(( + Inst::movsx_rm_r( + ExtMode::LQ, + RegMem::mem(Amode::imm_reg(-7i32 as u32, r11)), + w_rdx, + ), + "496353F9", + "movslq -7(%r11), %rdx", + )); + + // ======================================================== + // Mov_R_M. Byte stores are tricky. Check everything carefully. 
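+ // (They are tricky because storing %spl, %bpl, %sil or %dil requires a REX prefix:
+ // without one, the same encodings would select %ah, %ch, %dh and %bh instead.)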
+ insns.push(( + Inst::mov_r_m(8, rax, Amode::imm_reg(99, rdi)), + "48894763", + "movq %rax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(8, rbx, Amode::imm_reg(99, r8)), + "49895863", + "movq %rbx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(8, rcx, Amode::imm_reg(99, rsi)), + "48894E63", + "movq %rcx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(8, rdx, Amode::imm_reg(99, r9)), + "49895163", + "movq %rdx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(8, rsi, Amode::imm_reg(99, rax)), + "48897063", + "movq %rsi, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(8, rdi, Amode::imm_reg(99, r15)), + "49897F63", + "movq %rdi, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(8, rsp, Amode::imm_reg(99, rcx)), + "48896163", + "movq %rsp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(8, rbp, Amode::imm_reg(99, r14)), + "49896E63", + "movq %rbp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(8, r8, Amode::imm_reg(99, rdi)), + "4C894763", + "movq %r8, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(8, r9, Amode::imm_reg(99, r8)), + "4D894863", + "movq %r9, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(8, r10, Amode::imm_reg(99, rsi)), + "4C895663", + "movq %r10, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(8, r11, Amode::imm_reg(99, r9)), + "4D895963", + "movq %r11, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(8, r12, Amode::imm_reg(99, rax)), + "4C896063", + "movq %r12, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(8, r13, Amode::imm_reg(99, r15)), + "4D896F63", + "movq %r13, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(8, r14, Amode::imm_reg(99, rcx)), + "4C897163", + "movq %r14, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(8, r15, Amode::imm_reg(99, r14)), + "4D897E63", + "movq %r15, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(4, rax, Amode::imm_reg(99, rdi)), + "894763", + "movl %eax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(4, rbx, Amode::imm_reg(99, r8)), + "41895863", + "movl %ebx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(4, rcx, Amode::imm_reg(99, rsi)), + "894E63", + "movl %ecx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(4, rdx, Amode::imm_reg(99, r9)), + "41895163", + "movl %edx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(4, rsi, Amode::imm_reg(99, rax)), + "897063", + "movl %esi, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(4, rdi, Amode::imm_reg(99, r15)), + "41897F63", + "movl %edi, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(4, rsp, Amode::imm_reg(99, rcx)), + "896163", + "movl %esp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(4, rbp, Amode::imm_reg(99, r14)), + "41896E63", + "movl %ebp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(4, r8, Amode::imm_reg(99, rdi)), + "44894763", + "movl %r8d, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(4, r9, Amode::imm_reg(99, r8)), + "45894863", + "movl %r9d, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(4, r10, Amode::imm_reg(99, rsi)), + "44895663", + "movl %r10d, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(4, r11, Amode::imm_reg(99, r9)), + "45895963", + "movl %r11d, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(4, r12, Amode::imm_reg(99, rax)), + "44896063", + "movl %r12d, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(4, r13, Amode::imm_reg(99, r15)), + "45896F63", + "movl %r13d, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(4, r14, Amode::imm_reg(99, rcx)), + "44897163", + "movl %r14d, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(4, r15, Amode::imm_reg(99, r14)), + "45897E63", + "movl %r15d, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(2, rax, 
Amode::imm_reg(99, rdi)), + "66894763", + "movw %ax, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(2, rbx, Amode::imm_reg(99, r8)), + "6641895863", + "movw %bx, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(2, rcx, Amode::imm_reg(99, rsi)), + "66894E63", + "movw %cx, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(2, rdx, Amode::imm_reg(99, r9)), + "6641895163", + "movw %dx, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(2, rsi, Amode::imm_reg(99, rax)), + "66897063", + "movw %si, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(2, rdi, Amode::imm_reg(99, r15)), + "6641897F63", + "movw %di, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(2, rsp, Amode::imm_reg(99, rcx)), + "66896163", + "movw %sp, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(2, rbp, Amode::imm_reg(99, r14)), + "6641896E63", + "movw %bp, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(2, r8, Amode::imm_reg(99, rdi)), + "6644894763", + "movw %r8w, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(2, r9, Amode::imm_reg(99, r8)), + "6645894863", + "movw %r9w, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(2, r10, Amode::imm_reg(99, rsi)), + "6644895663", + "movw %r10w, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(2, r11, Amode::imm_reg(99, r9)), + "6645895963", + "movw %r11w, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(2, r12, Amode::imm_reg(99, rax)), + "6644896063", + "movw %r12w, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(2, r13, Amode::imm_reg(99, r15)), + "6645896F63", + "movw %r13w, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(2, r14, Amode::imm_reg(99, rcx)), + "6644897163", + "movw %r14w, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(2, r15, Amode::imm_reg(99, r14)), + "6645897E63", + "movw %r15w, 99(%r14)", + )); + // + insns.push(( + Inst::mov_r_m(1, rax, Amode::imm_reg(99, rdi)), + "884763", + "movb %al, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(1, rbx, Amode::imm_reg(99, r8)), + "41885863", + "movb %bl, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(1, rcx, Amode::imm_reg(99, rsi)), + "884E63", + "movb %cl, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(1, rdx, Amode::imm_reg(99, r9)), + "41885163", + "movb %dl, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(1, rsi, Amode::imm_reg(99, rax)), + "40887063", + "movb %sil, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(1, rdi, Amode::imm_reg(99, r15)), + "41887F63", + "movb %dil, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(1, rsp, Amode::imm_reg(99, rcx)), + "40886163", + "movb %spl, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(1, rbp, Amode::imm_reg(99, r14)), + "41886E63", + "movb %bpl, 99(%r14)", + )); + insns.push(( + Inst::mov_r_m(1, r8, Amode::imm_reg(99, rdi)), + "44884763", + "movb %r8b, 99(%rdi)", + )); + insns.push(( + Inst::mov_r_m(1, r9, Amode::imm_reg(99, r8)), + "45884863", + "movb %r9b, 99(%r8)", + )); + insns.push(( + Inst::mov_r_m(1, r10, Amode::imm_reg(99, rsi)), + "44885663", + "movb %r10b, 99(%rsi)", + )); + insns.push(( + Inst::mov_r_m(1, r11, Amode::imm_reg(99, r9)), + "45885963", + "movb %r11b, 99(%r9)", + )); + insns.push(( + Inst::mov_r_m(1, r12, Amode::imm_reg(99, rax)), + "44886063", + "movb %r12b, 99(%rax)", + )); + insns.push(( + Inst::mov_r_m(1, r13, Amode::imm_reg(99, r15)), + "45886F63", + "movb %r13b, 99(%r15)", + )); + insns.push(( + Inst::mov_r_m(1, r14, Amode::imm_reg(99, rcx)), + "44887163", + "movb %r14b, 99(%rcx)", + )); + insns.push(( + Inst::mov_r_m(1, r15, Amode::imm_reg(99, r14)), + "45887E63", + "movb %r15b, 99(%r14)", + )); + + // ======================================================== + // 
Shift_R + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_rdi), + "D3E7", + "shll %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, None, w_r12), + "41D3E4", + "shll %cl, %r12d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, Some(2), w_r8), + "41C1E002", + "shll $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftLeft, Some(31), w_r13), + "41C1E51F", + "shll $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_r13), + "49D3E5", + "shlq %cl, %r13", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, None, w_rdi), + "48D3E7", + "shlq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), w_r8), + "49C1E002", + "shlq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(3), w_rbx), + "48C1E303", + "shlq $3, %rbx", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftLeft, Some(63), w_r13), + "49C1E53F", + "shlq $63, %r13", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, None, w_rdi), + "D3EF", + "shrl %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(2), w_r8), + "41C1E802", + "shrl $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(31), w_r13), + "41C1ED1F", + "shrl $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, w_rdi), + "48D3EF", + "shrq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(2), w_r8), + "49C1E802", + "shrq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(63), w_r13), + "49C1ED3F", + "shrq $63, %r13", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, None, w_rdi), + "D3FF", + "sarl %cl, %edi", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(2), w_r8), + "41C1F802", + "sarl $2, %r8d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::ShiftRightArithmetic, Some(31), w_r13), + "41C1FD1F", + "sarl $31, %r13d", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, None, w_rdi), + "48D3FF", + "sarq %cl, %rdi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(2), w_r8), + "49C1F802", + "sarq $2, %r8", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::ShiftRightArithmetic, Some(63), w_r13), + "49C1FD3F", + "sarq $63, %r13", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::RotateLeft, None, w_r8), + "49D3C0", + "rolq %cl, %r8", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::RotateLeft, Some(3), w_r9), + "41C1C103", + "roll $3, %r9d", + )); + insns.push(( + Inst::shift_r(4, ShiftKind::RotateRight, None, w_rsi), + "D3CE", + "rorl %cl, %esi", + )); + insns.push(( + Inst::shift_r(8, ShiftKind::RotateRight, Some(5), w_r15), + "49C1CF05", + "rorq $5, %r15", + )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi), + "D2CE", + "rorb %cl, %sil", + )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15), + "41C0CF05", + "rorb $5, %r15b", + )); + insns.push(( + Inst::shift_r(2, ShiftKind::RotateRight, None, w_rsi), + "66D3CE", + "rorw %cl, %si", + )); + insns.push(( + Inst::shift_r(2, ShiftKind::RotateRight, Some(5), w_r15), + "6641C1CF05", + "rorw $5, %r15w", + )); + + // ======================================================== + // CmpRMIR + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::reg(r15), rdx), + "4C39FA", + "cmpq %r15, %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, 
RegMemImm::reg(rcx), r8), + "4939C8", + "cmpq %rcx, %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::reg(rcx), rsi), + "4839CE", + "cmpq %rcx, %rsi", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "483B5763", + "cmpq 99(%rdi), %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "4C3B4763", + "cmpq 99(%rdi), %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "483B7763", + "cmpq 99(%rdi), %rsi", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rdx), + "4881FAEAF48F04", + "cmpq $76543210, %rdx", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(-76543210i32 as u32), r8), + "4981F8160B70FB", + "cmpq $-76543210, %r8", + )); + insns.push(( + Inst::cmp_rmi_r(8, RegMemImm::imm(76543210), rsi), + "4881FEEAF48F04", + "cmpq $76543210, %rsi", + )); + // + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(r15), rdx), + "4439FA", + "cmpl %r15d, %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), r8), + "4139C8", + "cmpl %ecx, %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::reg(rcx), rsi), + "39CE", + "cmpl %ecx, %esi", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "3B5763", + "cmpl 99(%rdi), %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "443B4763", + "cmpl 99(%rdi), %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "3B7763", + "cmpl 99(%rdi), %esi", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rdx), + "81FAEAF48F04", + "cmpl $76543210, %edx", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(-76543210i32 as u32), r8), + "4181F8160B70FB", + "cmpl $-76543210, %r8d", + )); + insns.push(( + Inst::cmp_rmi_r(4, RegMemImm::imm(76543210), rsi), + "81FEEAF48F04", + "cmpl $76543210, %esi", + )); + // + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(r15), rdx), + "664439FA", + "cmpw %r15w, %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), r8), + "664139C8", + "cmpw %cx, %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::reg(rcx), rsi), + "6639CE", + "cmpw %cx, %si", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "663B5763", + "cmpw 99(%rdi), %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "66443B4763", + "cmpw 99(%rdi), %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "663B7763", + "cmpw 99(%rdi), %si", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(23210), rdx), + "6681FAAA5A", + "cmpw $23210, %dx", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(-7654i32 as u32), r8), + "664181F81AE2", + "cmpw $-7654, %r8w", + )); + insns.push(( + Inst::cmp_rmi_r(2, RegMemImm::imm(7654), rsi), + "6681FEE61D", + "cmpw $7654, %si", + )); + // + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r15), rdx), + "4438FA", + "cmpb %r15b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r8), + "4138C8", + "cmpb %cl, %r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi), + "4038CE", + "cmpb %cl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rdx), + "3A5763", + "cmpb 99(%rdi), %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), r8), + "443A4763", + "cmpb 99(%rdi), 
%r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::mem(Amode::imm_reg(99, rdi)), rsi), + "403A7763", + "cmpb 99(%rdi), %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(70), rdx), + "80FA46", + "cmpb $70, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(-76i32 as u32), r8), + "4180F8B4", + "cmpb $-76, %r8b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::imm(76), rsi), + "4080FE4C", + "cmpb $76, %sil", + )); + // Extra byte-cases (paranoia!) for cmp_rmi_r for first operand = R + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rax), rbx), + "38C3", + "cmpb %al, %bl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbx), rax), + "38D8", + "cmpb %bl, %al", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rdx), + "38CA", + "cmpb %cl, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), rsi), + "4038CE", + "cmpb %cl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r10), + "4138CA", + "cmpb %cl, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rcx), r14), + "4138CE", + "cmpb %cl, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rdx), + "4038EA", + "cmpb %bpl, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), rsi), + "4038EE", + "cmpb %bpl, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r10), + "4138EA", + "cmpb %bpl, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(rbp), r14), + "4138EE", + "cmpb %bpl, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rdx), + "4438CA", + "cmpb %r9b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), rsi), + "4438CE", + "cmpb %r9b, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r10), + "4538CA", + "cmpb %r9b, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r9), r14), + "4538CE", + "cmpb %r9b, %r14b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rdx), + "4438EA", + "cmpb %r13b, %dl", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), rsi), + "4438EE", + "cmpb %r13b, %sil", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r10), + "4538EA", + "cmpb %r13b, %r10b", + )); + insns.push(( + Inst::cmp_rmi_r(1, RegMemImm::reg(r13), r14), + "4538EE", + "cmpb %r13b, %r14b", + )); + + // ======================================================== + // SetCC + insns.push((Inst::setcc(CC::O, w_rsi), "400F90C6", "seto %sil")); + insns.push((Inst::setcc(CC::NLE, w_rsi), "400F9FC6", "setnle %sil")); + insns.push((Inst::setcc(CC::Z, w_r14), "410F94C6", "setz %r14b")); + insns.push((Inst::setcc(CC::LE, w_r14), "410F9EC6", "setle %r14b")); + insns.push((Inst::setcc(CC::P, w_r9), "410F9AC1", "setp %r9b")); + insns.push((Inst::setcc(CC::NP, w_r8), "410F9BC0", "setnp %r8b")); + // ======================================================== + // Cmove + insns.push(( + Inst::cmove(2, CC::O, RegMem::reg(rdi), w_rsi), + "660F40F7", + "cmovow %di, %si", + )); + insns.push(( + Inst::cmove( + 2, + CC::NO, + RegMem::mem(Amode::imm_reg_reg_shift(37, rdi, rsi, 2)), + w_r15, + ), + "66440F417CB725", + "cmovnow 37(%rdi,%rsi,4), %r15w", + )); + insns.push(( + Inst::cmove(4, CC::LE, RegMem::reg(rdi), w_rsi), + "0F4EF7", + "cmovlel %edi, %esi", + )); + insns.push(( + Inst::cmove(4, CC::NLE, RegMem::mem(Amode::imm_reg(0, r15)), w_rsi), + "410F4F37", + "cmovnlel 0(%r15), %esi", + )); + insns.push(( + Inst::cmove(8, CC::Z, RegMem::reg(rdi), w_r14), + "4C0F44F7", + "cmovzq %rdi, %r14", + )); + 
insns.push(( + Inst::cmove(8, CC::NZ, RegMem::mem(Amode::imm_reg(13, rdi)), w_r14), + "4C0F45770D", + "cmovnzq 13(%rdi), %r14", + )); + + // ======================================================== + // Push64 + insns.push((Inst::push64(RegMemImm::reg(rdi)), "57", "pushq %rdi")); + insns.push((Inst::push64(RegMemImm::reg(r8)), "4150", "pushq %r8")); + insns.push(( + Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FFB4CE41010000", + "pushq 321(%rsi,%rcx,8)", + )); + insns.push(( + Inst::push64(RegMemImm::mem(Amode::imm_reg_reg_shift(321, r9, rbx, 2))), + "41FFB49941010000", + "pushq 321(%r9,%rbx,4)", + )); + insns.push((Inst::push64(RegMemImm::imm(0)), "6A00", "pushq $0")); + insns.push((Inst::push64(RegMemImm::imm(127)), "6A7F", "pushq $127")); + insns.push(( + Inst::push64(RegMemImm::imm(128)), + "6880000000", + "pushq $128", + )); + insns.push(( + Inst::push64(RegMemImm::imm(0x31415927)), + "6827594131", + "pushq $826366247", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-128i32 as u32)), + "6A80", + "pushq $-128", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-129i32 as u32)), + "687FFFFFFF", + "pushq $-129", + )); + insns.push(( + Inst::push64(RegMemImm::imm(-0x75c4e8a1i32 as u32)), + "685F173B8A", + "pushq $-1975838881", + )); + + // ======================================================== + // Pop64 + insns.push((Inst::pop64(w_rax), "58", "popq %rax")); + insns.push((Inst::pop64(w_rdi), "5F", "popq %rdi")); + insns.push((Inst::pop64(w_r8), "4158", "popq %r8")); + insns.push((Inst::pop64(w_r15), "415F", "popq %r15")); + + // ======================================================== + // CallKnown + insns.push(( + Inst::call_known( + ExternalName::User { + namespace: 0, + index: 0, + }, + Vec::new(), + Vec::new(), + Opcode::Call, + ), + "E800000000", + "call User { namespace: 0, index: 0 }", + )); + + // ======================================================== + // CallUnknown + fn call_unknown(rm: RegMem) -> Inst { + Inst::call_unknown(rm, Vec::new(), Vec::new(), Opcode::CallIndirect) + } + + insns.push((call_unknown(RegMem::reg(rbp)), "FFD5", "call *%rbp")); + insns.push((call_unknown(RegMem::reg(r11)), "41FFD3", "call *%r11")); + insns.push(( + call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FF94CE41010000", + "call *321(%rsi,%rcx,8)", + )); + insns.push(( + call_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))), + "41FF949241010000", + "call *321(%r10,%rdx,4)", + )); + + // ======================================================== + // Ret + insns.push((Inst::ret(), "C3", "ret")); + + // ======================================================== + // JmpKnown skipped for now + + // ======================================================== + // JmpCondSymm isn't a real instruction + + // ======================================================== + // JmpCond skipped for now + + // ======================================================== + // JmpCondCompound isn't a real instruction + + // ======================================================== + // JmpUnknown + insns.push((Inst::jmp_unknown(RegMem::reg(rbp)), "FFE5", "jmp *%rbp")); + insns.push(( + Inst::jmp_unknown(RegMem::reg(r11)), + "41FFE3", + "jmp *%r11", + )); + insns.push(( + Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, rsi, rcx, 3))), + "FFA4CE41010000", + "jmp *321(%rsi,%rcx,8)", + )); + insns.push(( + Inst::jmp_unknown(RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rdx, 2))), + "41FFA49241010000", + "jmp *321(%r10,%rdx,4)", + )); + + 
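// A note on the Push64 immediate cases above: immediates in the range -128..=127 use the
// sign-extended 8-bit form (opcode 0x6A), anything outside that range uses the 32-bit form
// (0x68), and in both cases the value is sign-extended to 64 bits before being pushed.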
// ======================================================== + // XMM_CMP_RM_R + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm1), xmm2), + "0F2ED1", + "ucomiss %xmm1, %xmm2", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomiss, RegMem::reg(xmm0), xmm9), + "440F2EC8", + "ucomiss %xmm0, %xmm9", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm13), xmm4), + "66410F2EE5", + "ucomisd %xmm13, %xmm4", + )); + + insns.push(( + Inst::xmm_cmp_rm_r(SseOpcode::Ucomisd, RegMem::reg(xmm11), xmm12), + "66450F2EE3", + "ucomisd %xmm11, %xmm12", + )); + + // ======================================================== + // XMM_RM_R: float binary ops + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0), + "F30F58C1", + "addss %xmm1, %xmm0", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13), + "F3450F58EB", + "addss %xmm11, %xmm13", + )); + insns.push(( + Inst::xmm_rm_r( + SseOpcode::Addss, + RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)), + w_xmm0, + ), + "F3410F5844927B", + "addss 123(%r10,%rdx,4), %xmm0", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4), + "F2410F58E7", + "addsd %xmm15, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1), + "F30F5CC8", + "subss %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1), + "F3410F5CCC", + "subss %xmm12, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r( + SseOpcode::Subss, + RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)), + w_xmm10, + ), + "F3450F5C94C241010000", + "subss 321(%r10,%rax,8), %xmm10", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14), + "F2440F5CF5", + "subsd %xmm5, %xmm14", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4), + "F30F59E5", + "mulss %xmm5, %xmm4", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4), + "F20F59E5", + "mulsd %xmm5, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7), + "F3410F5EF8", + "divss %xmm8, %xmm7", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4), + "F20F5EE5", + "divsd %xmm5, %xmm4", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12), + "440F54E3", + "andps %xmm3, %xmm12", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11), + "440F55DC", + "andnps %xmm4, %xmm11", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15), + "440F56F9", + "orps %xmm1, %xmm15", + )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4), + "0F56E5", + "orps %xmm5, %xmm4", + )); + + // ======================================================== + // XMM_RM_R: Integer Packed + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5), + "66410FFCE9", + "paddb %xmm9, %xmm5", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6), + "660FFDF7", + "paddw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13), + "66450FFEEC", + "paddd %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8), + "66440FD4C1", + "paddq %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5), + "66410FECE9", + "paddsb %xmm9, %xmm5", + 
)); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6), + "660FEDF7", + "paddsw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13), + "66450FDCEC", + "paddusb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8), + "66440FDDC1", + "paddusw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + "66410FE8E9", + "psubsb %xmm9, %xmm5", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + "660FE9F7", + "psubsw %xmm7, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + "66450FD8EC", + "psubusb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + "66440FD9C1", + "psubusw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), + "66450FE0EC", + "pavgb %xmm12, %xmm13", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8), + "66440FE3C1", + "pavgw %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9), + "66440FF8CD", + "psubb %xmm5, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7), + "660FF9FE", + "psubw %xmm6, %xmm7", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12), + "66450FFAE5", + "psubd %xmm13, %xmm12", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1), + "66410FFBC8", + "psubq %xmm8, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + "66410F3840F7", + "pmulld %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + "66410FD5CE", + "pmullw %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + "66450FF4C8", + "pmuludq %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), + "66410F383CF7", + "pmaxsb %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6), + "66410FEEF7", + "pmaxsw %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6), + "66410F383DF7", + "pmaxsd %xmm15, %xmm6", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1), + "66410FDECE", + "pmaxub %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1), + "66410F383ECE", + "pmaxuw %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1), + "66410F383FCE", + "pmaxud %xmm14, %xmm1", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9), + "66450F3838C8", + "pminsb %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9), + "66450FEAC8", + "pminsw %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9), + "66450F3839C8", + "pminsd %xmm8, %xmm9", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2), + "660FDAD3", + "pminub %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2), + "660F383AD3", + "pminuw %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pminud, 
RegMem::reg(xmm3), w_xmm2), + "660F383BD3", + "pminud %xmm3, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2), + "66410FEFD3", + "pxor %xmm11, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2), + "66410F3800D3", + "pshufb %xmm11, %xmm2", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2), + "66410F63D3", + "packsswb %xmm11, %xmm2", + )); + + // ======================================================== + // XMM_RM_R: Integer Conversion + insns.push(( + Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::reg(xmm1), w_xmm8), + "440F5BC1", + "cvtdq2ps %xmm1, %xmm8", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::reg(xmm9), w_xmm8), + "F3450F5BC1", + "cvttps2dq %xmm9, %xmm8", + )); + + // XMM_Mov_R_M: float stores + insns.push(( + Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)), + "F3450F11BC2480000000", + "movss %xmm15, 128(%r12)", + )); + insns.push(( + Inst::xmm_mov_r_m(SseOpcode::Movsd, xmm1, Amode::imm_reg(0, rsi)), + "F20F110E", + "movsd %xmm1, 0(%rsi)", + )); + + // XmmUnary: moves and unary float ops + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movss, RegMem::reg(xmm13), w_xmm2), + "F3410F10D5", + "movss %xmm13, %xmm2", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm0), w_xmm1), + "F20F10C8", + "movsd %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r( + SseOpcode::Movsd, + RegMem::mem(Amode::imm_reg(0, rsi)), + w_xmm2, + ), + "F20F1016", + "movsd 0(%rsi), %xmm2", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movsd, RegMem::reg(xmm14), w_xmm3), + "F2410F10DE", + "movsd %xmm14, %xmm3", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Movaps, RegMem::reg(xmm5), w_xmm14), + "440F28F5", + "movaps %xmm5, %xmm14", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Sqrtss, RegMem::reg(xmm7), w_xmm8), + "F3440F51C7", + "sqrtss %xmm7, %xmm8", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Sqrtsd, RegMem::reg(xmm1), w_xmm2), + "F20F51D1", + "sqrtsd %xmm1, %xmm2", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, RegMem::reg(xmm0), w_xmm1), + "F30F5AC8", + "cvtss2sd %xmm0, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, RegMem::reg(xmm1), w_xmm0), + "F20F5AC1", + "cvtsd2ss %xmm1, %xmm0", + )); + + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsb, RegMem::reg(xmm2), w_xmm1), + "660F381CCA", + "pabsb %xmm2, %xmm1", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsw, RegMem::reg(xmm0), w_xmm0), + "660F381DC0", + "pabsw %xmm0, %xmm0", + )); + insns.push(( + Inst::xmm_unary_rm_r(SseOpcode::Pabsd, RegMem::reg(xmm10), w_xmm11), + "66450F381EDA", + "pabsd %xmm10, %xmm11", + )); + + // Xmm to int conversions, and conversely. 
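// In the conversions below, the OperandSize argument decides whether REX.W is emitted:
// the Size32 forms operate on 32-bit GPRs (movd, cvttss2si %esi, cvtsi2ss %edi, ...), while
// the Size64 forms set REX.W, visible as the 0x48/0x4C byte in the movq, cvttss2si,
// cvttsd2si and cvtsi2sd expectations. The cvtt* opcodes truncate toward zero rather than
// rounding with the current rounding mode.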
+ + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movd, xmm0, w_rsi, OperandSize::Size32), + "660F7EC6", + "movd %xmm0, %esi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movq, xmm2, w_rdi, OperandSize::Size64), + "66480F7ED7", + "movq %xmm2, %rdi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rsi, OperandSize::Size32), + "F30F2CF0", + "cvttss2si %xmm0, %esi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttss2si, xmm0, w_rdi, OperandSize::Size64), + "F3480F2CF8", + "cvttss2si %xmm0, %rdi", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_rax, OperandSize::Size32), + "F20F2CC0", + "cvttsd2si %xmm0, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Cvttsd2si, xmm0, w_r15, OperandSize::Size64), + "F24C0F2CF8", + "cvttsd2si %xmm0, %r15", + )); + + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32), + "66410FD7C2", + "pmovmskb %xmm10, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32), + "0F50C2", + "movmskps %xmm2, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32), + "660F50C8", + "movmskpd %xmm0, %ecx", + )); + + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(rax), + OperandSize::Size32, + w_xmm15, + ), + "66440F6EF8", + "movd %eax, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::mem(Amode::imm_reg(2, r10)), + OperandSize::Size32, + w_xmm9, + ), + "66450F6E4A02", + "movd 2(%r10), %xmm9", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(rsi), + OperandSize::Size32, + w_xmm1, + ), + "660F6ECE", + "movd %esi, %xmm1", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(rdi), + OperandSize::Size64, + w_xmm15, + ), + "664C0F6EFF", + "movq %rdi, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Cvtsi2ss, + RegMem::reg(rdi), + OperandSize::Size32, + w_xmm15, + ), + "F3440F2AFF", + "cvtsi2ss %edi, %xmm15", + )); + insns.push(( + Inst::gpr_to_xmm( + SseOpcode::Cvtsi2sd, + RegMem::reg(rsi), + OperandSize::Size64, + w_xmm1, + ), + "F2480F2ACE", + "cvtsi2sd %rsi, %xmm1", + )); + + // ======================================================== + // XmmRmi + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Psraw, RegMemImm::reg(xmm10), w_xmm1), + "66410FE1CA", + "psraw %xmm10, %xmm1", + )); + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(31), w_xmm1), + "660F72F11F", + "pslld $31, %xmm1", + )); + insns.push(( + Inst::xmm_rmi_reg(SseOpcode::Psrlq, RegMemImm::imm(1), w_xmm3), + "660F73D301", + "psrlq $1, %xmm3", + )); + + // ======================================================== + // XmmRmRImm + insns.push(( + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), + "660FC2CD02", + "cmppd $2, %xmm5, %xmm1", + )); + insns.push(( + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), + "410FC2FF00", + "cmpps $0, %xmm15, %xmm7", + )); + + // ======================================================== + // Pertaining to atomics. + let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into(); + // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing + // for retention of the apparently-redundant rex prefix in the 8-bit case. + let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into(); + + // A general 8-bit case. 
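// (In all of the cmpxchg cases, %rax implicitly holds the expected value on entry and the
// value that was actually in memory on exit; ZF is set iff the exchange happened.)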
+ insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rbx, + dst: am1, + }, + "F0410FB09C9241010000", + "lock cmpxchgb %bl, 321(%r10,%rdx,4)", + )); + // Check redundant rex retention in 8-bit cases. + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rdx, + dst: am2.clone(), + }, + "F00FB094F1C7CFFFFF", + "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rsi, + dst: am2.clone(), + }, + "F0400FB0B4F1C7CFFFFF", + "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r10, + dst: am2.clone(), + }, + "F0440FB094F1C7CFFFFF", + "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r15, + dst: am2.clone(), + }, + "F0440FB0BCF1C7CFFFFF", + "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)", + )); + // 16 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: rsi, + dst: am2.clone(), + }, + "66F00FB1B4F1C7CFFFFF", + "lock cmpxchgw %si, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: r10, + dst: am2.clone(), + }, + "66F0440FB194F1C7CFFFFF", + "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)", + )); + // 32 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: rsi, + dst: am2.clone(), + }, + "F00FB1B4F1C7CFFFFF", + "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: r10, + dst: am2.clone(), + }, + "F0440FB194F1C7CFFFFF", + "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)", + )); + // 64 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: rsi, + dst: am2.clone(), + }, + "F0480FB1B4F1C7CFFFFF", + "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: r10, + dst: am2.clone(), + }, + "F04C0FB194F1C7CFFFFF", + "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)", + )); + + // AtomicRmwSeq + insns.push(( + Inst::AtomicRmwSeq { ty: types::I8, op: inst_common::AtomicRmwOp::Or, }, + "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF", + "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, }, + "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF", + "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, }, + "418B014989C34D89D3F0450FB1190F85EFFFFFFF", + "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, }, + "498B014989C34D01D3F04D0FB1190F85EFFFFFFF", + "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + + // Fence + insns.push(( + Inst::Fence { + kind: FenceKind::MFence, + }, + "0FAEF0", + "mfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::LFence, + }, + "0FAEE8", + "lfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::SFence, + }, + "0FAEF8", + "sfence", + )); + + // ======================================================== + // Misc instructions. 
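// Note that the debug-trap pseudo-instruction Inst::Hlt is emitted as the single-byte
// breakpoint 0xCC (int3), not as the privileged hlt instruction (0xF4); only the printed
// mnemonic reads "hlt".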
+ + insns.push((Inst::Hlt, "CC", "hlt")); + + let trap_code = TrapCode::UnreachableCodeReached; + insns.push((Inst::Ud2 { trap_code }, "0F0B", "ud2 unreachable")); + + // ======================================================== + // Actually run the tests! + let flags = settings::Flags::new(settings::builder()); + + use crate::settings::Configurable; + let mut isa_flag_builder = x64::settings::builder(); + isa_flag_builder.enable("has_ssse3").unwrap(); + isa_flag_builder.enable("has_sse41").unwrap(); + let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); + + let rru = regs::create_reg_universe_systemv(&flags); + let emit_info = EmitInfo::new(flags, isa_flags); + for (insn, expected_encoding, expected_printing) in insns { + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + let mut sink = test_utils::TestCodeSink::new(); + let mut buffer = MachBuffer::new(); + + insn.emit(&mut buffer, &emit_info, &mut Default::default()); + + // Allow one label just after the instruction (so the offset is 0). + let label = buffer.get_label(); + buffer.bind_label(label); + + let buffer = buffer.finish(); + buffer.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding, "{}", expected_printing); + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs new file mode 100644 index 0000000000..1172b22eff --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/mod.rs @@ -0,0 +1,2733 @@ +//! This module defines x86_64-specific machine instruction types. + +use crate::binemit::{CodeOffset, StackMap}; +use crate::ir::{types, ExternalName, Opcode, SourceLoc, TrapCode, Type}; +use crate::isa::x64::settings as x64_settings; +use crate::machinst::*; +use crate::{settings, settings::Flags, CodegenError, CodegenResult}; +use alloc::boxed::Box; +use alloc::vec::Vec; +use regalloc::{ + PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector, + RegUsageMapper, SpillSlot, VirtualReg, Writable, +}; +use smallvec::SmallVec; +use std::fmt; +use std::string::{String, ToString}; + +pub mod args; +mod emit; +#[cfg(test)] +mod emit_tests; +pub mod regs; +pub mod unwind; + +use args::*; +use regs::{create_reg_universe_systemv, show_ireg_sized}; + +//============================================================================= +// Instructions (top level): definition + +// Don't build these directly. Instead use the Inst:: functions to create them. + +/// Instructions. Destinations are on the RIGHT (a la AT&T syntax). +#[derive(Clone)] +pub enum Inst { + /// Nops of various sizes, including zero. + Nop { len: u8 }, + + // ===================================== + // Integer instructions. + /// Integer arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg + AluRmiR { + is_64: bool, + op: AluRmiROpcode, + src: RegMemImm, + dst: Writable<Reg>, + }, + + /// Instructions on GPR that only read src and defines dst (dst is not modified): bsr, etc. 
+ UnaryRmR { + size: u8, // 2, 4 or 8 + op: UnaryRmROpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Bitwise not + Not { + size: u8, // 1, 2, 4 or 8 + src: Writable<Reg>, + }, + + /// Integer negation + Neg { + size: u8, // 1, 2, 4 or 8 + src: Writable<Reg>, + }, + + /// Integer quotient and remainder: (div idiv) $rax $rdx (reg addr) + Div { + size: u8, // 1, 2, 4 or 8 + signed: bool, + divisor: RegMem, + }, + + /// The high bits (RDX) of a (un)signed multiply: RDX:RAX := RAX * rhs. + MulHi { size: u8, signed: bool, rhs: RegMem }, + + /// A synthetic sequence to implement the right inline checks for remainder and division, + /// assuming the dividend is in %rax. + /// Puts the result back into %rax if is_div, %rdx if !is_div, to mimic what the div + /// instruction does. + /// The generated code sequence is described in the emit's function match arm for this + /// instruction. + /// + /// Note: %rdx is marked as modified by this instruction, to avoid an early clobber problem + /// with the temporary and divisor registers. Make sure to zero %rdx right before this + /// instruction, or you might run into regalloc failures where %rdx is live before its first + /// def! + CheckedDivOrRemSeq { + kind: DivOrRemKind, + size: u8, + /// The divisor operand. Note it's marked as modified so that it gets assigned a register + /// different from the temporary. + divisor: Writable<Reg>, + tmp: Option<Writable<Reg>>, + }, + + /// Do a sign-extend based on the sign of the value in rax into rdx: (cwd cdq cqo) + /// or al into ah: (cbw) + SignExtendData { + size: u8, // 1, 2, 4 or 8 + }, + + /// Constant materialization: (imm32 imm64) reg. + /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg32. + Imm { + dst_is_64: bool, + simm64: u64, + dst: Writable<Reg>, + }, + + /// GPR to GPR move: mov (64 32) reg reg. + MovRR { + is_64: bool, + src: Reg, + dst: Writable<Reg>, + }, + + /// Zero-extended loads, except for 64 bits: movz (bl bq wl wq lq) addr reg. + /// Note that the lq variant doesn't really exist since the default zero-extend rule makes it + /// unnecessary. For that case we emit the equivalent "movl AM, reg32". + MovzxRmR { + ext_mode: ExtMode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// A plain 64-bit integer load, since MovZX_RM_R can't represent that. + Mov64MR { + src: SyntheticAmode, + dst: Writable<Reg>, + }, + + /// Loads the memory address of addr into dst. + LoadEffectiveAddress { + addr: SyntheticAmode, + dst: Writable<Reg>, + }, + + /// Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg. + MovsxRmR { + ext_mode: ExtMode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Integer stores: mov (b w l q) reg addr. + MovRM { + size: u8, // 1, 2, 4 or 8. + src: Reg, + dst: SyntheticAmode, + }, + + /// Arithmetic shifts: (shl shr sar) (b w l q) imm reg. + ShiftR { + size: u8, // 1, 2, 4 or 8 + kind: ShiftKind, + /// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl". + num_bits: Option<u8>, + dst: Writable<Reg>, + }, + + /// Arithmetic SIMD shifts. + XmmRmiReg { + opcode: SseOpcode, + src: RegMemImm, + dst: Writable<Reg>, + }, + + /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg. + CmpRmiR { + size: u8, // 1, 2, 4 or 8 + src: RegMemImm, + dst: Reg, + }, + + /// Materializes the requested condition code in the destination reg. + Setcc { cc: CC, dst: Writable<Reg> }, + + /// Integer conditional move. + /// Overwrites the destination register. + Cmove { + /// Possible values are 2, 4 or 8. Checked in the related factory. 
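        /// (x86 has no 8-bit form of `cmov`, which is why 1 is not a valid size here.)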
+ size: u8, + cc: CC, + src: RegMem, + dst: Writable<Reg>, + }, + + // ===================================== + // Stack manipulation. + /// pushq (reg addr imm) + Push64 { src: RegMemImm }, + + /// popq reg + Pop64 { dst: Writable<Reg> }, + + // ===================================== + // Floating-point operations. + /// XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg + XmmRmR { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, + /// etc. + /// + /// This differs from XMM_RM_R in that the dst register of XmmUnaryRmR is not used in the + /// computation of the instruction dst value and so does not have to be a previously valid + /// value. This is characteristic of mov instructions. + XmmUnaryRmR { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + }, + + /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq + XmmMovRM { + op: SseOpcode, + src: Reg, + dst: SyntheticAmode, + }, + + /// XMM (vector) unary op (to move a constant value into an xmm register): movups + XmmLoadConst { + src: VCodeConstant, + dst: Writable<Reg>, + ty: Type, + }, + + /// XMM (scalar) unary op (from xmm to integer reg): movd, movq, cvtts{s,d}2si + XmmToGpr { + op: SseOpcode, + src: Reg, + dst: Writable<Reg>, + dst_size: OperandSize, + }, + + /// XMM (scalar) unary op (from integer to float reg): movd, movq, cvtsi2s{s,d} + GprToXmm { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + src_size: OperandSize, + }, + + /// Converts an unsigned int64 to a float32/float64. + CvtUint64ToFloatSeq { + /// Is the target a 64-bits or 32-bits register? + to_f64: bool, + /// A copy of the source register, fed by lowering. It is marked as modified during + /// register allocation to make sure that the temporary registers differ from the src + /// register, since both registers are live at the same time in the generated code + /// sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr1: Writable<Reg>, + tmp_gpr2: Writable<Reg>, + }, + + /// Converts a scalar xmm to a signed int32/int64. + CvtFloatToSintSeq { + dst_size: OperandSize, + src_size: OperandSize, + is_saturating: bool, + /// A copy of the source register, fed by lowering. It is marked as modified during + /// register allocation to make sure that the temporary xmm register differs from the src + /// register, since both registers are live at the same time in the generated code + /// sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + }, + + /// Converts a scalar xmm to an unsigned int32/int64. + CvtFloatToUintSeq { + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + /// A copy of the source register, fed by lowering, reused as a temporary. It is marked as + /// modified during register allocation to make sure that the temporary xmm register + /// differs from the src register, since both registers are live at the same time in the + /// generated code sequence. + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + }, + + /// A sequence to compute min/max with the proper NaN semantics for xmm registers. + XmmMinMaxSeq { + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable<Reg>, + }, + + /// XMM (scalar) conditional move. + /// Overwrites the destination register if cc is set. + XmmCmove { + /// Whether the cmove is moving either 32 or 64 bits. 
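        /// (There is no XMM form of `cmov`; this is presumably expanded into a short
        /// conditional branch around a 32- or 64-bit scalar move, with `is_64` selecting
        /// the width.)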
+ is_64: bool, + cc: CC, + src: RegMem, + dst: Writable<Reg>, + }, + + /// Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. + XmmCmpRmR { + op: SseOpcode, + src: RegMem, + dst: Reg, + }, + + /// A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm (reg addr) reg + XmmRmRImm { + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + imm: u8, + is64: bool, + }, + + // ===================================== + // Control flow instructions. + /// Direct call: call simm32. + CallKnown { + dest: ExternalName, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + }, + + /// Indirect call: callq (reg mem). + CallUnknown { + dest: RegMem, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + }, + + /// Return. + Ret, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder, + + /// Jump to a known target: jmp simm32. + JmpKnown { dst: MachLabel }, + + /// One-way conditional branch: jcond cond target. + /// + /// This instruction is useful when we have conditional jumps depending on more than two + /// conditions, see for instance the lowering of Brz/brnz with Fcmp inputs. + /// + /// A note of caution: in contexts where the branch target is another block, this has to be the + /// same successor as the one specified in the terminator branch of the current block. + /// Otherwise, this might confuse register allocation by creating new invisible edges. + JmpIf { cc: CC, taken: MachLabel }, + + /// Two-way conditional branch: jcond cond target target. + /// Emitted as a compound sequence; the MachBuffer will shrink it as appropriate. + JmpCond { + cc: CC, + taken: MachLabel, + not_taken: MachLabel, + }, + + /// Jump-table sequence, as one compound instruction (see note in lower.rs for rationale). + /// The generated code sequence is described in the emit's function match arm for this + /// instruction. + /// See comment in lowering about the temporaries signedness. + JmpTableSeq { + idx: Reg, + tmp1: Writable<Reg>, + tmp2: Writable<Reg>, + default_target: MachLabel, + targets: Vec<MachLabel>, + targets_for_term: Vec<MachLabel>, + }, + + /// Indirect jump: jmpq (reg mem). + JmpUnknown { target: RegMem }, + + /// Traps if the condition code is set. + TrapIf { cc: CC, trap_code: TrapCode }, + + /// A debug trap. + Hlt, + + /// An instruction that will always trigger the illegal instruction exception. + Ud2 { trap_code: TrapCode }, + + /// Loads an external symbol in a register, with a relocation: movabsq $name, dst + LoadExtName { + dst: Writable<Reg>, + name: Box<ExternalName>, + offset: i64, + }, + + // ===================================== + // Instructions pertaining to atomic memory accesses. + /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions: + /// + /// `dst` (read) address + /// `src` (read) replacement value + /// %rax (modified) in: expected value, out: value that was actually at `dst` + /// %rflags is written. Do not assume anything about it after the instruction. + /// + /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as + /// they were before. + LockCmpxchg { + ty: Type, // I8, I16, I32 or I64 + src: Reg, + dst: SyntheticAmode, + }, + + /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction. + /// This atomically modifies a value in memory and returns the old value. 
The sequence + /// consists of an initial "normal" load from `dst`, followed by a loop which computes the + /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native + /// instruction `lock cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful. + /// If there is no contention, there will be only one pass through the loop body. The + /// sequence does *not* perform any explicit memory fence instructions + /// (mfence/sfence/lfence). + /// + /// Note that the transaction is atomic in the sense that, as observed by some other thread, + /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense + /// of guaranteeing that no other thread writes to `dst` in between the initial load and the + /// CAS -- but that would cause the CAS to fail unless the other thread's last write before + /// the CAS wrote the same value that was already there. In other words, this + /// implementation suffers (unavoidably) from the A-B-A problem. + /// + /// This instruction sequence has fixed register uses as follows: + /// + /// %r9 (read) address + /// %r10 (read) second operand for `op` + /// %r11 (written) scratch reg; value afterwards has no meaning + /// %rax (written) the old value at %r9 + /// %rflags is written. Do not assume anything about it after the instruction. + AtomicRmwSeq { + ty: Type, // I8, I16, I32 or I64 + op: inst_common::AtomicRmwOp, + }, + + /// A memory fence (mfence, lfence or sfence). + Fence { kind: FenceKind }, + + // ===================================== + // Meta-instructions generating no code. + /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This + /// controls how MemArg::NominalSPOffset args are lowered. + VirtualSPOffsetAdj { offset: i64 }, + + /// Provides a way to tell the register allocator that the upcoming sequence of instructions + /// will overwrite `dst` so it should be considered as a `def`; use this with care. + /// + /// This is useful when we have a sequence of instructions whose register usages are nominally + /// `mod`s, but such that the combination of operations creates a result that is independent of + /// the initial register value. It's thus semantically a `def`, not a `mod`, when all the + /// instructions are taken together, so we want to ensure the register is defined (its + /// live-range starts) prior to the sequence to keep analyses happy. + /// + /// One alternative would be a compound instruction that somehow encapsulates the others and + /// reports its own `def`s/`use`s/`mod`s; this adds complexity (the instruction list is no + /// longer flat) and requires knowledge about semantics and initial-value independence anyway. + XmmUninitializedValue { dst: Writable<Reg> }, +} + +pub(crate) fn low32_will_sign_extend_to_64(x: u64) -> bool { + let xs = x as i64; + xs == ((xs << 32) >> 32) +} + +impl Inst { + fn isa_requirement(&self) -> Option<InstructionSet> { + match self { + // These instructions are part of SSE2, which is a basic requirement in Cranelift, and + // don't have to be checked. + Inst::AluRmiR { .. } + | Inst::AtomicRmwSeq { .. } + | Inst::CallKnown { .. } + | Inst::CallUnknown { .. } + | Inst::CheckedDivOrRemSeq { .. } + | Inst::Cmove { .. } + | Inst::CmpRmiR { .. } + | Inst::CvtFloatToSintSeq { .. } + | Inst::CvtFloatToUintSeq { .. } + | Inst::CvtUint64ToFloatSeq { .. } + | Inst::Div { .. } + | Inst::EpiloguePlaceholder + | Inst::Fence { .. } + | Inst::Hlt + | Inst::Imm { .. } + | Inst::JmpCond { .. } + | Inst::JmpIf { .. } + | Inst::JmpKnown { .. 
} + | Inst::JmpTableSeq { .. } + | Inst::JmpUnknown { .. } + | Inst::LoadEffectiveAddress { .. } + | Inst::LoadExtName { .. } + | Inst::LockCmpxchg { .. } + | Inst::Mov64MR { .. } + | Inst::MovRM { .. } + | Inst::MovRR { .. } + | Inst::MovsxRmR { .. } + | Inst::MovzxRmR { .. } + | Inst::MulHi { .. } + | Inst::Neg { .. } + | Inst::Not { .. } + | Inst::Nop { .. } + | Inst::Pop64 { .. } + | Inst::Push64 { .. } + | Inst::Ret + | Inst::Setcc { .. } + | Inst::ShiftR { .. } + | Inst::SignExtendData { .. } + | Inst::TrapIf { .. } + | Inst::Ud2 { .. } + | Inst::UnaryRmR { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::XmmCmove { .. } + | Inst::XmmCmpRmR { .. } + | Inst::XmmLoadConst { .. } + | Inst::XmmMinMaxSeq { .. } + | Inst::XmmUninitializedValue { .. } => None, + + // These use dynamic SSE opcodes. + Inst::GprToXmm { op, .. } + | Inst::XmmMovRM { op, .. } + | Inst::XmmRmiReg { opcode: op, .. } + | Inst::XmmRmR { op, .. } + | Inst::XmmRmRImm { op, .. } + | Inst::XmmToGpr { op, .. } + | Inst::XmmUnaryRmR { op, .. } => Some(op.available_from()), + } + } +} + +// Handy constructors for Insts. + +impl Inst { + pub(crate) fn nop(len: u8) -> Self { + debug_assert!(len <= 16); + Self::Nop { len } + } + + pub(crate) fn alu_rmi_r( + is_64: bool, + op: AluRmiROpcode, + src: RegMemImm, + dst: Writable<Reg>, + ) -> Self { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Self::AluRmiR { + is_64, + op, + src, + dst, + } + } + + pub(crate) fn unary_rm_r( + size: u8, + op: UnaryRmROpcode, + src: RegMem, + dst: Writable<Reg>, + ) -> Self { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2); + Self::UnaryRmR { size, op, src, dst } + } + + pub(crate) fn not(size: u8, src: Writable<Reg>) -> Inst { + debug_assert_eq!(src.to_reg().get_class(), RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Not { size, src } + } + + pub(crate) fn neg(size: u8, src: Writable<Reg>) -> Inst { + debug_assert_eq!(src.to_reg().get_class(), RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Neg { size, src } + } + + pub(crate) fn div(size: u8, signed: bool, divisor: RegMem) -> Inst { + divisor.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::Div { + size, + signed, + divisor, + } + } + + pub(crate) fn mul_hi(size: u8, signed: bool, rhs: RegMem) -> Inst { + rhs.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::MulHi { size, signed, rhs } + } + + pub(crate) fn checked_div_or_rem_seq( + kind: DivOrRemKind, + size: u8, + divisor: Writable<Reg>, + tmp: Option<Writable<Reg>>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(divisor.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp + .map(|tmp| tmp.to_reg().get_class() == RegClass::I64) + .unwrap_or(true)); + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + tmp, + } + } + + pub(crate) fn sign_extend_data(size: u8) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + Inst::SignExtendData { size } + } + + pub(crate) fn imm(size: OperandSize, simm64: u64, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + // Try to generate a 32-bit immediate when the upper high bits are zeroed (which matches + // the semantics 
of movl). + let dst_is_64 = size == OperandSize::Size64 && simm64 > u32::max_value() as u64; + Inst::Imm { + dst_is_64, + simm64, + dst, + } + } + + pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst { + debug_assert!(src.get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovRR { is_64, src, dst } + } + + // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level) + pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUnaryRmR { op, src, dst } + } + + pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + debug_assert!(ty.is_vector() && ty.bits() == 128); + Inst::XmmLoadConst { src, dst, ty } + } + + /// Convenient helper for unary float operations. + pub(crate) fn xmm_unary_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUnaryRmR { op, src, dst } + } + + pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmRmR { op, src, dst } + } + + pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self { + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUninitializedValue { dst } + } + + pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst { + debug_assert!(src.get_class() == RegClass::V128); + Inst::XmmMovRM { + op, + src, + dst: dst.into(), + } + } + + pub(crate) fn xmm_to_gpr( + op: SseOpcode, + src: Reg, + dst: Writable<Reg>, + dst_size: OperandSize, + ) -> Inst { + debug_assert!(src.get_class() == RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } + } + + pub(crate) fn gpr_to_xmm( + op: SseOpcode, + src: RegMem, + src_size: OperandSize, + dst: Writable<Reg>, + ) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::GprToXmm { + op, + src, + dst, + src_size, + } + } + + pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.get_class() == RegClass::V128); + Inst::XmmCmpRmR { op, src, dst } + } + + pub(crate) fn cvt_u64_to_float_seq( + to_f64: bool, + src: Writable<Reg>, + tmp_gpr1: Writable<Reg>, + tmp_gpr2: Writable<Reg>, + dst: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp_gpr1.to_reg().get_class() == RegClass::I64); + debug_assert!(tmp_gpr2.to_reg().get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::CvtUint64ToFloatSeq { + src, + dst, + tmp_gpr1, + tmp_gpr2, + to_f64, + } + } + + pub(crate) fn cvt_float_to_sint_seq( + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); + 
debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::CvtFloatToSintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } + } + + pub(crate) fn cvt_float_to_uint_seq( + src_size: OperandSize, + dst_size: OperandSize, + is_saturating: bool, + src: Writable<Reg>, + dst: Writable<Reg>, + tmp_gpr: Writable<Reg>, + tmp_xmm: Writable<Reg>, + ) -> Inst { + debug_assert!(src.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::CvtFloatToUintSeq { + src_size, + dst_size, + is_saturating, + src, + dst, + tmp_gpr, + tmp_xmm, + } + } + + pub(crate) fn xmm_min_max_seq( + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable<Reg>, + ) -> Inst { + debug_assert_eq!(lhs.get_class(), RegClass::V128); + debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128); + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } + } + + pub(crate) fn xmm_rm_r_imm( + op: SseOpcode, + src: RegMem, + dst: Writable<Reg>, + imm: u8, + is64: bool, + ) -> Inst { + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64, + } + } + + pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovzxRmR { ext_mode, src, dst } + } + + pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmRmiReg { opcode, src, dst } + } + + pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::MovsxRmR { ext_mode, src, dst } + } + + pub(crate) fn mov64_m_r(src: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Mov64MR { + src: src.into(), + dst, + } + } + + /// A convenience function to be able to use a RegMem as the source of a move. + pub(crate) fn mov64_rm_r(src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::I64); + match src { + RegMem::Reg { reg } => Self::mov_r_r(true, reg, dst), + RegMem::Mem { addr } => Self::mov64_m_r(addr, dst), + } + } + + pub(crate) fn mov_r_m( + size: u8, // 1, 2, 4 or 8 + src: Reg, + dst: impl Into<SyntheticAmode>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(src.get_class() == RegClass::I64); + Inst::MovRM { + size, + src, + dst: dst.into(), + } + } + + pub(crate) fn lea(addr: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::LoadEffectiveAddress { + addr: addr.into(), + dst, + } + } + + pub(crate) fn shift_r( + size: u8, + kind: ShiftKind, + num_bits: Option<u8>, + dst: Writable<Reg>, + ) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(if let Some(num_bits) = num_bits { + num_bits < size * 8 + } else { + true + }); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } + } + + /// Does a comparison of dst - src for operands of size `size`, as stated by the machine + /// instruction semantics. 
Be careful with the order of parameters! + pub(crate) fn cmp_rmi_r( + size: u8, // 1, 2, 4 or 8 + src: RegMemImm, + dst: Reg, + ) -> Inst { + src.assert_regclass_is(RegClass::I64); + debug_assert!(size == 8 || size == 4 || size == 2 || size == 1); + debug_assert!(dst.get_class() == RegClass::I64); + Inst::CmpRmiR { size, src, dst } + } + + pub(crate) fn trap(trap_code: TrapCode) -> Inst { + Inst::Ud2 { + trap_code: trap_code, + } + } + + pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Setcc { cc, dst } + } + + pub(crate) fn cmove(size: u8, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst { + debug_assert!(size == 8 || size == 4 || size == 2); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Cmove { size, cc, src, dst } + } + + pub(crate) fn xmm_cmove(is_64: bool, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } + } + + pub(crate) fn push64(src: RegMemImm) -> Inst { + src.assert_regclass_is(RegClass::I64); + Inst::Push64 { src } + } + + pub(crate) fn pop64(dst: Writable<Reg>) -> Inst { + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + Inst::Pop64 { dst } + } + + pub(crate) fn call_known( + dest: ExternalName, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + ) -> Inst { + Inst::CallKnown { + dest, + uses, + defs, + opcode, + } + } + + pub(crate) fn call_unknown( + dest: RegMem, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: Opcode, + ) -> Inst { + dest.assert_regclass_is(RegClass::I64); + Inst::CallUnknown { + dest, + uses, + defs, + opcode, + } + } + + pub(crate) fn ret() -> Inst { + Inst::Ret + } + + pub(crate) fn epilogue_placeholder() -> Inst { + Inst::EpiloguePlaceholder + } + + pub(crate) fn jmp_known(dst: MachLabel) -> Inst { + Inst::JmpKnown { dst } + } + + pub(crate) fn jmp_if(cc: CC, taken: MachLabel) -> Inst { + Inst::JmpIf { cc, taken } + } + + pub(crate) fn jmp_cond(cc: CC, taken: MachLabel, not_taken: MachLabel) -> Inst { + Inst::JmpCond { + cc, + taken, + not_taken, + } + } + + pub(crate) fn jmp_unknown(target: RegMem) -> Inst { + target.assert_regclass_is(RegClass::I64); + Inst::JmpUnknown { target } + } + + pub(crate) fn trap_if(cc: CC, trap_code: TrapCode) -> Inst { + Inst::TrapIf { cc, trap_code } + } + + /// Choose which instruction to use for loading a register value from memory. For loads smaller + /// than 64 bits, this method expects a way to extend the value (i.e. [ExtKind::SignExtend], + /// [ExtKind::ZeroExtend]); loads with no extension necessary will ignore this. + pub(crate) fn load( + ty: Type, + from_addr: impl Into<SyntheticAmode>, + to_reg: Writable<Reg>, + ext_kind: ExtKind, + ) -> Inst { + let rc = to_reg.to_reg().get_class(); + match rc { + RegClass::I64 => { + let ext_mode = match ty.bytes() { + 1 => Some(ExtMode::BQ), + 2 => Some(ExtMode::WQ), + 4 => Some(ExtMode::LQ), + 8 => None, + _ => unreachable!("the type should never use a scalar load: {}", ty), + }; + if let Some(ext_mode) = ext_mode { + // Values smaller than 64 bits must be extended in some way. 
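+                    // E.g. a one-byte load uses ExtMode::BQ: `movzbq` for ExtKind::ZeroExtend and
+                    // `movsbq` for ExtKind::SignExtend. A four-byte zero-extending load uses
+                    // ExtMode::LQ, which is shown as a plain `movl`, since 32-bit moves already
+                    // zero the upper half of the destination register on x86-64.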
+ match ext_kind { + ExtKind::SignExtend => { + Inst::movsx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg) + } + ExtKind::ZeroExtend => { + Inst::movzx_rm_r(ext_mode, RegMem::mem(from_addr), to_reg) + } + ExtKind::None => panic!( + "expected an extension kind for extension mode: {:?}", + ext_mode + ), + } + } else { + // 64-bit values can be moved directly. + Inst::mov64_m_r(from_addr, to_reg) + } + } + RegClass::V128 => { + let opcode = match ty { + types::F32 => SseOpcode::Movss, + types::F64 => SseOpcode::Movsd, + types::F32X4 => SseOpcode::Movups, + types::F64X2 => SseOpcode::Movupd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ => unimplemented!("unable to load type: {}", ty), + }; + Inst::xmm_unary_rm_r(opcode, RegMem::mem(from_addr), to_reg) + } + _ => panic!("unable to generate load for register class: {:?}", rc), + } + } + + /// Choose which instruction to use for storing a register value to memory. + pub(crate) fn store(ty: Type, from_reg: Reg, to_addr: impl Into<SyntheticAmode>) -> Inst { + let rc = from_reg.get_class(); + match rc { + RegClass::I64 => { + // Always store the full register, to ensure that the high bits are properly set + // when doing a full reload. + Inst::mov_r_m(8 /* bytes */, from_reg, to_addr) + } + RegClass::V128 => { + let opcode = match ty { + types::F32 => SseOpcode::Movss, + types::F64 => SseOpcode::Movsd, + types::F32X4 => SseOpcode::Movups, + types::F64X2 => SseOpcode::Movupd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ => unimplemented!("unable to store type: {}", ty), + }; + Inst::xmm_mov_r_m(opcode, from_reg, to_addr) + } + _ => panic!("unable to generate store for register class: {:?}", rc), + } + } +} + +// Inst helpers. + +impl Inst { + /// In certain cases, instructions of this format can act as a definition of an XMM register, + /// producing a value that is independent of its initial value. + /// + /// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to + /// itself will generate all ones as a result, regardless of its value. From the register + /// allocator's point of view, we should (i) record the first register, which is normally a + /// mod, as a def instead; and (ii) not record the second register as a use, because it is the + /// same as the first register (already handled). + fn produces_const(&self) -> bool { + match self { + Self::AluRmiR { op, src, dst, .. } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub) + } + + Self::XmmRmR { op, src, dst, .. } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == SseOpcode::Xorps + || *op == SseOpcode::Xorpd + || *op == SseOpcode::Pxor + || *op == SseOpcode::Pcmpeqb + || *op == SseOpcode::Pcmpeqw + || *op == SseOpcode::Pcmpeqd + || *op == SseOpcode::Pcmpeqq) + } + + Self::XmmRmRImm { + op, src, dst, imm, .. + } => { + src.to_reg() == Some(dst.to_reg()) + && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps) + && *imm == FcmpImm::Equal.encode() + } + + _ => false, + } + } + + /// Choose which instruction to use for comparing two values for equality. 
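+    /// Integer and boolean vector types map to the `pcmpeq{b,w,d,q}` family; the float vector
+    /// types use `cmpps`/`cmppd` with the "equal" immediate predicate.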
+ pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to), + types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to), + types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to), + types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to), + types::F32X4 => { + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false) + } + types::F64X2 => { + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false) + } + _ => unimplemented!("unimplemented type for Inst::equals: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise AND on two values. + pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to), + _ => unimplemented!("unimplemented type for Inst::and: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise AND NOT on two values. + pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to), + _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise OR on two values. + pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to), + _ => unimplemented!("unimplemented type for Inst::or: {}", ty), + } + } + + /// Choose which instruction to use for computing a bitwise XOR on two values. 
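+    /// The float vector types use `xorps`/`xorpd`; any other 128-bit vector type falls back to
+    /// the integer `pxor`. A rough usage sketch (the `src`/`dst` bindings are illustrative, not
+    /// taken from this file):
+    ///
+    /// ```ignore
+    /// // XOR two I32X4 values: `src` is a RegMem, `dst` a Writable<Reg>; this selects `pxor`.
+    /// let inst = Inst::xor(types::I32X4, src, dst);
+    /// ```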
+ pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst { + match ty { + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to), + _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to), + _ => unimplemented!("unimplemented type for Inst::xor: {}", ty), + } + } +} + +//============================================================================= +// Instructions: printing + +impl PrettyPrint for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + fn ljustify(s: String) -> String { + let w = 7; + if s.len() >= w { + s + } else { + let need = usize::min(w, w - s.len()); + s + &format!("{nil: <width$}", nil = "", width = need) + } + } + + fn ljustify2(s1: String, s2: String) -> String { + ljustify(s1 + &s2) + } + + fn suffix_lq(is_64: bool) -> String { + (if is_64 { "q" } else { "l" }).to_string() + } + + fn size_lq(is_64: bool) -> u8 { + if is_64 { + 8 + } else { + 4 + } + } + + fn suffix_bwlq(size: u8) -> String { + match size { + 1 => "b".to_string(), + 2 => "w".to_string(), + 4 => "l".to_string(), + 8 => "q".to_string(), + _ => panic!("Inst(x64).show.suffixBWLQ: size={}", size), + } + } + + match self { + Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len), + + Inst::AluRmiR { + is_64, + op, + src, + dst, + } => format!( + "{} {}, {}", + ljustify2(op.to_string(), suffix_lq(*is_64)), + src.show_rru_sized(mb_rru, size_lq(*is_64)), + show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)), + ), + + Inst::UnaryRmR { src, dst, op, size } => format!( + "{} {}, {}", + ljustify2(op.to_string(), suffix_bwlq(*size)), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(dst.to_reg(), mb_rru, *size), + ), + + Inst::Not { size, src } => format!( + "{} {}", + ljustify2("not".to_string(), suffix_bwlq(*size)), + show_ireg_sized(src.to_reg(), mb_rru, *size) + ), + + Inst::Neg { size, src } => format!( + "{} {}", + ljustify2("neg".to_string(), suffix_bwlq(*size)), + show_ireg_sized(src.to_reg(), mb_rru, *size) + ), + + Inst::Div { + size, + signed, + divisor, + .. + } => format!( + "{} {}", + ljustify(if *signed { + "idiv".to_string() + } else { + "div".into() + }), + divisor.show_rru_sized(mb_rru, *size) + ), + + Inst::MulHi { + size, signed, rhs, .. + } => format!( + "{} {}", + ljustify(if *signed { + "imul".to_string() + } else { + "mul".to_string() + }), + rhs.show_rru_sized(mb_rru, *size) + ), + + Inst::CheckedDivOrRemSeq { + kind, + size, + divisor, + .. + } => format!( + "{} $rax:$rdx, {}", + match kind { + DivOrRemKind::SignedDiv => "sdiv", + DivOrRemKind::UnsignedDiv => "udiv", + DivOrRemKind::SignedRem => "srem", + DivOrRemKind::UnsignedRem => "urem", + }, + show_ireg_sized(divisor.to_reg(), mb_rru, *size), + ), + + Inst::SignExtendData { size } => match size { + 1 => "cbw", + 2 => "cwd", + 4 => "cdq", + 8 => "cqo", + _ => unreachable!(), + } + .into(), + + Inst::XmmUnaryRmR { op, src, dst, .. } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, op.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmMovRM { op, src, dst, .. } => format!( + "{} {}, {}", + ljustify(op.to_string()), + show_ireg_sized(*src, mb_rru, 8), + dst.show_rru(mb_rru), + ), + + Inst::XmmRmR { op, src, dst, .. 
} => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmMinMaxSeq { + lhs, + rhs_dst, + is_min, + size, + } => format!( + "{} {}, {}", + ljustify2( + if *is_min { + "xmm min seq ".to_string() + } else { + "xmm max seq ".to_string() + }, + match size { + OperandSize::Size32 => "f32", + OperandSize::Size64 => "f64", + } + .into() + ), + show_ireg_sized(*lhs, mb_rru, 8), + show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8), + ), + + Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!( + "{} ${}, {}, {}", + ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })), + imm, + src.show_rru(mb_rru), + dst.show_rru(mb_rru), + ), + + Inst::XmmUninitializedValue { dst } => format!( + "{} {}", + ljustify("uninit".into()), + dst.show_rru(mb_rru), + ), + + Inst::XmmLoadConst { src, dst, .. } => { + format!("load_const {:?}, {}", src, dst.show_rru(mb_rru),) + } + + Inst::XmmToGpr { + op, + src, + dst, + dst_size, + } => { + let dst_size = match dst_size { + OperandSize::Size32 => 4, + OperandSize::Size64 => 8, + }; + format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru(mb_rru), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size), + ) + } + + Inst::GprToXmm { + op, + src, + src_size, + dst, + } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, src_size.to_bytes()), + dst.show_rru(mb_rru) + ), + + Inst::XmmCmpRmR { op, src, dst } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, 8), + show_ireg_sized(*dst, mb_rru, 8), + ), + + Inst::CvtUint64ToFloatSeq { + src, dst, to_f64, .. + } => format!( + "{} {}, {}", + ljustify(format!( + "u64_to_{}_seq", + if *to_f64 { "f64" } else { "f32" } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + dst.show_rru(mb_rru), + ), + + Inst::CvtFloatToSintSeq { + src, + dst, + src_size, + dst_size, + .. + } => format!( + "{} {}, {}", + ljustify(format!( + "cvt_float{}_to_sint{}_seq", + if *src_size == OperandSize::Size64 { + "64" + } else { + "32" + }, + if *dst_size == OperandSize::Size64 { + "64" + } else { + "32" + } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()), + ), + + Inst::CvtFloatToUintSeq { + src, + dst, + src_size, + dst_size, + .. + } => format!( + "{} {}, {}", + ljustify(format!( + "cvt_float{}_to_uint{}_seq", + if *src_size == OperandSize::Size64 { + "64" + } else { + "32" + }, + if *dst_size == OperandSize::Size64 { + "64" + } else { + "32" + } + )), + show_ireg_sized(src.to_reg(), mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, dst_size.to_bytes()), + ), + + Inst::Imm { + dst_is_64, + simm64, + dst, + } => { + if *dst_is_64 { + format!( + "{} ${}, {}", + ljustify("movabsq".to_string()), + *simm64 as i64, + show_ireg_sized(dst.to_reg(), mb_rru, 8) + ) + } else { + format!( + "{} ${}, {}", + ljustify("movl".to_string()), + (*simm64 as u32) as i32, + show_ireg_sized(dst.to_reg(), mb_rru, 4) + ) + } + } + + Inst::MovRR { is_64, src, dst } => format!( + "{} {}, {}", + ljustify2("mov".to_string(), suffix_lq(*is_64)), + show_ireg_sized(*src, mb_rru, size_lq(*is_64)), + show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)) + ), + + Inst::MovzxRmR { + ext_mode, src, dst, .. 
+ } => { + if *ext_mode == ExtMode::LQ { + format!( + "{} {}, {}", + ljustify("movl".to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, 4) + ) + } else { + format!( + "{} {}, {}", + ljustify2("movz".to_string(), ext_mode.to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size()) + ) + } + } + + Inst::Mov64MR { src, dst, .. } => format!( + "{} {}, {}", + ljustify("movq".to_string()), + src.show_rru(mb_rru), + dst.show_rru(mb_rru) + ), + + Inst::LoadEffectiveAddress { addr, dst } => format!( + "{} {}, {}", + ljustify("lea".to_string()), + addr.show_rru(mb_rru), + dst.show_rru(mb_rru) + ), + + Inst::MovsxRmR { + ext_mode, src, dst, .. + } => format!( + "{} {}, {}", + ljustify2("movs".to_string(), ext_mode.to_string()), + src.show_rru_sized(mb_rru, ext_mode.src_size()), + show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size()) + ), + + Inst::MovRM { size, src, dst, .. } => format!( + "{} {}, {}", + ljustify2("mov".to_string(), suffix_bwlq(*size)), + show_ireg_sized(*src, mb_rru, *size), + dst.show_rru(mb_rru) + ), + + Inst::ShiftR { + size, + kind, + num_bits, + dst, + } => match num_bits { + None => format!( + "{} %cl, {}", + ljustify2(kind.to_string(), suffix_bwlq(*size)), + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + + Some(num_bits) => format!( + "{} ${}, {}", + ljustify2(kind.to_string(), suffix_bwlq(*size)), + num_bits, + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + }, + + Inst::XmmRmiReg { opcode, src, dst } => format!( + "{} {}, {}", + ljustify(opcode.to_string()), + src.show_rru(mb_rru), + dst.to_reg().show_rru(mb_rru) + ), + + Inst::CmpRmiR { size, src, dst } => format!( + "{} {}, {}", + ljustify2("cmp".to_string(), suffix_bwlq(*size)), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(*dst, mb_rru, *size) + ), + + Inst::Setcc { cc, dst } => format!( + "{} {}", + ljustify2("set".to_string(), cc.to_string()), + show_ireg_sized(dst.to_reg(), mb_rru, 1) + ), + + Inst::Cmove { size, cc, src, dst } => format!( + "{} {}, {}", + ljustify(format!("cmov{}{}", cc.to_string(), suffix_bwlq(*size))), + src.show_rru_sized(mb_rru, *size), + show_ireg_sized(dst.to_reg(), mb_rru, *size) + ), + + Inst::XmmCmove { + is_64, + cc, + src, + dst, + } => { + let size = if *is_64 { 8 } else { 4 }; + format!( + "j{} $next; mov{} {}, {}; $next: ", + cc.invert().to_string(), + if *is_64 { "sd" } else { "ss" }, + src.show_rru_sized(mb_rru, size), + show_ireg_sized(dst.to_reg(), mb_rru, size) + ) + } + + Inst::Push64 { src } => { + format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru)) + } + + Inst::Pop64 { dst } => { + format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru)) + } + + Inst::CallKnown { dest, .. } => format!("{} {:?}", ljustify("call".to_string()), dest), + + Inst::CallUnknown { dest, .. } => format!( + "{} *{}", + ljustify("call".to_string()), + dest.show_rru(mb_rru) + ), + + Inst::Ret => "ret".to_string(), + + Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), + + Inst::JmpKnown { dst } => { + format!("{} {}", ljustify("jmp".to_string()), dst.to_string()) + } + + Inst::JmpIf { cc, taken } => format!( + "{} {}", + ljustify2("j".to_string(), cc.to_string()), + taken.to_string(), + ), + + Inst::JmpCond { + cc, + taken, + not_taken, + } => format!( + "{} {}; j {}", + ljustify2("j".to_string(), cc.to_string()), + taken.to_string(), + not_taken.to_string() + ), + + Inst::JmpTableSeq { idx, .. 
} => { + format!("{} {}", ljustify("br_table".into()), idx.show_rru(mb_rru)) + } + + Inst::JmpUnknown { target } => format!( + "{} *{}", + ljustify("jmp".to_string()), + target.show_rru(mb_rru) + ), + + Inst::TrapIf { cc, trap_code, .. } => { + format!("j{} ; ud2 {} ;", cc.invert().to_string(), trap_code) + } + + Inst::LoadExtName { + dst, name, offset, .. + } => format!( + "{} {}+{}, {}", + ljustify("movaps".into()), + name, + offset, + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + + Inst::LockCmpxchg { ty, src, dst, .. } => { + let size = ty.bytes() as u8; + format!("lock cmpxchg{} {}, {}", + suffix_bwlq(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru)) + } + + Inst::AtomicRmwSeq { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}", + ty.bits(), op) + }, + + Inst::Fence { kind } => { + match kind { + FenceKind::MFence => "mfence".to_string(), + FenceKind::LFence => "lfence".to_string(), + FenceKind::SFence => "sfence".to_string(), + } + } + + Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), + + Inst::Hlt => "hlt".into(), + + Inst::Ud2 { trap_code } => format!("ud2 {}", trap_code), + } + } +} + +// Temp hook for legacy printing machinery +impl fmt::Debug for Inst { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + // Print the insn without a Universe :-( + write!(fmt, "{}", self.show_rru(None)) + } +} + +fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + // This is a bit subtle. If some register is in the modified set, then it may not be in either + // the use or def sets. However, enforcing that directly is somewhat difficult. Instead, + // regalloc.rs will "fix" this for us by removing the the modified set from the use and def + // sets. + match inst { + Inst::AluRmiR { src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::Not { src, .. } => { + collector.add_mod(*src); + } + Inst::Neg { src, .. } => { + collector.add_mod(*src); + } + Inst::Div { size, divisor, .. } => { + collector.add_mod(Writable::from_reg(regs::rax())); + if *size == 1 { + collector.add_def(Writable::from_reg(regs::rdx())); + } else { + collector.add_mod(Writable::from_reg(regs::rdx())); + } + divisor.get_regs_as_uses(collector); + } + Inst::MulHi { rhs, .. } => { + collector.add_mod(Writable::from_reg(regs::rax())); + collector.add_def(Writable::from_reg(regs::rdx())); + rhs.get_regs_as_uses(collector); + } + Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => { + // Mark both fixed registers as mods, to avoid an early clobber problem in codegen + // (i.e. the temporary is allocated one of the fixed registers). This requires writing + // the rdx register *before* the instruction, which is not too bad. + collector.add_mod(Writable::from_reg(regs::rax())); + collector.add_mod(Writable::from_reg(regs::rdx())); + collector.add_mod(*divisor); + if let Some(tmp) = tmp { + collector.add_def(*tmp); + } + } + Inst::SignExtendData { size } => match size { + 1 => collector.add_mod(Writable::from_reg(regs::rax())), + 2 | 4 | 8 => { + collector.add_use(regs::rax()); + collector.add_def(Writable::from_reg(regs::rdx())); + } + _ => unreachable!(), + }, + Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. 
} => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::XmmRmR { src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::XmmRmRImm { op, src, dst, .. } => { + if inst.produces_const() { + // No need to account for src, since src == dst. + collector.add_def(*dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } else { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + } + Inst::XmmUninitializedValue { dst } => collector.add_def(*dst), + Inst::XmmLoadConst { dst, .. } => collector.add_def(*dst), + Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => { + collector.add_use(*lhs); + collector.add_mod(*rhs_dst); + } + Inst::XmmRmiReg { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + Inst::XmmMovRM { src, dst, .. } => { + collector.add_use(*src); + dst.get_regs_as_uses(collector); + } + Inst::XmmCmpRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_use(*dst); + } + Inst::Imm { dst, .. } => { + collector.add_def(*dst); + } + Inst::MovRR { src, dst, .. } | Inst::XmmToGpr { src, dst, .. } => { + collector.add_use(*src); + collector.add_def(*dst); + } + Inst::GprToXmm { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::CvtUint64ToFloatSeq { + src, + dst, + tmp_gpr1, + tmp_gpr2, + .. + } => { + collector.add_mod(*src); + collector.add_def(*dst); + collector.add_def(*tmp_gpr1); + collector.add_def(*tmp_gpr2); + } + Inst::CvtFloatToSintSeq { + src, + dst, + tmp_xmm, + tmp_gpr, + .. + } + | Inst::CvtFloatToUintSeq { + src, + dst, + tmp_gpr, + tmp_xmm, + .. + } => { + collector.add_mod(*src); + collector.add_def(*dst); + collector.add_def(*tmp_gpr); + collector.add_def(*tmp_xmm); + } + Inst::MovzxRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst) + } + Inst::MovsxRmR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_def(*dst); + } + Inst::MovRM { src, dst, .. } => { + collector.add_use(*src); + dst.get_regs_as_uses(collector); + } + Inst::ShiftR { num_bits, dst, .. } => { + if num_bits.is_none() { + collector.add_use(regs::rcx()); + } + collector.add_mod(*dst); + } + Inst::CmpRmiR { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_use(*dst); // yes, really `add_use` + } + Inst::Setcc { dst, .. } => { + collector.add_def(*dst); + } + Inst::Cmove { src, dst, .. } | Inst::XmmCmove { src, dst, .. } => { + src.get_regs_as_uses(collector); + collector.add_mod(*dst); + } + Inst::Push64 { src } => { + src.get_regs_as_uses(collector); + collector.add_mod(Writable::from_reg(regs::rsp())); + } + Inst::Pop64 { dst } => { + collector.add_def(*dst); + } + + Inst::CallKnown { + ref uses, ref defs, .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + } + + Inst::CallUnknown { + ref uses, + ref defs, + dest, + .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + dest.get_regs_as_uses(collector); + } + + Inst::JmpTableSeq { + ref idx, + ref tmp1, + ref tmp2, + .. 
+ } => { + collector.add_use(*idx); + collector.add_def(*tmp1); + collector.add_def(*tmp2); + } + + Inst::JmpUnknown { target } => { + target.get_regs_as_uses(collector); + } + + Inst::LoadExtName { dst, .. } => { + collector.add_def(*dst); + } + + Inst::LockCmpxchg { src, dst, .. } => { + dst.get_regs_as_uses(collector); + collector.add_use(*src); + collector.add_mod(Writable::from_reg(regs::rax())); + } + + Inst::AtomicRmwSeq { .. } => { + collector.add_use(regs::r9()); + collector.add_use(regs::r10()); + collector.add_def(Writable::from_reg(regs::r11())); + collector.add_def(Writable::from_reg(regs::rax())); + } + + Inst::Ret + | Inst::EpiloguePlaceholder + | Inst::JmpKnown { .. } + | Inst::JmpIf { .. } + | Inst::JmpCond { .. } + | Inst::Nop { .. } + | Inst::TrapIf { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::Hlt + | Inst::Ud2 { .. } + | Inst::Fence { .. } => { + // No registers are used. + } + } +} + +//============================================================================= +// Instructions and subcomponents: map_regs + +fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) { + if let Some(reg) = r.as_virtual_reg() { + let new = m.get_use(reg).unwrap().to_reg(); + *r = new; + } +} + +fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if let Some(reg) = r.to_reg().as_virtual_reg() { + let new = m.get_def(reg).unwrap().to_reg(); + *r = Writable::from_reg(new); + } +} + +fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) { + if let Some(reg) = r.to_reg().as_virtual_reg() { + let new = m.get_mod(reg).unwrap().to_reg(); + *r = Writable::from_reg(new); + } +} + +impl Amode { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + Amode::ImmReg { ref mut base, .. } => map_use(map, base), + Amode::ImmRegRegShift { + ref mut base, + ref mut index, + .. + } => { + map_use(map, base); + map_use(map, index); + } + Amode::RipRelative { .. } => { + // RIP isn't involved in regalloc. + } + } + } +} + +impl RegMemImm { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + RegMemImm::Reg { ref mut reg } => map_use(map, reg), + RegMemImm::Mem { ref mut addr } => addr.map_uses(map), + RegMemImm::Imm { .. } => {} + } + } + + fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + match self { + Self::Reg { reg } => { + let mut writable_src = Writable::from_reg(*reg); + map_def(mapper, &mut writable_src); + *self = Self::reg(writable_src.to_reg()); + } + _ => panic!("unexpected RegMemImm kind in map_src_reg_as_def"), + } + } +} + +impl RegMem { + fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) { + match self { + RegMem::Reg { ref mut reg } => map_use(map, reg), + RegMem::Mem { ref mut addr, .. } => addr.map_uses(map), + } + } + + fn map_as_def<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + match self { + Self::Reg { reg } => { + let mut writable_src = Writable::from_reg(*reg); + map_def(mapper, &mut writable_src); + *self = Self::reg(writable_src.to_reg()); + } + _ => panic!("unexpected RegMem kind in map_src_reg_as_def"), + } + } +} + +fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) { + // Note this must be carefully synchronized with x64_get_regs. + let produces_const = inst.produces_const(); + + match inst { + // ** Nop + Inst::AluRmiR { + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::Not { src, .. } | Inst::Neg { src, .. 
} => map_mod(mapper, src), + Inst::Div { divisor, .. } => divisor.map_uses(mapper), + Inst::MulHi { rhs, .. } => rhs.map_uses(mapper), + Inst::CheckedDivOrRemSeq { divisor, tmp, .. } => { + map_mod(mapper, divisor); + if let Some(tmp) = tmp { + map_def(mapper, tmp) + } + } + Inst::SignExtendData { .. } => {} + Inst::XmmUnaryRmR { + ref mut src, + ref mut dst, + .. + } + | Inst::UnaryRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::XmmRmRImm { + ref op, + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.map_uses(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::XmmRmR { + ref mut src, + ref mut dst, + .. + } => { + if produces_const { + src.map_as_def(mapper); + map_def(mapper, dst); + } else { + src.map_uses(mapper); + map_mod(mapper, dst); + } + } + Inst::XmmRmiReg { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_mod(mapper, dst); + } + Inst::XmmUninitializedValue { ref mut dst, .. } => { + map_def(mapper, dst); + } + Inst::XmmLoadConst { ref mut dst, .. } => { + map_def(mapper, dst); + } + Inst::XmmMinMaxSeq { + ref mut lhs, + ref mut rhs_dst, + .. + } => { + map_use(mapper, lhs); + map_mod(mapper, rhs_dst); + } + Inst::XmmMovRM { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + Inst::XmmCmpRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_use(mapper, dst); + } + Inst::Imm { ref mut dst, .. } => map_def(mapper, dst), + Inst::MovRR { + ref mut src, + ref mut dst, + .. + } + | Inst::XmmToGpr { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + map_def(mapper, dst); + } + Inst::GprToXmm { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::CvtUint64ToFloatSeq { + ref mut src, + ref mut dst, + ref mut tmp_gpr1, + ref mut tmp_gpr2, + .. + } => { + map_mod(mapper, src); + map_def(mapper, dst); + map_def(mapper, tmp_gpr1); + map_def(mapper, tmp_gpr2); + } + Inst::CvtFloatToSintSeq { + ref mut src, + ref mut dst, + ref mut tmp_xmm, + ref mut tmp_gpr, + .. + } + | Inst::CvtFloatToUintSeq { + ref mut src, + ref mut dst, + ref mut tmp_gpr, + ref mut tmp_xmm, + .. + } => { + map_mod(mapper, src); + map_def(mapper, dst); + map_def(mapper, tmp_gpr); + map_def(mapper, tmp_xmm); + } + Inst::MovzxRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::Mov64MR { src, dst, .. } | Inst::LoadEffectiveAddress { addr: src, dst } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::MovsxRmR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_def(mapper, dst); + } + Inst::MovRM { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + Inst::ShiftR { ref mut dst, .. } => { + map_mod(mapper, dst); + } + Inst::CmpRmiR { + ref mut src, + ref mut dst, + .. + } => { + src.map_uses(mapper); + map_use(mapper, dst); + } + Inst::Setcc { ref mut dst, .. } => map_def(mapper, dst), + Inst::Cmove { + ref mut src, + ref mut dst, + .. + } + | Inst::XmmCmove { + ref mut src, + ref mut dst, + .. 
+ } => { + src.map_uses(mapper); + map_mod(mapper, dst) + } + Inst::Push64 { ref mut src } => src.map_uses(mapper), + Inst::Pop64 { ref mut dst } => { + map_def(mapper, dst); + } + + Inst::CallKnown { + ref mut uses, + ref mut defs, + .. + } => { + for r in uses.iter_mut() { + map_use(mapper, r); + } + for r in defs.iter_mut() { + map_def(mapper, r); + } + } + + Inst::CallUnknown { + ref mut uses, + ref mut defs, + ref mut dest, + .. + } => { + for r in uses.iter_mut() { + map_use(mapper, r); + } + for r in defs.iter_mut() { + map_def(mapper, r); + } + dest.map_uses(mapper); + } + + Inst::JmpTableSeq { + ref mut idx, + ref mut tmp1, + ref mut tmp2, + .. + } => { + map_use(mapper, idx); + map_def(mapper, tmp1); + map_def(mapper, tmp2); + } + + Inst::JmpUnknown { ref mut target } => target.map_uses(mapper), + + Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst), + + Inst::LockCmpxchg { + ref mut src, + ref mut dst, + .. + } => { + map_use(mapper, src); + dst.map_uses(mapper); + } + + Inst::Ret + | Inst::EpiloguePlaceholder + | Inst::JmpKnown { .. } + | Inst::JmpCond { .. } + | Inst::JmpIf { .. } + | Inst::Nop { .. } + | Inst::TrapIf { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::Ud2 { .. } + | Inst::Hlt + | Inst::AtomicRmwSeq { .. } + | Inst::Fence { .. } => { + // Instruction doesn't explicitly mention any regs, so it can't have any virtual + // regs that we'd need to remap. Hence no action required. + } + } +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + fn get_regs(&self, collector: &mut RegUsageCollector) { + x64_get_regs(&self, collector) + } + + fn map_regs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + x64_map_regs(self, mapper); + } + + fn is_move(&self) -> Option<(Writable<Reg>, Reg)> { + match self { + // Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes + // out the upper 32 bits of the destination. For example, we could + // conceivably use `movl %reg, %reg` to zero out the top 32 bits of + // %reg. + Self::MovRR { + is_64, src, dst, .. + } if *is_64 => Some((*dst, *src)), + // Note as well that MOVS[S|D] when used in the `XmmUnaryRmR` context are pure moves of + // scalar floating-point values (and annotate `dst` as `def`s to the register allocator) + // whereas the same operation in a packed context, e.g. `XMM_RM_R`, is used to merge a + // value into the lowest lane of a vector (not a move). + Self::XmmUnaryRmR { op, src, dst, .. } + if *op == SseOpcode::Movss + || *op == SseOpcode::Movsd + || *op == SseOpcode::Movaps + || *op == SseOpcode::Movapd + || *op == SseOpcode::Movups + || *op == SseOpcode::Movupd + || *op == SseOpcode::Movdqa + || *op == SseOpcode::Movdqu => + { + if let RegMem::Reg { reg } = src { + Some((*dst, *reg)) + } else { + None + } + } + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Self::EpiloguePlaceholder = self { + true + } else { + false + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + // Interesting cases. + &Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret, + &Self::JmpKnown { dst } => MachTerminator::Uncond(dst), + &Self::JmpCond { + taken, not_taken, .. + } => MachTerminator::Cond(taken, not_taken), + &Self::JmpTableSeq { + ref targets_for_term, + .. + } => MachTerminator::Indirect(&targets_for_term[..]), + // All other cases are boring. 
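+            // The remaining instructions are neither branches nor returns, so they do not end a
+            // basic block and are reported as non-terminators.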
+ _ => MachTerminator::None, + } + } + + fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst { + let rc_dst = dst_reg.to_reg().get_class(); + let rc_src = src_reg.get_class(); + // If this isn't true, we have gone way off the rails. + debug_assert!(rc_dst == rc_src); + match rc_dst { + RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg), + RegClass::V128 => { + // The Intel optimization manual, in "3.5.1.13 Zero-Latency MOV Instructions", + // doesn't include MOVSS/MOVSD as instructions with zero-latency. Use movaps for + // those, which may write more lanes that we need, but are specified to have + // zero-latency. + let opcode = match ty { + types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps, + types::F64X2 => SseOpcode::Movapd, + _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa, + _ => unimplemented!("unable to move type: {}", ty), + }; + Inst::xmm_unary_rm_r(opcode, RegMem::reg(src_reg), dst_reg) + } + _ => panic!("gen_move(x64): unhandled regclass {:?}", rc_dst), + } + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop { len: 0 } + } + + fn gen_nop(preferred_size: usize) -> Inst { + Inst::nop((preferred_size % 16) as u8) + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> { + None + } + + fn rc_for_type(ty: Type) -> CodegenResult<RegClass> { + match ty { + types::I8 + | types::I16 + | types::I32 + | types::I64 + | types::B1 + | types::B8 + | types::B16 + | types::B32 + | types::B64 + | types::R32 + | types::R64 => Ok(RegClass::I64), + types::F32 | types::F64 => Ok(RegClass::V128), + _ if ty.bits() == 128 => Ok(RegClass::V128), + types::IFLAGS | types::FFLAGS => Ok(RegClass::I64), + _ => Err(CodegenError::Unsupported(format!( + "Unexpected SSA-value type: {}", + ty + ))), + } + } + + fn gen_jump(label: MachLabel) -> Inst { + Inst::jmp_known(label) + } + + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + mut alloc_tmp: F, + ) -> SmallVec<[Self; 4]> { + let mut ret = SmallVec::new(); + if ty == types::F32 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorps, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(RegClass::I64, types::I32); + ret.push(Inst::imm(OperandSize::Size32, value, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(tmp.to_reg()), + OperandSize::Size32, + to_reg, + )); + } + } else if ty == types::F64 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(RegClass::I64, types::I64); + ret.push(Inst::imm(OperandSize::Size64, value, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(tmp.to_reg()), + OperandSize::Size64, + to_reg, + )); + } + } else { + // Must be an integer type. 
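+            // As in the float cases above, zero is materialized with a self-XOR instead of a
+            // `mov $0`: the encoding is shorter and the idiom breaks dependencies on the old
+            // register value.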
+ debug_assert!( + ty == types::B1 + || ty == types::I8 + || ty == types::B8 + || ty == types::I16 + || ty == types::B16 + || ty == types::I32 + || ty == types::B32 + || ty == types::I64 + || ty == types::B64 + || ty == types::R32 + || ty == types::R64 + ); + if value == 0 { + ret.push(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Xor, + RegMemImm::reg(to_reg.to_reg()), + to_reg, + )); + } else { + ret.push(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + value.into(), + to_reg, + )); + } + } + ret + } + + fn reg_universe(flags: &Flags) -> RealRegUniverse { + create_reg_universe_systemv(flags) + } + + fn worst_case_size() -> CodeOffset { + 15 + } + + fn ref_type_regclass(_: &settings::Flags) -> RegClass { + RegClass::I64 + } + + type LabelUse = LabelUse; +} + +/// State carried between emissions of a sequence of instructions. +#[derive(Default, Clone, Debug)] +pub struct EmitState { + /// Addend to convert nominal-SP offsets to real-SP offsets at the current + /// program point. + pub(crate) virtual_sp_offset: i64, + /// Offset of FP from nominal-SP. + pub(crate) nominal_sp_to_fp: i64, + /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`. + stack_map: Option<StackMap>, + /// Current source location. + cur_srcloc: SourceLoc, +} + +/// Constant state used during emissions of a sequence of instructions. +pub struct EmitInfo { + flags: settings::Flags, + isa_flags: x64_settings::Flags, +} + +impl EmitInfo { + pub(crate) fn new(flags: settings::Flags, isa_flags: x64_settings::Flags) -> Self { + Self { flags, isa_flags } + } +} + +impl MachInstEmitInfo for EmitInfo { + fn flags(&self) -> &Flags { + &self.flags + } +} + +impl MachInstEmit for Inst { + type State = EmitState; + type Info = EmitInfo; + type UnwindInfo = unwind::X64UnwindInfo; + + fn emit(&self, sink: &mut MachBuffer<Inst>, info: &Self::Info, state: &mut Self::State) { + emit::emit(self, sink, info, state); + } + + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, _: &mut Self::State) -> String { + self.show_rru(mb_rru) + } +} + +impl MachInstEmitState<Inst> for EmitState { + fn new(abi: &dyn ABICallee<I = Inst>) -> Self { + EmitState { + virtual_sp_offset: 0, + nominal_sp_to_fp: abi.frame_size() as i64, + stack_map: None, + cur_srcloc: SourceLoc::default(), + } + } + + fn pre_safepoint(&mut self, stack_map: StackMap) { + self.stack_map = Some(stack_map); + } + + fn pre_sourceloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } +} + +impl EmitState { + fn take_stack_map(&mut self) -> Option<StackMap> { + self.stack_map.take() + } + + fn clear_post_insn(&mut self) { + self.stack_map = None; + } + + fn cur_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } +} + +/// A label-use (internal relocation) in generated code. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// A 32-bit offset from location of relocation itself, added to the existing value at that + /// location. Used for control flow instructions which consider an offset from the start of the + /// next instruction (so the size of the payload -- 4 bytes -- is subtracted from the payload). + JmpRel32, + + /// A 32-bit offset from location of relocation itself, added to the existing value at that + /// location. 
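+    /// Unlike `JmpRel32`, no 4-byte adjustment is subtracted when patching, so the offset is
+    /// taken relative to the start of the relocated 32-bit field itself.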
+ PCRel32, +} + +impl MachInstLabelUse for LabelUse { + const ALIGN: CodeOffset = 1; + + fn max_pos_range(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x7fff_ffff, + } + } + + fn max_neg_range(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x8000_0000, + } + } + + fn patch_size(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 4, + } + } + + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let pc_rel = (label_offset as i64) - (use_offset as i64); + debug_assert!(pc_rel <= self.max_pos_range() as i64); + debug_assert!(pc_rel >= -(self.max_neg_range() as i64)); + let pc_rel = pc_rel as u32; + match self { + LabelUse::JmpRel32 => { + let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let value = pc_rel.wrapping_add(addend).wrapping_sub(4); + buffer.copy_from_slice(&value.to_le_bytes()[..]); + } + LabelUse::PCRel32 => { + let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let value = pc_rel.wrapping_add(addend); + buffer.copy_from_slice(&value.to_le_bytes()[..]); + } + } + } + + fn supports_veneer(self) -> bool { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => false, + } + } + + fn veneer_size(self) -> CodeOffset { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => 0, + } + } + + fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) { + match self { + LabelUse::JmpRel32 | LabelUse::PCRel32 => { + panic!("Veneer not supported for JumpRel32 label-use."); + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs new file mode 100644 index 0000000000..04bc1f09bf --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/regs.rs @@ -0,0 +1,289 @@ +//! Registers, the Universe thereof, and printing. +//! +//! These are ordered by sequence number, as required in the Universe. The strange ordering is +//! intended to make callee-save registers available before caller-saved ones. This is a net win +//! provided that each function makes at least one onward call. It'll be a net loss for leaf +//! functions, and we should change the ordering in that case, so as to make caller-save regs +//! available first. +//! +//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions? +//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe +//! for each function we compile. + +use crate::settings; +use alloc::vec::Vec; +use regalloc::{ + PrettyPrint, RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES, +}; +use std::string::String; + +// Hardware encodings for a few registers. 
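+// These are the architectural register numbers used in ModRM/SIB bytes and REX prefixes; only
+// the registers the backend needs to refer to by encoding are given named constants here.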
+ +pub const ENC_RBX: u8 = 3; +pub const ENC_RSP: u8 = 4; +pub const ENC_RBP: u8 = 5; +pub const ENC_R12: u8 = 12; +pub const ENC_R13: u8 = 13; +pub const ENC_R14: u8 = 14; +pub const ENC_R15: u8 = 15; + +fn gpr(enc: u8, index: u8) -> Reg { + Reg::new_real(RegClass::I64, enc, index) +} + +pub(crate) fn r12() -> Reg { + gpr(ENC_R12, 16) +} +pub(crate) fn r13() -> Reg { + gpr(ENC_R13, 17) +} +pub(crate) fn r14() -> Reg { + gpr(ENC_R14, 18) +} +pub(crate) fn rbx() -> Reg { + gpr(ENC_RBX, 19) +} +pub(crate) fn rsi() -> Reg { + gpr(6, 20) +} +pub(crate) fn rdi() -> Reg { + gpr(7, 21) +} +pub(crate) fn rax() -> Reg { + gpr(0, 22) +} +pub(crate) fn rcx() -> Reg { + gpr(1, 23) +} +pub(crate) fn rdx() -> Reg { + gpr(2, 24) +} +pub(crate) fn r8() -> Reg { + gpr(8, 25) +} +pub(crate) fn r9() -> Reg { + gpr(9, 26) +} +pub(crate) fn r10() -> Reg { + gpr(10, 27) +} +pub(crate) fn r11() -> Reg { + gpr(11, 28) +} + +pub(crate) fn r15() -> Reg { + // r15 is put aside since this is the pinned register. + gpr(ENC_R15, 29) +} + +/// The pinned register on this architecture. +/// It must be the same as Spidermonkey's HeapReg, as found in this file. +/// https://searchfox.org/mozilla-central/source/js/src/jit/x64/Assembler-x64.h#99 +pub(crate) fn pinned_reg() -> Reg { + r15() +} + +fn fpr(enc: u8, index: u8) -> Reg { + Reg::new_real(RegClass::V128, enc, index) +} + +pub(crate) fn xmm0() -> Reg { + fpr(0, 0) +} +pub(crate) fn xmm1() -> Reg { + fpr(1, 1) +} +pub(crate) fn xmm2() -> Reg { + fpr(2, 2) +} +pub(crate) fn xmm3() -> Reg { + fpr(3, 3) +} +pub(crate) fn xmm4() -> Reg { + fpr(4, 4) +} +pub(crate) fn xmm5() -> Reg { + fpr(5, 5) +} +pub(crate) fn xmm6() -> Reg { + fpr(6, 6) +} +pub(crate) fn xmm7() -> Reg { + fpr(7, 7) +} +pub(crate) fn xmm8() -> Reg { + fpr(8, 8) +} +pub(crate) fn xmm9() -> Reg { + fpr(9, 9) +} +pub(crate) fn xmm10() -> Reg { + fpr(10, 10) +} +pub(crate) fn xmm11() -> Reg { + fpr(11, 11) +} +pub(crate) fn xmm12() -> Reg { + fpr(12, 12) +} +pub(crate) fn xmm13() -> Reg { + fpr(13, 13) +} +pub(crate) fn xmm14() -> Reg { + fpr(14, 14) +} +pub(crate) fn xmm15() -> Reg { + fpr(15, 15) +} + +pub(crate) fn rsp() -> Reg { + gpr(ENC_RSP, 30) +} +pub(crate) fn rbp() -> Reg { + gpr(ENC_RBP, 31) +} + +/// Create the register universe for X64. +/// +/// The ordering of registers matters, as commented in the file doc comment: assumes the +/// calling-convention is SystemV, at the moment. 
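+/// Callee-saved GPRs (%r12-%r14, %rbx) are listed before the caller-saved ones, %r15 is only
+/// allocatable when the pinned register is disabled, and %rsp/%rbp are appended after the
+/// allocatable range so the register allocator never hands them out.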
+pub(crate) fn create_reg_universe_systemv(flags: &settings::Flags) -> RealRegUniverse { + let mut regs = Vec::<(RealReg, String)>::new(); + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + let use_pinned_reg = flags.enable_pinned_reg(); + + // XMM registers + let first_fpr = regs.len(); + regs.push((xmm0().to_real_reg(), "%xmm0".into())); + regs.push((xmm1().to_real_reg(), "%xmm1".into())); + regs.push((xmm2().to_real_reg(), "%xmm2".into())); + regs.push((xmm3().to_real_reg(), "%xmm3".into())); + regs.push((xmm4().to_real_reg(), "%xmm4".into())); + regs.push((xmm5().to_real_reg(), "%xmm5".into())); + regs.push((xmm6().to_real_reg(), "%xmm6".into())); + regs.push((xmm7().to_real_reg(), "%xmm7".into())); + regs.push((xmm8().to_real_reg(), "%xmm8".into())); + regs.push((xmm9().to_real_reg(), "%xmm9".into())); + regs.push((xmm10().to_real_reg(), "%xmm10".into())); + regs.push((xmm11().to_real_reg(), "%xmm11".into())); + regs.push((xmm12().to_real_reg(), "%xmm12".into())); + regs.push((xmm13().to_real_reg(), "%xmm13".into())); + regs.push((xmm14().to_real_reg(), "%xmm14".into())); + regs.push((xmm15().to_real_reg(), "%xmm15".into())); + let last_fpr = regs.len() - 1; + + // Integer regs. + let first_gpr = regs.len(); + + // Callee-saved, in the SystemV x86_64 ABI. + regs.push((r12().to_real_reg(), "%r12".into())); + regs.push((r13().to_real_reg(), "%r13".into())); + regs.push((r14().to_real_reg(), "%r14".into())); + + regs.push((rbx().to_real_reg(), "%rbx".into())); + + // Caller-saved, in the SystemV x86_64 ABI. + regs.push((rsi().to_real_reg(), "%rsi".into())); + regs.push((rdi().to_real_reg(), "%rdi".into())); + regs.push((rax().to_real_reg(), "%rax".into())); + regs.push((rcx().to_real_reg(), "%rcx".into())); + regs.push((rdx().to_real_reg(), "%rdx".into())); + regs.push((r8().to_real_reg(), "%r8".into())); + regs.push((r9().to_real_reg(), "%r9".into())); + regs.push((r10().to_real_reg(), "%r10".into())); + regs.push((r11().to_real_reg(), "%r11".into())); + + // Other regs, not available to the allocator. + debug_assert_eq!(r15(), pinned_reg()); + let allocable = if use_pinned_reg { + // The pinned register is not allocatable in this case, so record the length before adding + // it. + let len = regs.len(); + regs.push((r15().to_real_reg(), "%r15/pinned".into())); + len + } else { + regs.push((r15().to_real_reg(), "%r15".into())); + regs.len() + }; + let last_gpr = allocable - 1; + + regs.push((rsp().to_real_reg(), "%rsp".into())); + regs.push((rbp().to_real_reg(), "%rbp".into())); + + allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo { + first: first_gpr, + last: last_gpr, + suggested_scratch: Some(r12().get_index()), + }); + allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo { + first: first_fpr, + last: last_fpr, + suggested_scratch: Some(xmm15().get_index()), + }); + + // Sanity-check: the index passed to the Reg ctor must match the order in the register list. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} + +/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show its name at some +/// smaller size (4, 2 or 1 bytes). +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + let mut s = reg.show_rru(mb_rru); + + if reg.get_class() != RegClass::I64 || size == 8 { + // We can't do any better. 
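+        // Either the register is not an integer register, or the full 8-byte name that
+        // `show_rru` already produced is the right one.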
+ return s; + } + + if reg.is_real() { + // Change (eg) "rax" into "eax", "ax" or "al" as appropriate. This is something one could + // describe diplomatically as "a kludge", but it's only debug code. + let remapper = match s.as_str() { + "%rax" => Some(["%eax", "%ax", "%al"]), + "%rbx" => Some(["%ebx", "%bx", "%bl"]), + "%rcx" => Some(["%ecx", "%cx", "%cl"]), + "%rdx" => Some(["%edx", "%dx", "%dl"]), + "%rsi" => Some(["%esi", "%si", "%sil"]), + "%rdi" => Some(["%edi", "%di", "%dil"]), + "%rbp" => Some(["%ebp", "%bp", "%bpl"]), + "%rsp" => Some(["%esp", "%sp", "%spl"]), + "%r8" => Some(["%r8d", "%r8w", "%r8b"]), + "%r9" => Some(["%r9d", "%r9w", "%r9b"]), + "%r10" => Some(["%r10d", "%r10w", "%r10b"]), + "%r11" => Some(["%r11d", "%r11w", "%r11b"]), + "%r12" => Some(["%r12d", "%r12w", "%r12b"]), + "%r13" => Some(["%r13d", "%r13w", "%r13b"]), + "%r14" => Some(["%r14d", "%r14w", "%r14b"]), + "%r15" => Some(["%r15d", "%r15w", "%r15b"]), + _ => None, + }; + if let Some(smaller_names) = remapper { + match size { + 4 => s = smaller_names[0].into(), + 2 => s = smaller_names[1].into(), + 1 => s = smaller_names[2].into(), + _ => panic!("show_ireg_sized: real"), + } + } + } else { + // Add a "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths. + let suffix = match size { + 4 => "l", + 2 => "w", + 1 => "b", + _ => panic!("show_ireg_sized: virtual"), + }; + s = s + suffix; + } + + s +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs new file mode 100644 index 0000000000..ffe43930f0 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind.rs @@ -0,0 +1,125 @@ +use crate::isa::unwind::input::UnwindInfo; +use crate::isa::x64::inst::{ + args::{AluRmiROpcode, Amode, RegMemImm, SyntheticAmode}, + regs, Inst, +}; +use crate::machinst::{UnwindInfoContext, UnwindInfoGenerator}; +use crate::result::CodegenResult; +use alloc::vec::Vec; +use regalloc::Reg; + +#[cfg(feature = "unwind")] +pub(crate) mod systemv; + +pub struct X64UnwindInfo; + +impl UnwindInfoGenerator<Inst> for X64UnwindInfo { + fn create_unwind_info( + context: UnwindInfoContext<Inst>, + ) -> CodegenResult<Option<UnwindInfo<Reg>>> { + use crate::isa::unwind::input::{self, UnwindCode}; + let mut codes = Vec::new(); + const WORD_SIZE: u8 = 8; + + for i in context.prologue.clone() { + let i = i as usize; + let inst = &context.insts[i]; + let offset = context.insts_layout[i]; + + match inst { + Inst::Push64 { + src: RegMemImm::Reg { reg }, + } => { + codes.push(( + offset, + UnwindCode::StackAlloc { + size: WORD_SIZE.into(), + }, + )); + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *reg, + stack_offset: 0, + }, + )); + } + Inst::MovRR { src, dst, .. } => { + if *src == regs::rsp() { + codes.push((offset, UnwindCode::SetFramePointer { reg: dst.to_reg() })); + } + } + Inst::AluRmiR { + is_64: true, + op: AluRmiROpcode::Sub, + src: RegMemImm::Imm { simm32 }, + dst, + .. + } if dst.to_reg() == regs::rsp() => { + let imm = *simm32; + codes.push((offset, UnwindCode::StackAlloc { size: imm })); + } + Inst::MovRM { + src, + dst: SyntheticAmode::Real(Amode::ImmReg { simm32, base, .. }), + .. + } if *base == regs::rsp() => { + // `mov reg, imm(rsp)` + let imm = *simm32; + codes.push(( + offset, + UnwindCode::SaveRegister { + reg: *src, + stack_offset: imm, + }, + )); + } + Inst::AluRmiR { + is_64: true, + op: AluRmiROpcode::Add, + src: RegMemImm::Imm { simm32 }, + dst, + .. 
+ } if dst.to_reg() == regs::rsp() => { + let imm = *simm32; + codes.push((offset, UnwindCode::StackDealloc { size: imm })); + } + _ => {} + } + } + + let last_epilogue_end = context.len; + let epilogues_unwind_codes = context + .epilogues + .iter() + .map(|epilogue| { + // TODO add logic to process epilogue instruction instead of + // returning empty array. + let end = epilogue.end as usize - 1; + let end_offset = context.insts_layout[end]; + if end_offset == last_epilogue_end { + // Do not remember/restore for very last epilogue. + return vec![]; + } + + let start = epilogue.start as usize; + let offset = context.insts_layout[start]; + vec![ + (offset, UnwindCode::RememberState), + // TODO epilogue instructions + (end_offset, UnwindCode::RestoreState), + ] + }) + .collect(); + + let prologue_size = context.insts_layout[context.prologue.end as usize]; + Ok(Some(input::UnwindInfo { + prologue_size, + prologue_unwind_codes: codes, + epilogues_unwind_codes, + function_size: context.len, + word_size: WORD_SIZE, + initial_sp_offset: WORD_SIZE, + })) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs new file mode 100644 index 0000000000..68473a8afb --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/inst/unwind/systemv.rs @@ -0,0 +1,204 @@ +//! Unwind information for System V ABI (x86-64). + +use crate::isa::unwind::input; +use crate::isa::unwind::systemv::{RegisterMappingError, UnwindInfo}; +use crate::result::CodegenResult; +use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64}; +use regalloc::{Reg, RegClass}; + +/// Creates a new x86-64 common information entry (CIE). +pub fn create_cie() -> CommonInformationEntry { + use gimli::write::CallFrameInstruction; + + let mut entry = CommonInformationEntry::new( + Encoding { + address_size: 8, + format: Format::Dwarf32, + version: 1, + }, + 1, // Code alignment factor + -8, // Data alignment factor + X86_64::RA, + ); + + // Every frame will start with the call frame address (CFA) at RSP+8 + // It is +8 to account for the push of the return address by the call instruction + entry.add_instruction(CallFrameInstruction::Cfa(X86_64::RSP, 8)); + + // Every frame will start with the return address at RSP (CFA-8 = RSP+8-8 = RSP) + entry.add_instruction(CallFrameInstruction::Offset(X86_64::RA, -8)); + + entry +} + +/// Map Cranelift registers to their corresponding Gimli registers. +pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> { + // Mapping from https://github.com/bytecodealliance/cranelift/pull/902 by @iximeow + const X86_GP_REG_MAP: [gimli::Register; 16] = [ + X86_64::RAX, + X86_64::RCX, + X86_64::RDX, + X86_64::RBX, + X86_64::RSP, + X86_64::RBP, + X86_64::RSI, + X86_64::RDI, + X86_64::R8, + X86_64::R9, + X86_64::R10, + X86_64::R11, + X86_64::R12, + X86_64::R13, + X86_64::R14, + X86_64::R15, + ]; + const X86_XMM_REG_MAP: [gimli::Register; 16] = [ + X86_64::XMM0, + X86_64::XMM1, + X86_64::XMM2, + X86_64::XMM3, + X86_64::XMM4, + X86_64::XMM5, + X86_64::XMM6, + X86_64::XMM7, + X86_64::XMM8, + X86_64::XMM9, + X86_64::XMM10, + X86_64::XMM11, + X86_64::XMM12, + X86_64::XMM13, + X86_64::XMM14, + X86_64::XMM15, + ]; + + match reg.get_class() { + RegClass::I64 => { + // x86 GP registers have a weird mapping to DWARF registers, so we use a + // lookup table. 
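+ // For example, %rdx has hardware encoding 2 but DWARF number 1 (X86_64::RDX),
+ // and %rsp has hardware encoding 4 but DWARF number 7 (X86_64::RSP), so the
+ // hardware encoding cannot be used as the DWARF number directly.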
+ Ok(X86_GP_REG_MAP[reg.get_hw_encoding() as usize]) + } + RegClass::V128 => Ok(X86_XMM_REG_MAP[reg.get_hw_encoding() as usize]), + _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")), + } +} + +pub(crate) fn create_unwind_info( + unwind: input::UnwindInfo<Reg>, +) -> CodegenResult<Option<UnwindInfo>> { + struct RegisterMapper; + impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper { + fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> { + Ok(map_reg(reg)?.0) + } + fn sp(&self) -> u16 { + X86_64::RSP.0 + } + } + let map = RegisterMapper; + + Ok(Some(UnwindInfo::build(unwind, &map)?)) +} + +#[cfg(test)] +mod tests { + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::{ + types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData, + StackSlotKind, + }; + use crate::isa::{lookup, CallConv}; + use crate::settings::{builder, Flags}; + use crate::Context; + use gimli::write::Address; + use std::str::FromStr; + use target_lexicon::triple; + + #[test] + fn test_simple_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_function( + CallConv::SystemV, + Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)), + )); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(1234)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 13, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6)))] }"); + } + + fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function { + let mut func = + Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv)); + + let block0 = func.dfg.make_block(); + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(block0); + pos.ins().return_(&[]); + + if let Some(stack_slot) = stack_slot { + func.stack_slots.push(stack_slot); + } + + func + } + + #[test] + fn test_multi_return_func() { + let isa = lookup(triple!("x86_64")) + .expect("expect x86 ISA") + .finish(Flags::new(builder())); + + let mut context = Context::for_function(create_multi_return_function(CallConv::SystemV)); + + context.compile(&*isa).expect("expected compilation"); + + let fde = match context + .create_unwind_info(isa.as_ref()) + .expect("can create unwind info") + { + Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => { + info.to_fde(Address::Constant(4321)) + } + _ => panic!("expected unwind information"), + }; + + assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 23, lsda: None, instructions: [(1, CfaOffset(16)), (1, Offset(Register(6), -16)), (4, CfaRegister(Register(6))), (16, RememberState), (18, RestoreState)] }"); + } + + fn create_multi_return_function(call_conv: CallConv) -> Function { + let mut sig = Signature::new(call_conv); + sig.params.push(AbiParam::new(types::I32)); + let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig); + + let block0 = func.dfg.make_block(); + let v0 = func.dfg.append_block_param(block0, types::I32); + let block1 = func.dfg.make_block(); + let block2 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + 
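+ // Build a small diamond: block0 branches on v0 to block2 or jumps to block1,
+ // and both successors return. The two returns produce two epilogues, which is
+ // what the RememberState/RestoreState pair asserted in the expected FDE above
+ // exercises.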
pos.insert_block(block0); + pos.ins().brnz(v0, block2, &[]); + pos.ins().jump(block1, &[]); + + pos.insert_block(block1); + pos.ins().return_(&[]); + + pos.insert_block(block2); + pos.ins().return_(&[]); + + func + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs new file mode 100644 index 0000000000..0862154360 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs @@ -0,0 +1,3771 @@ +//! Lowering rules for X64. + +use crate::data_value::DataValue; +use crate::ir::{ + condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, + Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type, +}; +use crate::isa::x64::abi::*; +use crate::isa::x64::inst::args::*; +use crate::isa::x64::inst::*; +use crate::isa::{x64::X64Backend, CallConv}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::result::CodegenResult; +use crate::settings::Flags; +use alloc::boxed::Box; +use alloc::vec::Vec; +use cranelift_codegen_shared::condcodes::CondCode; +use log::trace; +use regalloc::{Reg, RegClass, Writable}; +use smallvec::SmallVec; +use std::convert::TryFrom; +use target_lexicon::Triple; + +/// Context passed to all lowering functions. +type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>; + +//============================================================================= +// Helpers for instruction lowering. + +fn is_int_or_ref_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +fn is_bool_ty(ty: Type) -> bool { + match ty { + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +/// This is target-word-size dependent. And it excludes booleans and reftypes. +fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + _ => false, + } +} + +/// Returns whether the given specified `input` is a result produced by an instruction with Opcode +/// `op`. +// TODO investigate failures with checking against the result index. +fn matches_input<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + op: Opcode, +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } + None + }) +} + +/// Returns whether the given specified `input` is a result produced by an instruction with any of +/// the opcodes specified in `ops`. +fn matches_input_any<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + ops: &[Opcode], +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + for &op in ops { + if data.opcode() == op { + return Some(src_inst); + } + } + None + }) +} + +fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg { + ctx.use_input_reg(input); + input.reg +} + +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg { + let input = ctx.get_input(spec.insn, spec.input); + + if let Some(c) = input.constant { + // Generate constants fresh at each use to minimize long-range register pressure. 
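+ // The constant is masked to the width of the input type below, so e.g. for an
+ // I32 input only the low 32 bits of the 64-bit constant are materialized and
+ // any sign-extension bits in the upper half are discarded.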
+ let ty = ctx.input_ty(spec.insn, spec.input); + let from_bits = ty_bits(ty); + let masked = if from_bits < 64 { + c & ((1u64 << from_bits) - 1) + } else { + c + }; + + let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + ctx.emit(inst); + } + cst_copy.to_reg() + } else { + lowerinput_to_reg(ctx, input) + } +} + +/// An extension specification for `extend_input_to_reg`. +#[derive(Clone, Copy)] +enum ExtSpec { + ZeroExtendTo32, + ZeroExtendTo64, + SignExtendTo32, + SignExtendTo64, +} + +/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if +/// required. (This obviously causes side-effects.) +fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg { + let requested_size = match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32, + ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64, + }; + let input_size = ctx.input_ty(spec.insn, spec.input).bits(); + + let requested_ty = if requested_size == 32 { + types::I32 + } else { + types::I64 + }; + + let ext_mode = match (input_size, requested_size) { + (a, b) if a == b => return put_input_in_reg(ctx, spec), + (1, 8) => return put_input_in_reg(ctx, spec), + (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)), + }; + + let src = input_to_reg_mem(ctx, spec); + let dst = ctx.alloc_tmp(RegClass::I64, requested_ty); + match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)) + } + ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)) + } + } + dst.to_reg() +} + +fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem { + // TODO handle memory. + RegMem::reg(lowerinput_to_reg(ctx, input)) +} + +/// Put the given input into a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. +fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem { + let input = ctx.get_input(spec.insn, spec.input); + lowerinput_to_reg_mem(ctx, input) +} + +/// Returns whether the given input is an immediate that can be properly sign-extended, without any +/// possible side-effect. +fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> { + input.constant.and_then(|x| { + // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend + // to 64 bits. For other sizes, it doesn't matter and we can just use the plain + // constant. + if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) { + Some(x as u32) + } else { + None + } + }) +} + +fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + lowerinput_to_sext_imm(input, input_ty) +} + +fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> { + ctx.get_input(spec.insn, spec.input).constant +} + +/// Put the given input into an immediate, a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. 
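+/// An immediate is only used when `lowerinput_to_sext_imm` accepts the constant, i.e. when
+/// the operation is not 64-bit wide or the constant's low 32 bits sign-extend to the full
+/// 64-bit value.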
+fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + match lowerinput_to_sext_imm(input, input_ty) { + Some(x) => RegMemImm::imm(x), + None => match lowerinput_to_reg_mem(ctx, input) { + RegMem::Reg { reg } => RegMemImm::reg(reg), + RegMem::Mem { addr } => RegMemImm::mem(addr), + }, + } +} + +/// Emit an instruction to insert a value `src` into a lane of `dst`. +fn emit_insert_lane<C: LowerCtx<I = Inst>>( + ctx: &mut C, + src: RegMem, + dst: Writable<Reg>, + lane: u8, + ty: Type, +) { + if !ty.is_float() { + let (sse_op, is64) = match ty.lane_bits() { + 8 => (SseOpcode::Pinsrb, false), + 16 => (SseOpcode::Pinsrw, false), + 32 => (SseOpcode::Pinsrd, false), + 64 => (SseOpcode::Pinsrd, true), + _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), + }; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64)); + } else if ty == types::F32 { + let sse_op = SseOpcode::Insertps; + // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane + // shifted into bits 5:6). + let lane = 0b00_00_00_00 | lane << 4; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + } else if ty == types::F64 { + let sse_op = match lane { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + 0 => SseOpcode::Movsd, + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + 1 => SseOpcode::Movlhps, + _ => unreachable!(), + }; + // Here we use the `xmm_rm_r` encoding because it correctly tells the register + // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other + // encoding formats like `xmm_unary_rm_r` treat it as a `def`. + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + } else { + panic!("unable to emit insertlane for type: {}", ty) + } +} + +/// Emits an int comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_cmp(ctx: Ctx, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem_imm(ctx, inputs[1]); + + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs)); +} + +/// A specification for a fcmp emission. +enum FcmpSpec { + /// Normal flow. + Normal, + + /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that + /// happens with `InvertedEqualOrConditions`. + /// + /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or + /// sequence of instructions) that check for an "AND" combination of condition codes; see for + /// instance lowering of Select. + InvertEqual, +} + +/// This explains how to interpret the results of an fcmp instruction. +enum FcmpCondResult { + /// The given condition code must be set. + Condition(CC), + + /// Both condition codes must be set. + AndConditions(CC, CC), + + /// Either of the conditions codes must be set. + OrConditions(CC, CC), + + /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. 
Either + /// of the condition codes must be set, and the user must invert meaning of analyzing the + /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be + /// reached. + InvertedEqualOrConditions(CC, CC), +} + +/// Emits a float comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult { + let (flip_operands, inverted_equal) = match cond_code { + FloatCC::LessThan + | FloatCC::LessThanOrEqual + | FloatCC::UnorderedOrGreaterThan + | FloatCC::UnorderedOrGreaterThanOrEqual => { + cond_code = cond_code.reverse(); + (true, false) + } + FloatCC::Equal => { + let inverted_equal = match spec { + FcmpSpec::Normal => false, + FcmpSpec::InvertEqual => { + cond_code = FloatCC::NotEqual; // same as .inverse() + true + } + }; + (false, inverted_equal) + } + _ => (false, false), + }; + + // The only valid CC constructed with `from_floatcc` can be put in the flag + // register with a direct float comparison; do this here. + let op = match ctx.input_ty(insn, 0) { + types::F32 => SseOpcode::Ucomiss, + types::F64 => SseOpcode::Ucomisd, + _ => panic!("Bad input type to Fcmp"), + }; + + let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let (lhs_input, rhs_input) = if flip_operands { + (inputs[1], inputs[0]) + } else { + (inputs[0], inputs[1]) + }; + let lhs = put_input_in_reg(ctx, lhs_input); + let rhs = input_to_reg_mem(ctx, rhs_input); + ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs)); + + let cond_result = match cond_code { + FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z), + FloatCC::NotEqual if inverted_equal => { + FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ) + } + FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ), + _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)), + }; + + cond_result +} + +fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature { + let mut sig = Signature::new(call_conv); + for i in 0..ctx.num_inputs(insn) { + sig.params.push(AbiParam::new(ctx.input_ty(insn, i))); + } + for i in 0..ctx.num_outputs(insn) { + sig.returns.push(AbiParam::new(ctx.output_ty(insn, i))); + } + if call_conv.extends_baldrdash() { + // Adds the special VMContext parameter to the signature. + sig.params + .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext)); + } + sig +} + +fn emit_vm_call<C: LowerCtx<I = Inst>>( + ctx: &mut C, + flags: &Flags, + triple: &Triple, + libcall: LibCall, + insn: IRInst, + inputs: SmallVec<[InsnInput; 4]>, + outputs: SmallVec<[InsnOutput; 2]>, +) -> CodegenResult<()> { + let extname = ExternalName::LibCall(libcall); + + let dist = if flags.use_colocated_libcalls() { + RelocDistance::Near + } else { + RelocDistance::Far + }; + + // TODO avoid recreating signatures for every single Libcall function. 
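+ // The libcall signature is rebuilt from the CLIF instruction itself (one param per
+ // input, one return per output, plus a trailing I64 VMContext param under baldrdash),
+ // and the callsite below follows the usual sequence: stack pre-adjust, copy args,
+ // call, copy return values, stack post-adjust.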
+ let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple)); + let sig = make_libcall_sig(ctx, insn, call_conv, types::I64); + let caller_conv = ctx.abi().call_conv(); + + let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv)?; + + abi.emit_stack_pre_adjust(ctx); + + let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 }; + assert_eq!(inputs.len() + vm_context, abi.num_args()); + + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + if call_conv.extends_baldrdash() { + let vm_context_vreg = ctx + .get_vm_context() + .expect("should have a VMContext to pass to libcall funcs"); + abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg); + } + + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + + Ok(()) +} + +/// Returns whether the given input is a shift by a constant value less or equal than 3. +/// The goal is to embed it within an address mode. +fn matches_small_constant_shift<C: LowerCtx<I = Inst>>( + ctx: &mut C, + spec: InsnInput, +) -> Option<(InsnInput, u8)> { + matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| { + match input_to_imm( + ctx, + InsnInput { + insn: shift, + input: 1, + }, + ) { + Some(shift_amt) if shift_amt <= 3 => Some(( + InsnInput { + insn: shift, + input: 0, + }, + shift_amt as u8, + )), + _ => None, + } + }) +} + +/// Lowers an instruction to one of the x86 addressing modes. +/// +/// Note: the 32-bit offset in Cranelift has to be sign-extended, which maps x86's behavior. +fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode { + let flags = ctx + .memflags(spec.insn) + .expect("Instruction with amode should have memflags"); + + // We now either have an add that we must materialize, or some other input; as well as the + // final offset. + if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) { + debug_assert_eq!(ctx.output_ty(add, 0), types::I64); + let add_inputs = &[ + InsnInput { + insn: add, + input: 0, + }, + InsnInput { + insn: add, + input: 1, + }, + ]; + + // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations + // aren't happening in the wasm case. We could do better, given some range analysis. + let (base, index, shift) = if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[0]) + { + ( + put_input_in_reg(ctx, add_inputs[1]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[1]) + { + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else { + for i in 0..=1 { + let input = ctx.get_input(add, i); + + // Try to pierce through uextend. + if let Some(uextend) = matches_input( + ctx, + InsnInput { + insn: add, + input: i, + }, + Opcode::Uextend, + ) { + if let Some(cst) = ctx.get_input(uextend, 0).constant { + // Zero the upper bits. 
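+ // Illustration: for a 32-bit uextend source, `shift` is 32, so (cst << 32) >> 32
+ // keeps only the low 32 bits; a constant of 0xffff_ffff thus contributes
+ // 0x0000_0000_ffff_ffff to the final offset rather than a sign-extended -1.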
+ let input_size = ctx.input_ty(uextend, 0).bits() as u64; + let shift: u64 = 64 - input_size; + let uext_cst: u64 = (cst << shift) >> shift; + + let final_offset = (offset as i64).wrapping_add(uext_cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + // If it's a constant, add it directly! + if let Some(cst) = input.constant { + let final_offset = (offset as i64).wrapping_add(cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, add_inputs[1]), + 0, + ) + }; + + return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags); + } + + let input = put_input_in_reg(ctx, spec); + Amode::imm_reg(offset as u32, input).with_flags(flags) +} + +//============================================================================= +// Top-level instruction lowering entry point, for one instruction. + +/// Actually codegen an instruction's results into registers. +fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, + flags: &Flags, + triple: &Triple, +) -> CodegenResult<()> { + let op = ctx.data(insn).opcode(); + + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = ctx + .get_constant(insn) + .expect("constant value for iconst et al"); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::SaddSat + | Opcode::UaddSat + | Opcode::Isub + | Opcode::SsubSat + | Opcode::UsubSat + | Opcode::Imul + | Opcode::AvgRound + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + let ty = ty.unwrap(); + if ty.lane_count() > 1 { + let sse_op = match op { + Opcode::Iadd => match ty { + types::I8X16 => SseOpcode::Paddb, + types::I16X8 => SseOpcode::Paddw, + types::I32X4 => SseOpcode::Paddd, + types::I64X2 => SseOpcode::Paddq, + _ => panic!("Unsupported type for packed iadd instruction: {}", ty), + }, + Opcode::SaddSat => match ty { + types::I8X16 => SseOpcode::Paddsb, + types::I16X8 => SseOpcode::Paddsw, + _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty), + }, + Opcode::UaddSat => match ty { + types::I8X16 => SseOpcode::Paddusb, + types::I16X8 => SseOpcode::Paddusw, + _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty), + }, + Opcode::Isub => match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for packed isub instruction: {}", ty), + }, + Opcode::SsubSat => match ty { + types::I8X16 => SseOpcode::Psubsb, + types::I16X8 => SseOpcode::Psubsw, + _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty), + }, + Opcode::UsubSat => match ty { + types::I8X16 => SseOpcode::Psubusb, + 
types::I16X8 => SseOpcode::Psubusw, + _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty), + }, + Opcode::Imul => match ty { + types::I16X8 => SseOpcode::Pmullw, + types::I32X4 => SseOpcode::Pmulld, + types::I64X2 => { + // Note for I64X2 we describe a lane A as being composed of a + // 32-bit upper half "Ah" and a 32-bit lower half "Al". + // The 32-bit long hand multiplication can then be written as: + // Ah Al + // * Bh Bl + // ----- + // Al * Bl + // + (Ah * Bl) << 32 + // + (Al * Bh) << 32 + // + // So for each lane we will compute: + // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 + // + // Note, the algorithm will use pmuldq which operates directly on + // the lower 32-bit (Al or Bl) of a lane and writes the result + // to the full 64-bits of the lane of the destination. For this + // reason we don't need shifts to isolate the lower 32-bits, however + // we will need to use shifts to isolate the high 32-bits when doing + // calculations, i.e. Ah == A >> 32 + // + // The full sequence then is as follows: + // A' = A + // A' = A' >> 32 + // A' = Ah' * Bl + // B' = B + // B' = B' >> 32 + // B' = Bh' * Al + // B' = B' + A' + // B' = B' << 32 + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + + // Get inputs rhs=A and lhs=B and the dst register + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // A' = A + let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + + // A' = A' >> 32 + // A' = Ah' * Bl + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + + // B' = B + let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); + + // B' = B' >> 32 + // B' = Bh' * Al + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + lhs_1, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + + // B' = B' + A' + // B' = B' << 32 + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(rhs_1.to_reg()), + lhs_1, + )); + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psllq, + RegMemImm::imm(32), + lhs_1, + )); + + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(lhs_1.to_reg()), + rhs_1, + )); + ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + return Ok(()); + } + _ => panic!("Unsupported type for packed imul instruction: {}", ty), + }, + Opcode::AvgRound => match ty { + types::I8X16 => SseOpcode::Pavgb, + types::I16X8 => SseOpcode::Pavgw, + _ => panic!("Unsupported type for packed avg_round instruction: {}", ty), + }, + Opcode::Band => match ty { + types::F32X4 => SseOpcode::Andps, + types::F64X2 => SseOpcode::Andpd, + _ => SseOpcode::Pand, + }, + Opcode::Bor => match ty { + types::F32X4 => SseOpcode::Orps, + types::F64X2 => SseOpcode::Orpd, + _ => SseOpcode::Por, + }, + Opcode::Bxor => match ty { + types::F32X4 => SseOpcode::Xorps, + types::F64X2 => SseOpcode::Xorpd, + _ => SseOpcode::Pxor, + }, + _ => panic!("Unsupported packed instruction: {}", op), + }; + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move the `lhs` to 
the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + let is_64 = ty == types::I64; + let alu_op = match op { + Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add, + Opcode::Isub => AluRmiROpcode::Sub, + Opcode::Imul => AluRmiROpcode::Mul, + Opcode::Band => AluRmiROpcode::And, + Opcode::Bor => AluRmiROpcode::Or, + Opcode::Bxor => AluRmiROpcode::Xor, + _ => unreachable!(), + }; + + let (lhs, rhs) = match op { + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::Imul + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + // For commutative operations, try to commute operands if one is an + // immediate. + if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) { + (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm)) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ) + } + } + Opcode::Isub => ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ), + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst)); + } + } + + Opcode::BandNot => { + let ty = ty.unwrap(); + debug_assert!(ty.is_vector() && ty.bytes() == 16); + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let sse_op = match ty { + types::F32X4 => SseOpcode::Andnps, + types::F64X2 => SseOpcode::Andnpd, + _ => SseOpcode::Pandn, + }; + // Note the flipping of operands: the `rhs` operand is used as the destination instead + // of the `lhs` as in the other bit operations above (e.g. `band`). + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst)); + } + + Opcode::Iabs => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let opcode = match ty { + types::I8X16 => SseOpcode::Pabsb, + types::I16X8 => SseOpcode::Pabsw, + types::I32X4 => SseOpcode::Pabsd, + _ => panic!("Unsupported type for packed iabs instruction: {}", ty), + }; + ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst)); + } else { + unimplemented!("iabs is unimplemented for non-vector type: {}", ty); + } + } + + Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let sse_op = match op { + Opcode::Imax => match ty { + types::I8X16 => SseOpcode::Pmaxsb, + types::I16X8 => SseOpcode::Pmaxsw, + types::I32X4 => SseOpcode::Pmaxsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umax => match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Imin => match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umin => match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + _ => unreachable!("This is a bug: the external and internal `match op` should be over 
the same opcodes."), + }; + + // Move the `lhs` to the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + panic!("Unsupported type for {} instruction: {}", op, ty); + } + } + + Opcode::Bnot => { + let ty = ty.unwrap(); + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + + if ty.is_vector() { + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); + ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty.is_bool() { + unimplemented!("bool bnot") + } else { + ctx.emit(Inst::not(size, dst)); + } + } + + Opcode::Bitselect => { + let ty = ty.unwrap(); + let condition = put_input_in_reg(ctx, inputs[0]); + let if_true = put_input_in_reg(ctx, inputs[1]); + let if_false = input_to_reg_mem(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + if ty.is_vector() { + let tmp1 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp1, if_true, ty)); + ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1)); + + let tmp2 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp2, condition, ty)); + ctx.emit(Inst::and_not(ty, if_false, tmp2)); + + ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty)); + ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst)); + } else { + unimplemented!("scalar bitselect") + } + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => { + let dst_ty = ctx.output_ty(insn, 0); + debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); + + let (size, lhs) = match dst_ty { + types::I8 | types::I16 => match op { + Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])), + Opcode::Ushr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), + ), + Opcode::Sshr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32), + ), + Opcode::Rotl | Opcode::Rotr => { + (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) + } + _ => unreachable!(), + }, + types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])), + _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), + }; + + let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { + // Mask count, according to Cranelift's semantics. + let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); + (Some(cst), None) + } else { + (None, Some(put_input_in_reg(ctx, inputs[1]))) + }; + + let dst = get_output_reg(ctx, outputs[0]); + + let shift_kind = match op { + Opcode::Ishl => ShiftKind::ShiftLeft, + Opcode::Ushr => ShiftKind::ShiftRightLogical, + Opcode::Sshr => ShiftKind::ShiftRightArithmetic, + Opcode::Rotl => ShiftKind::RotateLeft, + Opcode::Rotr => ShiftKind::RotateRight, + _ => unreachable!(), + }; + + let w_rcx = Writable::from_reg(regs::rcx()); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + if count.is_none() { + ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); + } + ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } + + Opcode::Ineg => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + if ty.is_vector() { + // Zero's out a register and then does a packed subtraction + // of the input from the register. 
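+ // That is, each lane becomes 0 - x; for an i32x4 input [1, -2, 3, 0] the
+ // result is [-1, 2, -3, 0].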
+ + let src = input_to_reg_mem(ctx, inputs[0]); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + + let subtract_opcode = match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for Ineg instruction, found {}", ty), + }; + + // Note we must zero out a tmp instead of using the destination register since + // the desitnation could be an alias for the source input register + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp)); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else { + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::neg(size, dst)); + } + } + + Opcode::Clz => { + // TODO when the x86 flags have use_lzcnt, we can use LZCNT. + + // General formula using bit-scan reverse (BSR): + // mov -1, %dst + // bsr %src, %tmp + // cmovz %dst, %tmp + // mov $(size_bits - 1), %dst + // sub %tmp, %dst + + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Ctz => { + // TODO when the x86 flags have use_bmi1, we can use TZCNT. + + // General formula using bit-scan forward (BSF): + // bsf %src, %dst + // mov $(size_bits), %tmp + // cmovz %tmp, %dst + let ty = ctx.input_ty(insn, 0); + let ty = if ty.bits() < 32 { types::I32 } else { ty }; + debug_assert!(ty == types::I32 || ty == types::I64); + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Popcnt => { + // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction. 
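+ // The fallback sequence below is the classic SWAR popcount:
+ // n = x - (x>>1 & 0x7..7) - (x>>2 & 0x7..7) - (x>>3 & 0x7..7)
+ // leaves the bit count of each nibble in that nibble (e.g. for 0xF: 15 - 7 - 3 - 1 = 4),
+ // (n + (n>>4)) & 0x0F..0F folds adjacent nibbles into bytes, and the multiply by
+ // 0x01..01 sums all bytes into the most significant byte, which the final shift
+ // (56 for i64, 24 for i32) extracts.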
+ + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + if ty == types::I64 { + let is_64 = true; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let cst = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 
4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + Opcode::IsNull | Opcode::IsInvalid => { + // Null references are represented by the constant value 0; invalid references are + // represented by the constant value -1. See `define_reftypes()` in + // `meta/src/isa/x86/encodings.rs` to confirm. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + let imm = match op { + Opcode::IsNull => { + // TODO could use tst src, src for IsNull + 0 + } + Opcode::IsInvalid => { + // We can do a 32-bit comparison even in 64-bits mode, as the constant is then + // sign-extended. + 0xffffffff + } + _ => unreachable!(), + }; + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src)); + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::Uextend + | Opcode::Sextend + | Opcode::Bint + | Opcode::Breduce + | Opcode::Bextend + | Opcode::Ireduce => { + let src_ty = ctx.input_ty(insn, 0); + let dst_ty = ctx.output_ty(insn, 0); + + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
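+ // Example: for `uextend.i64(iadd.i32 x, y)`, the 32-bit add that produced the
+ // input has already zeroed bits 63:32 of its register, so the plain `gen_move`
+ // below suffices and no movzx is emitted.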
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } + } + + Opcode::Icmp => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + emit_cmp(ctx, insn); + let cc = CC::from_intcc(condcode); + ctx.emit(Inst::setcc(cc, dst)); + } else { + assert_eq!(ty.bits(), 128); + let eq = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpeqb, + types::I16X8 => SseOpcode::Pcmpeqw, + types::I32X4 => SseOpcode::Pcmpeqd, + types::I64X2 => SseOpcode::Pcmpeqq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let gt = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpgtb, + types::I16X8 => SseOpcode::Pcmpgtw, + types::I32X4 => SseOpcode::Pcmpgtd, + types::I64X2 => SseOpcode::Pcmpgtq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let maxu = |ty| match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let mins = |ty| match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let minu = |ty| match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + + // Here we decide which operand to use as the read/write `dst` (ModRM reg field) + // and which to use as the read `input` (ModRM r/m field). In the normal case we + // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for + // the less-than cases so that we can reuse the greater-than implementation. + let input = match condcode { + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + lhs + } + _ => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, lhs, ty)); + rhs + } + }; + + match condcode { + IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)), + IntCC::NotEqual => { + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. 
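+ // Comparing `tmp` with itself via PCMPEQ* sets every lane to all ones
+ // regardless of its (uninitialized) contents; that all-ones value is then the
+ // XOR mask used to invert the equality result into "not equal".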
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::SignedGreaterThan | IntCC::SignedLessThan => { + ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst)) + } + IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => { + ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode), + } + } + } + + Opcode::Fcmp => { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + // Unordered is returned by setting ZF, PF, CF <- 111 + // Greater than by ZF, PF, CF <- 000 + // Less than by ZF, PF, CF <- 001 + // Equal by ZF, PF, CF <- 100 + // + // Checking the result of comiss is somewhat annoying because you don't have setcc + // instructions that explicitly check simultaneously for the condition (i.e. eq, le, + // gt, etc) *and* orderedness. + // + // So that might mean we need more than one setcc check and then a logical "and" or + // "or" to determine both, in some cases. However knowing that if the parity bit is + // set, then the result was considered unordered and knowing that if the parity bit is + // set, then both the ZF and CF flag bits must also be set we can get away with using + // one setcc for most condition codes. + + let dst = get_output_reg(ctx, outputs[0]); + + match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::setcc(cc, dst)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else { + let op = match input_ty { + types::F32X4 => SseOpcode::Cmpps, + types::F64X2 => SseOpcode::Cmppd, + _ => panic!("Bad input type to fcmp: {}", input_ty), + }; + + // Since some packed comparisons are not available, some of the condition codes + // must be inverted, with a corresponding `flip` of the operands. 
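+ // CMPPS/CMPPD only encode the "less than" family of predicates directly, so
+ // e.g. `a > b` is emitted as `b < a`: the LessThan immediate with the operands
+ // flipped, as marked by the `true` entries below.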
+ let (imm, flip) = match cond_code { + FloatCC::GreaterThan => (FcmpImm::LessThan, true), + FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true), + FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true), + FloatCC::UnorderedOrLessThanOrEqual => { + (FcmpImm::UnorderedOrGreaterThanOrEqual, true) + } + FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => { + panic!("unsupported float condition code: {}", cond_code) + } + _ => (FcmpImm::from(cond_code), false), + }; + + // Determine the operands of the comparison, possibly by flipping them. + let (lhs, rhs) = if flip { + ( + put_input_in_reg(ctx, inputs[1]), + input_to_reg_mem(ctx, inputs[0]), + ) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem(ctx, inputs[1]), + ) + }; + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, lhs, input_ty)); + + // Emit the comparison. + ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false)); + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for i in 0..ctx.num_inputs(insn) { + let src_reg = put_input_in_reg(ctx, inputs[i]); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, src_reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Call | Opcode::CallIndirect => { + let caller_conv = ctx.abi().call_conv(); + let (mut abi, inputs) = match op { + Opcode::Call => { + let (extname, dist) = ctx.call_target(insn).unwrap(); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len(), sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_func(sig, &extname, dist, caller_conv)?, + &inputs[..], + ) + } + + Opcode::CallIndirect => { + let ptr = put_input_in_reg(ctx, inputs[0]); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len() - 1, sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_ptr(sig, ptr, op, caller_conv)?, + &inputs[1..], + ) + } + + _ => unreachable!(), + }; + + abi.emit_stack_pre_adjust(ctx); + assert_eq!(inputs.len(), abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Hlt); + } + + Opcode::Trap | Opcode::ResumableTrap => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + ctx.emit_safepoint(Inst::Ud2 { trap_code }); + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + + if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() { + let cond_code = ctx.data(insn).cond_code().unwrap(); + // The flags must not have been clobbered by any other instruction between the + // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can + // simply use the flags here. 
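+ // E.g. a trapif checking an unsigned condition on the iadd_ifcout's flags maps
+ // through CC::from_intcc (IntCC::UnsignedLessThan becomes CC::B) and so tests
+ // the carry flag the add itself left in EFLAGS, with no extra compare.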
+ let cc = CC::from_intcc(cond_code); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else if op == Opcode::Trapif { + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cc = CC::from_intcc(cond_code); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + emit_cmp(ctx, ifcmp); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + + // Verification ensures that the input is always a single-def ffcmp. + let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + + match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }) + } + FcmpCondResult::AndConditions(cc1, cc2) => { + // A bit unfortunate, but materialize the flags in their own register, and + // check against this. + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, /* is_64 */ + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + tmp2, + )); + ctx.emit_safepoint(Inst::TrapIf { + trap_code, + cc: CC::NZ, + }); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 }); + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 }); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + }; + }; + } + + Opcode::F64const => { + // TODO use cmpeqpd for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::F32const => { + // TODO use cmpeqps for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + + // Note: min and max can't be handled here, because of the way Cranelift defines them: + // if any operand is a NaN, they must return the NaN operand, while the x86 machine + // instruction will return the second operand if either operand is a NaN. 
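+ // For example, MINSS with a NaN in its destination operand and 1.0 in its
+ // source returns 1.0 (the second operand), whereas Cranelift's `fmin` must
+ // return the NaN; hence the separate `Fmin`/`Fmax` lowering below.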
+ let sse_op = match ty {
+ types::F32 => match op {
+ Opcode::Fadd => SseOpcode::Addss,
+ Opcode::Fsub => SseOpcode::Subss,
+ Opcode::Fmul => SseOpcode::Mulss,
+ Opcode::Fdiv => SseOpcode::Divss,
+ _ => unreachable!(),
+ },
+ types::F64 => match op {
+ Opcode::Fadd => SseOpcode::Addsd,
+ Opcode::Fsub => SseOpcode::Subsd,
+ Opcode::Fmul => SseOpcode::Mulsd,
+ Opcode::Fdiv => SseOpcode::Divsd,
+ _ => unreachable!(),
+ },
+ types::F32X4 => match op {
+ Opcode::Fadd => SseOpcode::Addps,
+ Opcode::Fsub => SseOpcode::Subps,
+ Opcode::Fmul => SseOpcode::Mulps,
+ Opcode::Fdiv => SseOpcode::Divps,
+ _ => unreachable!(),
+ },
+ types::F64X2 => match op {
+ Opcode::Fadd => SseOpcode::Addpd,
+ Opcode::Fsub => SseOpcode::Subpd,
+ Opcode::Fmul => SseOpcode::Mulpd,
+ Opcode::Fdiv => SseOpcode::Divpd,
+ _ => unreachable!(),
+ },
+ _ => panic!(
+ "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
+ ty
+ ),
+ };
+ ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+ }
+
+ Opcode::Fmin | Opcode::Fmax => {
+ let lhs = put_input_in_reg(ctx, inputs[0]);
+ let rhs = put_input_in_reg(ctx, inputs[1]);
+ let dst = get_output_reg(ctx, outputs[0]);
+ let is_min = op == Opcode::Fmin;
+ let output_ty = ty.unwrap();
+ ctx.emit(Inst::gen_move(dst, rhs, output_ty));
+ if !output_ty.is_vector() {
+ let op_size = match output_ty {
+ types::F32 => OperandSize::Size32,
+ types::F64 => OperandSize::Size64,
+ _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
+ };
+ ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
+ } else {
+ // X64's implementation of floating point min and floating point max does not
+ // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
+ // scalar approach we use jumps to handle cases where NaN and +0 propagation is
+ // not consistent with what is needed. However, for packed floating point min and
+ // floating point max we implement a different approach to avoid the sequence
+ // of jumps that would be required on a per-lane basis. Because we do not need to
+ // lower labels and jumps but do need ctx for creating temporaries, we implement
+ // the lowering here in lower.rs instead of emit.rs, as is done for scalars.
+ // The outline of the approach is as follows:
+ //
+ // First we perform the Min/Max in both directions. This is because in the
+ // case of an operand's lane containing a NaN, or in the case of the lanes of the
+ // two operands containing 0 but with mismatched signs, x64 will return the second
+ // operand regardless of its contents. So in order to make sure we capture NaNs and
+ // normalize NaNs and 0 values, we perform the operation in both directions and merge
+ // the results. Then we normalize the results through operations that create a mask
+ // for the lanes containing NaNs; we use that mask to adjust NaNs to quiet NaNs and
+ // normalize 0s.
+ //
+ // The following sequence is generated for min:
+ //
+ // movap{s,d} %lhs, %tmp
+ // minp{s,d} %dst, %tmp
+ // minp{s,d} %lhs, %dst
+ // orp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+ // orp{s,d} %dst, %tmp
+ // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+ //
+ // and for max the sequence is:
+ //
+ // movap{s,d} %lhs, %tmp
+ // maxp{s,d} %dst, %tmp
+ // maxp{s,d} %lhs, %dst
+ // xorp{s,d} %tmp, %dst
+ // orp{s,d} %dst, %tmp
+ // subp{s,d} %dst, %tmp
+ // cmpp{s,d} %tmp, %dst, $3
+ // psrl{d,q} {$10, $13}, %dst
+ // andnp{s,d} %tmp, %dst
+
+ if is_min {
+ let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
+ match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Minps,
+ SseOpcode::Orps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Minpd,
+ SseOpcode::Orpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp.
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform min in reverse direction.
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
+
+ // Perform min in original direction.
+ ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
+
+ // X64 handles propagation of -0's and NaNs differently between left and right
+ // operands. After doing the min in both directions, this OR will
+ // guarantee capture of -0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
+
+ // Compare unordered to create a mask for lanes containing NaNs, and then use
+ // that mask to saturate the NaN-containing lanes in the tmp register with 1s.
+ // TODO: Would a check for NaN and then a jump be better here in the
+ // common case than continuing on to normalize NaNs that might not exist?
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // The dst register holds a mask for lanes containing NaNs.
+ // We take that mask and shift it in preparation for creating a different mask
+ // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+ // number of least significant bits. We shift right each lane by 10 bits
+ // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+ // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do an ANDN with the tmp register to produce the final results
+ // in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+ } else {
+ let (
+ mov_op,
+ max_op,
+ xor_op,
+ or_op,
+ sub_op,
+ cmp_op,
+ shift_op,
+ shift_by,
+ andn_op,
+ ) = match output_ty {
+ types::F32X4 => (
+ SseOpcode::Movaps,
+ SseOpcode::Maxps,
+ SseOpcode::Xorps,
+ SseOpcode::Orps,
+ SseOpcode::Subps,
+ SseOpcode::Cmpps,
+ SseOpcode::Psrld,
+ 10,
+ SseOpcode::Andnps,
+ ),
+ types::F64X2 => (
+ SseOpcode::Movapd,
+ SseOpcode::Maxpd,
+ SseOpcode::Xorpd,
+ SseOpcode::Orpd,
+ SseOpcode::Subpd,
+ SseOpcode::Cmppd,
+ SseOpcode::Psrlq,
+ 13,
+ SseOpcode::Andnpd,
+ ),
+ _ => unimplemented!("unsupported op type {:?}", output_ty),
+ };
+
+ // Copy lhs into tmp.
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform max in reverse direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Perform max in original direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+
+ // Get the difference between the two results and store it in dst.
+ // Max uses a different approach than min to account for potential
+ // discrepancies with plus/minus 0.
+ ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+
+ // X64 handles propagation of -0's and NaNs differently between left and right
+ // operands. After doing the max in both directions, this OR will
+ // guarantee capture of 0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Capture NaNs and sign discrepancies.
+ ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Compare unordered to create a mask for lanes containing NaNs, and then use
+ // that mask to saturate the NaN-containing lanes in the tmp register with 1s.
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+
+ // The dst register holds a mask for lanes containing NaNs.
+ // We take that mask and shift it in preparation for creating a different mask
+ // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+ // number of least significant bits. We shift right each lane by 10 bits
+ // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+ // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do an ANDN with the tmp register to produce the final results
+ // in the dst.
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + } + } + } + + Opcode::FminPseudo | Opcode::FmaxPseudo => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + let sse_opcode = match (ty, op) { + (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps, + (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps, + (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd, + (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd, + _ => unimplemented!("unsupported type {} for {}", ty, op), + }; + ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst)); + } + + Opcode::Sqrt => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let sse_op = match ty { + types::F32 => SseOpcode::Sqrtss, + types::F64 => SseOpcode::Sqrtsd, + types::F32X4 => SseOpcode::Sqrtps, + types::F64X2 => SseOpcode::Sqrtpd, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), + }; + + ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst)); + } + + Opcode::Fpromote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); + } + + Opcode::Fdemote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); + } + + Opcode::FcvtFromSint => { + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (ext_spec, src_size) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32), + types::I32 => (None, OperandSize::Size32), + types::I64 => (None, OperandSize::Size64), + _ => unreachable!(), + }; + + let src = match ext_spec { + Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), + None => input_to_reg_mem(ctx, inputs[0]), + }; + + let opcode = if output_ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(output_ty, types::F64); + SseOpcode::Cvtsi2sd + }; + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst)); + } else { + let ty = ty.unwrap(); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let opcode = match ctx.input_ty(insn, 0) { + types::I32X4 => SseOpcode::Cvtdq2ps, + _ => { + unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op) + } + }; + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst)); + } + } + + Opcode::FcvtFromUint => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let input_ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + match input_ty { + types::I8 | types::I16 | types::I32 => { + // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend + + // do a signed conversion (which won't overflow). 
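+ // For example, a u32 input of 0xffff_ffff zero-extends to
+ // 0x0000_0000_ffff_ffff, which the 64-bit signed conversion below reads as
+ // 4294967295, giving the expected positive result.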
+ let opcode = if ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(ty, types::F64); + SseOpcode::Cvtsi2sd + }; + + let src = RegMem::reg(extend_input_to_reg( + ctx, + inputs[0], + ExtSpec::ZeroExtendTo64, + )); + ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst)); + } + + types::I64 => { + let src = put_input_in_reg(ctx, inputs[0]); + + let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(src_copy, src, types::I64)); + + let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::cvt_u64_to_float_seq( + ty == types::F64, + src_copy, + tmp_gpr1, + tmp_gpr2, + dst, + )); + } + _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty), + }; + } else { + // Converting packed unsigned integers to packed floats requires a few steps. + // There is no single instruction lowering for converting unsigned floats but there + // is for converting packed signed integers to float (cvtdq2ps). In the steps below + // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and + // then we convert each half separately using cvtdq2ps meant for signed integers. + // In order for this to work for the upper half bits we must shift right by 1 + // (divide by 2) these bits in order to ensure the most significant bit is 0 not + // signed, and then after the conversion we double the value. Finally we add the + // converted values where addition will correctly round. + // + // Sequence: + // -> A = 0xffffffff + // -> Ah = 0xffff0000 + // -> Al = 0x0000ffff + // -> Convert(Al) // Convert int to float + // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed + // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift + // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. + // -> dst = Ah + Al // Add the two floats together + + assert_eq!(ctx.input_ty(insn, 0), types::I32X4); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // Create a temporary register + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Get the low 16 bits + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp)); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp)); + + // Get the high 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst)); + + // Convert the low 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp)); + + // Shift the high bits by 1, convert, and double to get the correct value. + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Add together the two converted values. 
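+ // For example, a lane holding u32::MAX (0xffff_ffff) is split into
+ // 65535 and 0xffff_0000; the high half is converted as 0x7fff_8000
+ // (2147450880.0), doubled to 4294901760.0, and adding 65535.0 rounds to
+ // 4294967296.0, the nearest f32 to 4294967295.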
+ ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + } + + Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + let src_size = if input_ty == types::F32 { + OperandSize::Size32 + } else { + assert_eq!(input_ty, types::F64); + OperandSize::Size64 + }; + + let output_ty = ty.unwrap(); + let dst_size = if output_ty == types::I32 { + OperandSize::Size32 + } else { + assert_eq!(output_ty, types::I64); + OperandSize::Size64 + }; + + let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; + let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; + + let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty); + ctx.emit(Inst::gen_move(src_copy, src, input_ty)); + + let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); + let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); + + if to_signed { + ctx.emit(Inst::cvt_float_to_sint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } else { + ctx.emit(Inst::cvt_float_to_uint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } + } else { + if op == Opcode::FcvtToSintSat { + // Sets destination to zero if float is NaN + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, input_ty)); + let cond = FcmpImm::from(FloatCC::Equal); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Andps, + RegMem::reg(tmp.to_reg()), + dst, + )); + + // Sets top bit of tmp if float is positive + // Setting up to set top bit on negative float values + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(dst.to_reg()), + tmp, + )); + + // Convert the packed float to packed doubleword. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Cvttps2dq, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Set top bit only if < 0 + // Saturate lane with sign (top) bit. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pand, + RegMem::reg(dst.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp)); + + // On overflow 0x80000000 is returned to a lane. + // Below sets positive overflow lanes to 0x7FFFFFFF + // Keeps negative overflow lanes as is. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else if op == Opcode::FcvtToUintSat { + unimplemented!("f32x4.convert_i32x4_u"); + } else { + // Since this branch is also guarded by a check for vector types + // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here + // due to vector varients not existing. The first two branches will + // cover all reachable cases. 
+ unreachable!(); + } + } + } + + Opcode::Bitcast => { + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + match (input_ty, output_ty) { + (types::F32, types::I32) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movd, + src, + dst, + OperandSize::Size32, + )); + } + (types::I32, types::F32) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movd, + src, + OperandSize::Size32, + dst, + )); + } + (types::F64, types::I64) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movq, + src, + dst, + OperandSize::Size64, + )); + } + (types::I64, types::F64) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movq, + src, + OperandSize::Size64, + dst, + )); + } + _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty), + } + } + + Opcode::Fabs | Opcode::Fneg => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // In both cases, generate a constant and apply a single binary instruction: + // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the + // src with it. + // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the + // src with it. + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (val, opcode) = match output_ty { + types::F32 => match op { + Opcode::Fabs => (0x7fffffff, SseOpcode::Andps), + Opcode::Fneg => (0x80000000, SseOpcode::Xorps), + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd), + Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd), + _ => unreachable!(), + }, + _ => panic!("unexpected type {:?} for Fabs", output_ty), + }; + + for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + + ctx.emit(Inst::xmm_rm_r(opcode, src, dst)); + } else { + // Eventually vector constants should be available in `gen_constant` and this block + // can be merged with the one above (TODO). + if output_ty.bits() == 128 { + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, output_ty)); + + // Generate an all 1s constant in an XMM register. This uses CMPPS but could + // have used CMPPD with the same effect. + let tmp = ctx.alloc_tmp(RegClass::V128, output_ty); + let cond = FcmpImm::from(FloatCC::Equal); + let cmpps = Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + ); + ctx.emit(cmpps); + + // Shift the all 1s constant to generate the mask. 
+ let lane_bits = output_ty.lane_bits(); + let (shift_opcode, opcode, shift_by) = match (op, lane_bits) { + (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1), + (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1), + (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31), + (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63), + _ => unreachable!( + "unexpected opcode and lane size: {:?}, {} bits", + op, lane_bits + ), + }; + let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp); + ctx.emit(shift); + + // Apply shifted mask (XOR or AND). + let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst); + ctx.emit(mask); + } else { + panic!("unexpected type {:?} for Fabs", output_ty); + } + } + } + + Opcode::Fcopysign => { + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + + let ty = ty.unwrap(); + + // We're going to generate the following sequence: + // + // movabs $INT_MIN, tmp_gpr1 + // mov{d,q} tmp_gpr1, tmp_xmm1 + // movap{s,d} tmp_xmm1, dst + // andnp{s,d} src_1, dst + // movap{s,d} src_2, tmp_xmm2 + // andp{s,d} tmp_xmm1, tmp_xmm2 + // orp{s,d} tmp_xmm2, dst + + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32); + let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32); + + let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty { + types::F32 => ( + 0x8000_0000, + SseOpcode::Movaps, + SseOpcode::Andnps, + SseOpcode::Andps, + SseOpcode::Orps, + ), + types::F64 => ( + 0x8000_0000_0000_0000, + SseOpcode::Movapd, + SseOpcode::Andnpd, + SseOpcode::Andpd, + SseOpcode::Orpd, + ), + _ => { + panic!("unexpected type {:?} for copysign", ty); + } + }; + + for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2)); + ctx.emit(Inst::xmm_rm_r( + and_op, + RegMem::reg(tmp_xmm1.to_reg()), + tmp_xmm2, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst)); + } + + Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => { + // TODO use ROUNDSS/ROUNDSD after sse4.1. + + // Lower to VM calls when there's no access to SSE4.1. 
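+ // (With SSE4.1, ROUNDSS/ROUNDSD could do this in a single instruction, with
+ // rounding-control immediate bits 00 = nearest, 01 = floor, 10 = ceil,
+ // 11 = trunc.)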
+ let ty = ty.unwrap(); + let libcall = match (ty, op) { + (types::F32, Opcode::Ceil) => LibCall::CeilF32, + (types::F64, Opcode::Ceil) => LibCall::CeilF64, + (types::F32, Opcode::Floor) => LibCall::FloorF32, + (types::F64, Opcode::Floor) => LibCall::FloorF64, + (types::F32, Opcode::Nearest) => LibCall::NearestF32, + (types::F64, Opcode::Nearest) => LibCall::NearestF64, + (types::F32, Opcode::Trunc) => LibCall::TruncF32, + (types::F64, Opcode::Trunc) => LibCall::TruncF64, + _ => panic!( + "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc", + ty, op + ), + }; + + emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?; + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + types::I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => types::I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => types::I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + _ => unimplemented!(), + }; + + let ext_mode = ExtMode::new(elem_ty.bits(), 64); + + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + + let amode = match op { + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 => { + assert_eq!(inputs.len(), 1, "only one input for load operands"); + lower_to_amode(ctx, inputs[0], offset) + } + + Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + assert_eq!( + inputs.len(), + 2, + "can't handle more than two inputs in complex load" + ); + let base = put_input_in_reg(ctx, inputs[0]); + let index = put_input_in_reg(ctx, inputs[1]); + let shift = 0; + let flags = ctx.memflags(insn).expect("load should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. + ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); + } + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. 
+ ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!("unexpected type for load: {:?}", elem_ty), + }); + } + } + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => types::I8, + Opcode::Istore16 | Opcode::Istore16Complex => types::I16, + Opcode::Istore32 | Opcode::Istore32Complex => types::I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + + let addr = match op { + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => { + assert_eq!(inputs.len(), 2, "only one input for store memory operands"); + lower_to_amode(ctx, inputs[1], offset) + } + + Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + assert_eq!( + inputs.len(), + 3, + "can't handle more than two inputs in complex store" + ); + let base = put_input_in_reg(ctx, inputs[1]); + let index = put_input_in_reg(ctx, inputs[2]); + let shift = 0; + let flags = ctx.memflags(insn).expect("store should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let src = put_input_in_reg(ctx, inputs[0]); + + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. + Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } + + Opcode::AtomicRmw => { + // This is a simple, general-case atomic update, based on a loop involving + // `cmpxchg`. Note that we could do much better than this in the case where the old + // value at the location (that is to say, the SSA `Value` computed by this CLIF + // instruction) is not required. In that case, we could instead implement this + // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in + // the case where the old value is required, for the `add` and `sub` cases, we can + // use the single instruction `lock xadd`. However, those improvements have been + // left for another day. + // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 + let dst = get_output_reg(ctx, outputs[0]); + let mut addr = put_input_in_reg(ctx, inputs[0]); + let mut arg2 = put_input_in_reg(ctx, inputs[1]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Make sure that both args are in virtual regs, since in effect we have to do a + // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not + // guaranteed safe if either is in a real reg. 
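+ //
+ // Roughly, the AtomicRmwSeq pseudo-instruction expands into a `lock cmpxchg`
+ // retry loop along these lines (register choices per the moves below):
+ //
+ //   mov (%r9), %rax
+ // again:
+ //   mov %rax, %scratch
+ //   <op> %r10, %scratch
+ //   lock cmpxchg %scratch, (%r9)
+ //   jnz again
+ //
+ // with %r9 holding the address, %r10 the operand, and %rax the old value.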
+ addr = ctx.ensure_in_vreg(addr, types::I64); + arg2 = ctx.ensure_in_vreg(arg2, types::I64); + + // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq` + // operates at whatever width is specified by `ty`, so there's no need to + // zero-extend `arg2` in the case of `ty` being I8/I16/I32. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r9()), + addr, + types::I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r10()), + arg2, + types::I64, + )); + + // Now the AtomicRmwSeq (pseudo-) instruction itself + let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap()); + ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op }); + + // And finally, copy the preordained AtomicRmwSeq output reg to its destination. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the `AtomicRmw` case. As with + // `AtomicRmw`, there's no need to zero-extend narrow values here. + let dst = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let expected = put_input_in_reg(ctx, inputs[1]); + let replacement = put_input_in_reg(ctx, inputs[2]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Move the expected value into %rax. Because there's only one fixed register on + // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the + // `AtomicRmw` case. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + expected, + types::I64, + )); + ctx.emit(Inst::LockCmpxchg { + ty: ty_access, + src: replacement, + dst: addr.into(), + }); + // And finally, copy the old value at the location to its destination reg. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicLoad => { + // This is a normal load. The x86-TSO memory model provides sufficient sequencing + // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the + // need for any fence instructions. + let data = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + let rm = RegMem::mem(addr); + if ty_access == types::I64 { + ctx.emit(Inst::mov64_rm_r(rm, data)); + } else { + let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!( + "invalid extension during AtomicLoad: {} -> {}", + ty_access.bits(), + 64 + )); + ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data)); + } + } + + Opcode::AtomicStore => { + // This is a normal store, followed by an `mfence` instruction. 
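+ // (x86-TSO only allows an earlier store to be reordered with a later load,
+ // so the trailing MFENCE is what gives this store sequentially consistent
+ // ordering.)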
+ let data = put_input_in_reg(ctx, inputs[0]); + let addr = lower_to_amode(ctx, inputs[1], 0); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr)); + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::FuncAddr => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset: 0, + }); + } + + Opcode::SymbolValue => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset, + }); + } + + Opcode::StackAddr => { + let (stack_slot, offset) = match *ctx.data(insn) { + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot, + offset, + } => (stack_slot, offset), + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + let offset: i32 = offset.into(); + let inst = ctx + .abi() + .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst); + ctx.emit(inst); + } + + Opcode::Select => { + let flag_input = inputs[0]; + if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + + // For equal, we flip the operands, because we can't test a conjunction of + // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment. + let (lhs_input, rhs_input) = match cond_code { + FloatCC::Equal => (inputs[2], inputs[1]), + _ => (inputs[1], inputs[2]), + }; + + let ty = ctx.output_ty(insn, 0); + let rhs = put_input_in_reg(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + // Sign-extend operands to 32, then do a cmove of size 4. + RegMem::reg(put_input_in_reg(ctx, lhs_input)) + } else { + input_to_reg_mem(ctx, lhs_input) + }; + + // We request inversion of Equal to NotEqual here: taking LHS if equal would mean + // take it if both CC::NP and CC::Z are set, the conjunction of which can't be + // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the + // select operation, and invert the equal to a not-equal here. + let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual); + + if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results { + // Keep this sync'd with the lowering of the select inputs above. 
+ assert_eq!(cond_code, FloatCC::Equal); + } + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + match fcmp_results { + FcmpCondResult::Condition(cc) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + FcmpCondResult::AndConditions(_, _) => { + unreachable!( + "can't AND with select; see above comment about inverting equal" + ); + } + FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) + | FcmpCondResult::OrConditions(cc1, cc2) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); + ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + } + } + } + } else { + let ty = ty.unwrap(); + + let mut size = ty.bytes() as u8; + let lhs = if is_int_or_ref_ty(ty) { + if size < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + size = 4; + RegMem::reg(put_input_in_reg(ctx, inputs[1])) + } else { + input_to_reg_mem(ctx, inputs[1]) + } + } else { + input_to_reg_mem(ctx, inputs[1]) + }; + + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + let cond_code = ctx.data(icmp).cond_code().unwrap(); + CC::from_intcc(cond_code) + } else { + // The input is a boolean value, compare it against zero. + let size = ctx.input_ty(insn, 0).bytes() as u8; + let test = put_input_in_reg(ctx, flag_input); + ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test)); + CC::NZ + }; + + // This doesn't affect the flags. + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + if is_int_or_ref_ty(ty) { + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + } + + Opcode::Selectif | Opcode::SelectifSpectreGuard => { + let lhs = input_to_reg_mem(ctx, inputs[1]); + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.output_ty(insn, 0); + + // Verification ensures that the input is always a single-def ifcmp. + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + + if is_int_or_ref_ty(ty) { + let size = ty.bytes() as u8; + if size == 1 { + // Sign-extend operands to 32, then do a cmove of size 4. 
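+ // (There is no 8-bit form of CMOV, so the byte-sized case is widened to
+ // 32 bits first.)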
+ let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); + ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); + } else { + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + + Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => { + let kind = match op { + Opcode::Udiv => DivOrRemKind::UnsignedDiv, + Opcode::Sdiv => DivOrRemKind::SignedDiv, + Opcode::Urem => DivOrRemKind::UnsignedRem, + Opcode::Srem => DivOrRemKind::SignedRem, + _ => unreachable!(), + }; + let is_div = kind.is_div(); + + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let dividend = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + dividend, + input_ty, + )); + + if flags.avoid_div_traps() { + // A vcode meta-instruction is used to lower the inline checks, since they embed + // pc-relative offsets that must not change, thus requiring regalloc to not + // interfere by introducing spills and reloads. + // + // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that + // regalloc is aware of the coalescing opportunity between rax/rdx and the + // destination register. + let divisor = put_input_in_reg(ctx, inputs[1]); + + let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64)); + + let tmp = if op == Opcode::Sdiv && size == 8 { + Some(ctx.alloc_tmp(RegClass::I64, types::I64)) + } else { + None + }; + // TODO use xor + ctx.emit(Inst::imm( + OperandSize::Size32, + 0, + Writable::from_reg(regs::rdx()), + )); + ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp)); + } else { + let divisor = input_to_reg_mem(ctx, inputs[1]); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for + // signed opcodes. + ctx.emit(Inst::sign_extend_data(size)); + } else if input_ty == types::I8 { + ctx.emit(Inst::movzx_rm_r( + ExtMode::BL, + RegMem::reg(regs::rax()), + Writable::from_reg(regs::rax()), + )); + } else { + // zero for unsigned opcodes. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0, + Writable::from_reg(regs::rdx()), + )); + } + + // Emit the actual idiv. + ctx.emit(Inst::div(size, kind.is_signed(), divisor)); + } + + // Move the result back into the destination reg. + if is_div { + // The quotient is in rax. + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + } + + Opcode::Umulhi | Opcode::Smulhi => { + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move lhs in %rax. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs, + input_ty, + )); + + // Emit the actual mul or imul. + let signed = op == Opcode::Smulhi; + ctx.emit(Inst::mul_hi(size, signed, rhs)); + + // Read the result from the high part (stored in %rdx). 
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + + Opcode::GetPinnedReg => { + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64)); + } + + Opcode::SetPinnedReg => { + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::pinned_reg()), + src, + types::I64, + )); + } + + Opcode::Vconst => { + let used_constant = if let &InstructionData::UnaryConst { + constant_handle, .. + } = ctx.data(insn) + { + ctx.use_constant(VCodeConstantData::Pool( + constant_handle, + ctx.get_constant_data(constant_handle).clone(), + )) + } else { + unreachable!("vconst should always have unary_const format") + }; + // TODO use Inst::gen_constant() instead. + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::xmm_load_const(used_constant, dst, ty)); + } + + Opcode::RawBitcast => { + // A raw_bitcast is just a mechanism for correcting the type of V128 values (see + // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR + // instruction should emit no machine code but a move is necessary to give the register + // allocator a definition for the output virtual register. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); + } + + Opcode::Shuffle => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let lhs_ty = ctx.input_ty(insn, 0); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let mask = match ctx.get_immediate(insn) { + Some(DataValue::V128(bytes)) => bytes.to_vec(), + _ => unreachable!("shuffle should always have a 16-byte immediate"), + }; + + // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a + // 1 in the most significant position zeroes the lane. + let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + if rhs == lhs { + // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM + // register. We statically build `constructed_mask` to zero out any unknown lane + // indices (may not be completely necessary: verification could fail incorrect mask + // values) and fix the indexes to all point to the `dst` vector. + let constructed_mask = mask + .iter() + // If the mask is greater than 15 it still may be referring to a lane in b. + .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp, ty)); + // After loading the constructed mask in a temporary register, we use this to + // shuffle the `dst` register (remember that, in this case, it is the same as + // `src` so we disregard this register). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); + } else { + // If `lhs` and `rhs` are different, we must shuffle each separately and then OR + // them together. This is necessary due to PSHUFB semantics. As in the case above, + // we build the `constructed_mask` for each case statically. + + // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. 
+ let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty); + ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); + let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); + + // PSHUFB the second argument, placing zeroes for unused lanes. + let constructed_mask = mask + .iter() + .map(|b| b.wrapping_sub(16)) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp2, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); + + // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers + // is not important). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); + + // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB + } + } + + Opcode::Swizzle => { + // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec + // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For + // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF + // semantics match the Wasm SIMD semantics for this instruction. + // The instruction format maps to variables like: %dst = swizzle %src, %mask + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src = put_input_in_reg(ctx, inputs[0]); + let swizzle_mask = put_input_in_reg(ctx, inputs[1]); + + // Inform the register allocator that `src` and `dst` should be in the same register. + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Create a mask for zeroing out-of-bounds lanes of the swizzle mask. + let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16); + static ZERO_MASK_VALUE: [u8; 16] = [ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, + ]; + let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE)); + ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty)); + + // Use the `zero_mask` on a writable `swizzle_mask`. + let swizzle_mask = Writable::from_reg(swizzle_mask); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddusb, + RegMem::from(zero_mask), + swizzle_mask, + )); + + // Shuffle `dst` using the fixed-up `swizzle_mask`. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(swizzle_mask), + dst, + )); + } + + Opcode::Insertlane => { + // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let in_vec = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 1); + debug_assert!(!src_ty.is_vector()); + let src = input_to_reg_mem(ctx, inputs[1]); + let lane = if let InstructionData::TernaryImm8 { imm, .. 
} = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < ty.lane_count() as u8); + + ctx.emit(Inst::gen_move(dst, in_vec, ty)); + emit_insert_lane(ctx, src, dst, lane, ty.lane_type()); + } + + Opcode::Extractlane => { + // The instruction format maps to variables like: %dst = extractlane %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < src_ty.lane_count() as u8); + + if !ty.is_float() { + let (sse_op, w_bit) = match ty.lane_bits() { + 8 => (SseOpcode::Pextrb, false), + 16 => (SseOpcode::Pextrw, false), + 32 => (SseOpcode::Pextrd, false), + 64 => (SseOpcode::Pextrd, true), + _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + } else { + if lane == 0 { + // Remove the extractlane instruction, leaving the float where it is. The upper + // bits will remain unchanged; for correctness, this relies on Cranelift type + // checking to avoid using those bits. + ctx.emit(Inst::gen_move(dst, src, ty)); + } else { + // Otherwise, shuffle the bits in `lane` to the lowest lane. + let sse_op = SseOpcode::Pshufd; + let mask = match src_ty { + // Move the value at `lane` to lane 0, copying existing value at lane 0 to + // other lanes. Again, this relies on Cranelift type checking to avoid + // using those bits. + types::F32X4 => 0b00_00_00_00 | lane, + // Move the value at `lane` 1 (we know it must be 1 because of the `if` + // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type + // checking assumption also applies here. + types::F64X2 => 0b11_10_11_10, + _ => unreachable!(), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false)); + } + } + } + + Opcode::Splat | Opcode::LoadSplat => { + let ty = ty.unwrap(); + assert_eq!(ty.bits(), 128); + let src_ty = ctx.input_ty(insn, 0); + assert!(src_ty.bits() < 128); + + let src = match op { + Opcode::Splat => input_to_reg_mem(ctx, inputs[0]), + Opcode::LoadSplat => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + let amode = lower_to_amode(ctx, inputs[0], offset); + RegMem::mem(amode) + } + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + + // We know that splat will overwrite all of the lanes of `dst` but it takes several + // instructions to do so. Because of the multiple instructions, there is no good way to + // declare `dst` a `def` except with the following pseudo-instruction. + ctx.emit(Inst::xmm_uninit_value(dst)); + + // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST* + // and VPBROADCAST*. + match ty.lane_bits() { + 8 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Shuffle the lowest byte lane to all other lanes. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) + } + 16 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + // Shuffle the lowest two lanes to all other lanes. 
+ ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 32 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Shuffle the lowest lane to all other lanes. + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 64 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + } + _ => panic!("Invalid type to splat: {}", ty), + } + } + + Opcode::VanyTrue => { + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src)); + // If the ZF is not set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::NZ, dst)); + } + + Opcode::VallTrue => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = input_to_reg_mem(ctx, inputs[0]); + + let eq = |ty: Type| match ty.lane_bits() { + 8 => SseOpcode::Pcmpeqb, + 16 => SseOpcode::Pcmpeqw, + 32 => SseOpcode::Pcmpeqd, + 64 => SseOpcode::Pcmpeqq, + _ => panic!("Unable to find an instruction for {} for type: {}", op, ty), + }; + + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Compare to see what lanes are filled with all 1s. + ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r( + SseOpcode::Ptest, + RegMem::from(tmp), + tmp.to_reg(), + )); + // If the ZF is set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::VhighBits => { + let src = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 0); + debug_assert!(src_ty.is_vector() && src_ty.bits() == 128); + let dst = get_output_reg(ctx, outputs[0]); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + + // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for + // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode, + // the instruction can access additional registers when used with a REX.R prefix. The + // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development + // Manual, vol. 2). This being the case, we will always clear REX.W since its use is + // unnecessary (`OperandSize` is used for setting/clearing REX.W). + let size = OperandSize::Size32; + + match src_ty { + types::I8X16 | types::B8X16 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size)) + } + types::I32X4 | types::B32X4 | types::F32X4 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size)) + } + types::I64X2 | types::B64X2 | types::F64X2 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size)) + } + types::I16X8 | types::B16X8 => { + // There is no x86 instruction for extracting the high bit of 16-bit lanes so + // here we: + // - duplicate the 16-bit lanes of `src` into 8-bit lanes: + // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] + // - use PMOVMSKB to gather the high bits; now we have duplicates, though + // - shift away the bottom 8 high bits to remove the duplicates. 
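+ // (Signed saturation preserves each word's sign, so the high bit of every
+ // packed byte matches the high bit of the corresponding 16-bit lane.)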
+ let tmp = ctx.alloc_tmp(RegClass::V128, src_ty); + ctx.emit(Inst::gen_move(tmp, src, src_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp)); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Pmovmskb, + tmp.to_reg(), + dst, + size, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst)); + } + _ => unimplemented!("unknown input type {} for {}", src_ty, op), + } + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + _ => unimplemented!("unimplemented lowering for opcode {:?}", op), + } + + Ok(()) +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for X64Backend { + type MInst = Inst; + + fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple) + } + + fn lower_branch_group<C: LowerCtx<I = Inst>>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + trace!( + "lowering two-branch group: opcodes are {:?} and {:?}", + op0, + op1 + ); + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + + let taken = targets[0]; + let not_taken = match op1 { + Opcode::Jump => targets[1], + Opcode::Fallthrough => fallthrough.unwrap(), + _ => unreachable!(), // assert above. 
+ }; + + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + + let src_ty = ctx.input_ty(branches[0], 0); + + if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + + let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + + let cc = CC::from_intcc(cond_code); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1.invert(), not_taken)); + ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken)); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1, taken)); + ctx.emit(Inst::jmp_cond(cc2, taken, not_taken)); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let src = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let cc = match op0 { + Opcode::Brz => CC::Z, + Opcode::Brnz => CC::NZ, + _ => unreachable!(), + }; + let size_bytes = src_ty.bytes() as u8; + ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("brz/brnz with non-int type {:?}", src_ty); + } + } + + Opcode::BrIcmp => { + let src_ty = ctx.input_ty(branches[0], 0); + if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let lhs = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let rhs = input_to_reg_mem_imm( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + ); + let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap()); + let byte_size = src_ty.bytes() as u8; + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("bricmp with non-int type {:?}", src_ty); + } + } + + _ => panic!("unexpected branch opcode: {:?}", op0), + } + } else { + assert_eq!(branches.len(), 1); + + // Must be an unconditional branch or trap. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + ctx.emit(Inst::jmp_known(targets[0])); + } + + Opcode::BrTable => { + let jt_size = targets.len() - 1; + assert!(jt_size <= u32::max_value() as usize); + let jt_size = jt_size as u32; + + let idx = extend_input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ExtSpec::ZeroExtendTo32, + ); + + // Bounds-check (compute flags from idx - jt_size) and branch to default. 
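// (The compare below, `cmp_rmi_r(4, imm(jt_size), idx)`, is a 32-bit `cmp $jt_size, %idx`,
// i.e. it computes `idx - jt_size`; the JmpTableSeq pseudo-instruction emitted further
// down uses the resulting flags to branch to the default target when the index is out
// of bounds.)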
+ ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx)); + + // Emit the compound instruction that does: + // + // lea $jt, %rA + // movsbl [%rA, %rIndex, 2], %rB + // add %rB, %rA + // j *%rA + // [jt entries] + // + // This must be *one* instruction in the vcode because we cannot allow regalloc + // to insert any spills/fills in the middle of the sequence; otherwise, the + // lea PC-rel offset to the jumptable would be incorrect. (The alternative + // is to introduce a relocation pass for inlined jumptables, which is much + // worse.) + + // This temporary is used as a signed integer of 64-bits (to hold addresses). + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + // This temporary is used as a signed integer of 32-bits (for the wasm-table + // index) and then 64-bits (address addend). The small lie about the I64 type + // is benign, since the temporary is dead after this instruction (and its + // Cranelift type is thus unused). + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + let targets_for_term: Vec<MachLabel> = targets.to_vec(); + let default_target = targets[0]; + + let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect(); + + ctx.emit(Inst::JmpTableSeq { + idx, + tmp1, + tmp2, + default_target, + targets: jt_targets, + targets_for_term, + }); + } + + _ => panic!("Unknown branch type {:?}", op), + } + } + + Ok(()) + } + + fn maybe_pinned_reg(&self) -> Option<Reg> { + Some(regs::pinned_reg()) + } +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs new file mode 100644 index 0000000000..fd4444498d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/mod.rs @@ -0,0 +1,149 @@ +//! X86_64-bit Instruction Set Architecture. + +use self::inst::EmitInfo; + +use super::TargetIsa; +use crate::ir::{condcodes::IntCC, Function}; +use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings}; +use crate::isa::Builder as IsaBuilder; +use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; +use crate::result::CodegenResult; +use crate::settings::{self as shared_settings, Flags}; +use alloc::boxed::Box; +use regalloc::{PrettyPrint, RealRegUniverse}; +use target_lexicon::Triple; + +mod abi; +mod inst; +mod lower; +mod settings; + +/// An X64 backend. +pub(crate) struct X64Backend { + triple: Triple, + flags: Flags, + x64_flags: x64_settings::Flags, + reg_universe: RealRegUniverse, +} + +impl X64Backend { + /// Create a new X64 backend with the given (shared) flags. + fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self { + let reg_universe = create_reg_universe_systemv(&flags); + Self { + triple, + flags, + x64_flags, + reg_universe, + } + } + + fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> { + // This performs lowering to VCode, register-allocates the code, computes + // block layout and finalizes branches. The result is ready for binary emission. 
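// (`compile::compile` below drives `LowerBackend::lower` and `lower_branch_group`,
// defined above, to build the VCode, then runs register allocation, block ordering and
// branch finalization; `compile_function` further down only has to call `vcode.emit()`
// and finish the resulting buffer.)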
+ let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone()); + let abi = Box::new(abi::X64ABICallee::new(&func, flags)?); + compile::compile::<Self>(&func, self, abi, emit_info) + } +} + +impl MachBackend for X64Backend { + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult> { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags.clone())?; + + let buffer = vcode.emit(); + let buffer = buffer.finish(); + let frame_size = vcode.frame_size(); + let unwind_info = vcode.unwind_info()?; + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags)))) + } else { + None + }; + + Ok(MachCompileResult { + buffer, + frame_size, + disasm, + unwind_info, + }) + } + + fn flags(&self) -> &Flags { + &self.flags + } + + fn name(&self) -> &'static str { + "x64" + } + + fn triple(&self) -> Triple { + self.triple.clone() + } + + fn reg_universe(&self) -> &RealRegUniverse { + &self.reg_universe + } + + fn unsigned_add_overflow_condition(&self) -> IntCC { + // Unsigned `>=`; this corresponds to the carry flag set on x86, which happens on + // overflow of an add. + IntCC::UnsignedGreaterThanOrEqual + } + + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // unsigned `>=`; this corresponds to the carry flag set on x86, which happens on + // underflow of a subtract (carry is borrow for subtract). + IntCC::UnsignedGreaterThanOrEqual + } + + #[cfg(feature = "unwind")] + fn emit_unwind_info( + &self, + result: &MachCompileResult, + kind: crate::machinst::UnwindInfoKind, + ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + use crate::isa::unwind::UnwindInfo; + use crate::machinst::UnwindInfoKind; + Ok(match (result.unwind_info.as_ref(), kind) { + (Some(info), UnwindInfoKind::SystemV) => { + inst::unwind::systemv::create_unwind_info(info.clone())?.map(UnwindInfo::SystemV) + } + (Some(_info), UnwindInfoKind::Windows) => { + //TODO inst::unwind::winx64::create_unwind_info(info.clone())?.map(|u| UnwindInfo::WindowsX64(u)) + None + } + _ => None, + }) + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + Some(inst::unwind::systemv::create_cie()) + } +} + +/// Create a new `isa::Builder`. +pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder { + IsaBuilder { + triple, + setup: x64_settings::builder(), + constructor: isa_constructor, + } +} + +fn isa_constructor( + triple: Triple, + shared_flags: Flags, + builder: shared_settings::Builder, +) -> Box<dyn TargetIsa> { + let isa_flags = x64_settings::Flags::new(&shared_flags, builder); + let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags); + Box::new(TargetIsaAdapter::new(backend)) +} diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs new file mode 100644 index 0000000000..c5371bb132 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/settings.rs @@ -0,0 +1,9 @@ +//! x86 Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/x86/settings.rs`. +include!(concat!(env!("OUT_DIR"), "/settings-x86.rs")); |
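For a concrete picture of how the generated flags are built and queried, a minimal sketch follows; it is not part of the diff above, it simply mirrors `isa_constructor` in mod.rs, and the `has_sse41` accessor name is an assumption about the generated code rather than something shown in this patch:

    // Sketch only: construct the shared and x64-specific flags, then query one setting.
    fn example_has_sse41() -> bool {
        // `settings` here is `crate::settings`, as imported at the top of settings.rs.
        let shared = settings::Flags::new(settings::builder());
        // Same constructor call as in `isa_constructor` above; `builder()` comes from the
        // included, generated settings file.
        let isa_flags = Flags::new(&shared, builder());
        // Assumed name for one of the accessors generated from
        // cranelift-codegen/meta/src/isa/x86/settings.rs.
        isa_flags.has_sse41()
    }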