Diffstat (limited to 'third_party/rust/cranelift-codegen/src/isa/x64/lower.rs')
-rw-r--r-- | third_party/rust/cranelift-codegen/src/isa/x64/lower.rs | 3771 |
1 file changed, 3771 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs new file mode 100644 index 0000000000..0862154360 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/isa/x64/lower.rs @@ -0,0 +1,3771 @@ +//! Lowering rules for X64. + +use crate::data_value::DataValue; +use crate::ir::{ + condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, + Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type, +}; +use crate::isa::x64::abi::*; +use crate::isa::x64::inst::args::*; +use crate::isa::x64::inst::*; +use crate::isa::{x64::X64Backend, CallConv}; +use crate::machinst::lower::*; +use crate::machinst::*; +use crate::result::CodegenResult; +use crate::settings::Flags; +use alloc::boxed::Box; +use alloc::vec::Vec; +use cranelift_codegen_shared::condcodes::CondCode; +use log::trace; +use regalloc::{Reg, RegClass, Writable}; +use smallvec::SmallVec; +use std::convert::TryFrom; +use target_lexicon::Triple; + +/// Context passed to all lowering functions. +type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>; + +//============================================================================= +// Helpers for instruction lowering. + +fn is_int_or_ref_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +fn is_bool_ty(ty: Type) -> bool { + match ty { + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, + types::R32 => panic!("shouldn't have 32-bits refs on x64"), + _ => false, + } +} + +/// This is target-word-size dependent. And it excludes booleans and reftypes. +fn is_valid_atomic_transaction_ty(ty: Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + _ => false, + } +} + +/// Returns whether the given specified `input` is a result produced by an instruction with Opcode +/// `op`. +// TODO investigate failures with checking against the result index. +fn matches_input<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + op: Opcode, +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } + None + }) +} + +/// Returns whether the given specified `input` is a result produced by an instruction with any of +/// the opcodes specified in `ops`. +fn matches_input_any<C: LowerCtx<I = Inst>>( + ctx: &mut C, + input: InsnInput, + ops: &[Opcode], +) -> Option<IRInst> { + let inputs = ctx.get_input(input.insn, input.input); + inputs.inst.and_then(|(src_inst, _)| { + let data = ctx.data(src_inst); + for &op in ops { + if data.opcode() == op { + return Some(src_inst); + } + } + None + }) +} + +fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg { + ctx.use_input_reg(input); + input.reg +} + +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg { + let input = ctx.get_input(spec.insn, spec.input); + + if let Some(c) = input.constant { + // Generate constants fresh at each use to minimize long-range register pressure. 
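+ // For example, an I16 constant is masked with (1u64 << 16) - 1 == 0xFFFF below, so only its low 16 bits are materialized.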
+ let ty = ctx.input_ty(spec.insn, spec.input); + let from_bits = ty_bits(ty); + let masked = if from_bits < 64 { + c & ((1u64 << from_bits) - 1) + } else { + c + }; + + let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + ctx.emit(inst); + } + cst_copy.to_reg() + } else { + lowerinput_to_reg(ctx, input) + } +} + +/// An extension specification for `extend_input_to_reg`. +#[derive(Clone, Copy)] +enum ExtSpec { + ZeroExtendTo32, + ZeroExtendTo64, + SignExtendTo32, + SignExtendTo64, +} + +/// Put the given input into a register, marking it as used, and do a zero- or signed- extension if +/// required. (This obviously causes side-effects.) +fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg { + let requested_size = match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32, + ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64, + }; + let input_size = ctx.input_ty(spec.insn, spec.input).bits(); + + let requested_ty = if requested_size == 32 { + types::I32 + } else { + types::I64 + }; + + let ext_mode = match (input_size, requested_size) { + (a, b) if a == b => return put_input_in_reg(ctx, spec), + (1, 8) => return put_input_in_reg(ctx, spec), + (a, b) => ExtMode::new(a, b).expect(&format!("invalid extension: {} -> {}", a, b)), + }; + + let src = input_to_reg_mem(ctx, spec); + let dst = ctx.alloc_tmp(RegClass::I64, requested_ty); + match ext_spec { + ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)) + } + ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)) + } + } + dst.to_reg() +} + +fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem { + // TODO handle memory. + RegMem::reg(lowerinput_to_reg(ctx, input)) +} + +/// Put the given input into a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. +fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem { + let input = ctx.get_input(spec.insn, spec.input); + lowerinput_to_reg_mem(ctx, input) +} + +/// Returns whether the given input is an immediate that can be properly sign-extended, without any +/// possible side-effect. +fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> { + input.constant.and_then(|x| { + // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend + // to 64 bits. For other sizes, it doesn't matter and we can just use the plain + // constant. + if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) { + Some(x as u32) + } else { + None + } + }) +} + +fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + lowerinput_to_sext_imm(input, input_ty) +} + +fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> { + ctx.get_input(spec.insn, spec.input).constant +} + +/// Put the given input into an immediate, a register or a memory operand. +/// Effectful: may mark the given input as used, when returning the register form. 
+fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm { + let input = ctx.get_input(spec.insn, spec.input); + let input_ty = ctx.input_ty(spec.insn, spec.input); + match lowerinput_to_sext_imm(input, input_ty) { + Some(x) => RegMemImm::imm(x), + None => match lowerinput_to_reg_mem(ctx, input) { + RegMem::Reg { reg } => RegMemImm::reg(reg), + RegMem::Mem { addr } => RegMemImm::mem(addr), + }, + } +} + +/// Emit an instruction to insert a value `src` into a lane of `dst`. +fn emit_insert_lane<C: LowerCtx<I = Inst>>( + ctx: &mut C, + src: RegMem, + dst: Writable<Reg>, + lane: u8, + ty: Type, +) { + if !ty.is_float() { + let (sse_op, is64) = match ty.lane_bits() { + 8 => (SseOpcode::Pinsrb, false), + 16 => (SseOpcode::Pinsrw, false), + 32 => (SseOpcode::Pinsrd, false), + 64 => (SseOpcode::Pinsrd, true), + _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), + }; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64)); + } else if ty == types::F32 { + let sse_op = SseOpcode::Insertps; + // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane + // shifted into bits 5:6). + let lane = 0b00_00_00_00 | lane << 4; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + } else if ty == types::F64 { + let sse_op = match lane { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + 0 => SseOpcode::Movsd, + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + 1 => SseOpcode::Movlhps, + _ => unreachable!(), + }; + // Here we use the `xmm_rm_r` encoding because it correctly tells the register + // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other + // encoding formats like `xmm_unary_rm_r` treat it as a `def`. + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + } else { + panic!("unable to emit insertlane for type: {}", ty) + } +} + +/// Emits an int comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_cmp(ctx: Ctx, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + + let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem_imm(ctx, inputs[1]); + + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs)); +} + +/// A specification for a fcmp emission. +enum FcmpSpec { + /// Normal flow. + Normal, + + /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that + /// happens with `InvertedEqualOrConditions`. + /// + /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or + /// sequence of instructions) that check for an "AND" combination of condition codes; see for + /// instance lowering of Select. + InvertEqual, +} + +/// This explains how to interpret the results of an fcmp instruction. +enum FcmpCondResult { + /// The given condition code must be set. + Condition(CC), + + /// Both condition codes must be set. + AndConditions(CC, CC), + + /// Either of the conditions codes must be set. + OrConditions(CC, CC), + + /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. 
Either + /// of the condition codes must be set, and the user must invert meaning of analyzing the + /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be + /// reached. + InvertedEqualOrConditions(CC, CC), +} + +/// Emits a float comparison instruction. +/// +/// Note: make sure that there are no instructions modifying the flags between a call to this +/// function and the use of the flags! +fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult { + let (flip_operands, inverted_equal) = match cond_code { + FloatCC::LessThan + | FloatCC::LessThanOrEqual + | FloatCC::UnorderedOrGreaterThan + | FloatCC::UnorderedOrGreaterThanOrEqual => { + cond_code = cond_code.reverse(); + (true, false) + } + FloatCC::Equal => { + let inverted_equal = match spec { + FcmpSpec::Normal => false, + FcmpSpec::InvertEqual => { + cond_code = FloatCC::NotEqual; // same as .inverse() + true + } + }; + (false, inverted_equal) + } + _ => (false, false), + }; + + // The only valid CC constructed with `from_floatcc` can be put in the flag + // register with a direct float comparison; do this here. + let op = match ctx.input_ty(insn, 0) { + types::F32 => SseOpcode::Ucomiss, + types::F64 => SseOpcode::Ucomisd, + _ => panic!("Bad input type to Fcmp"), + }; + + let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; + let (lhs_input, rhs_input) = if flip_operands { + (inputs[1], inputs[0]) + } else { + (inputs[0], inputs[1]) + }; + let lhs = put_input_in_reg(ctx, lhs_input); + let rhs = input_to_reg_mem(ctx, rhs_input); + ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs)); + + let cond_result = match cond_code { + FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z), + FloatCC::NotEqual if inverted_equal => { + FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ) + } + FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ), + _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)), + }; + + cond_result +} + +fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature { + let mut sig = Signature::new(call_conv); + for i in 0..ctx.num_inputs(insn) { + sig.params.push(AbiParam::new(ctx.input_ty(insn, i))); + } + for i in 0..ctx.num_outputs(insn) { + sig.returns.push(AbiParam::new(ctx.output_ty(insn, i))); + } + if call_conv.extends_baldrdash() { + // Adds the special VMContext parameter to the signature. + sig.params + .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext)); + } + sig +} + +fn emit_vm_call<C: LowerCtx<I = Inst>>( + ctx: &mut C, + flags: &Flags, + triple: &Triple, + libcall: LibCall, + insn: IRInst, + inputs: SmallVec<[InsnInput; 4]>, + outputs: SmallVec<[InsnOutput; 2]>, +) -> CodegenResult<()> { + let extname = ExternalName::LibCall(libcall); + + let dist = if flags.use_colocated_libcalls() { + RelocDistance::Near + } else { + RelocDistance::Far + }; + + // TODO avoid recreating signatures for every single Libcall function. 
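+ // The libcall signature is derived from the CLIF instruction itself: one ABI param per input and one return per output (see `make_libcall_sig` above), plus a VMContext param when the Baldrdash convention is in use.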
+ let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple)); + let sig = make_libcall_sig(ctx, insn, call_conv, types::I64); + let caller_conv = ctx.abi().call_conv(); + + let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv)?; + + abi.emit_stack_pre_adjust(ctx); + + let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 }; + assert_eq!(inputs.len() + vm_context, abi.num_args()); + + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + if call_conv.extends_baldrdash() { + let vm_context_vreg = ctx + .get_vm_context() + .expect("should have a VMContext to pass to libcall funcs"); + abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg); + } + + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + + Ok(()) +} + +/// Returns whether the given input is a shift by a constant value less or equal than 3. +/// The goal is to embed it within an address mode. +fn matches_small_constant_shift<C: LowerCtx<I = Inst>>( + ctx: &mut C, + spec: InsnInput, +) -> Option<(InsnInput, u8)> { + matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| { + match input_to_imm( + ctx, + InsnInput { + insn: shift, + input: 1, + }, + ) { + Some(shift_amt) if shift_amt <= 3 => Some(( + InsnInput { + insn: shift, + input: 0, + }, + shift_amt as u8, + )), + _ => None, + } + }) +} + +/// Lowers an instruction to one of the x86 addressing modes. +/// +/// Note: the 32-bit offset in Cranelift has to be sign-extended, which maps x86's behavior. +fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode { + let flags = ctx + .memflags(spec.insn) + .expect("Instruction with amode should have memflags"); + + // We now either have an add that we must materialize, or some other input; as well as the + // final offset. + if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) { + debug_assert_eq!(ctx.output_ty(add, 0), types::I64); + let add_inputs = &[ + InsnInput { + insn: add, + input: 0, + }, + InsnInput { + insn: add, + input: 1, + }, + ]; + + // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations + // aren't happening in the wasm case. We could do better, given some range analysis. + let (base, index, shift) = if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[0]) + { + ( + put_input_in_reg(ctx, add_inputs[1]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else if let Some((shift_input, shift_amt)) = + matches_small_constant_shift(ctx, add_inputs[1]) + { + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, shift_input), + shift_amt, + ) + } else { + for i in 0..=1 { + let input = ctx.get_input(add, i); + + // Try to pierce through uextend. + if let Some(uextend) = matches_input( + ctx, + InsnInput { + insn: add, + input: i, + }, + Opcode::Uextend, + ) { + if let Some(cst) = ctx.get_input(uextend, 0).constant { + // Zero the upper bits. 
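+ // e.g. for a 32-bit source, `shift` is 32, so `(cst << 32) >> 32` keeps only the low 32 bits of the constant.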
+ let input_size = ctx.input_ty(uextend, 0).bits() as u64; + let shift: u64 = 64 - input_size; + let uext_cst: u64 = (cst << shift) >> shift; + + let final_offset = (offset as i64).wrapping_add(uext_cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + // If it's a constant, add it directly! + if let Some(cst) = input.constant { + let final_offset = (offset as i64).wrapping_add(cst as i64); + if low32_will_sign_extend_to_64(final_offset as u64) { + let base = put_input_in_reg(ctx, add_inputs[1 - i]); + return Amode::imm_reg(final_offset as u32, base).with_flags(flags); + } + } + } + + ( + put_input_in_reg(ctx, add_inputs[0]), + put_input_in_reg(ctx, add_inputs[1]), + 0, + ) + }; + + return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags); + } + + let input = put_input_in_reg(ctx, spec); + Amode::imm_reg(offset as u32, input).with_flags(flags) +} + +//============================================================================= +// Top-level instruction lowering entry point, for one instruction. + +/// Actually codegen an instruction's results into registers. +fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( + ctx: &mut C, + insn: IRInst, + flags: &Flags, + triple: &Triple, +) -> CodegenResult<()> { + let op = ctx.data(insn).opcode(); + + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = ctx + .get_constant(insn) + .expect("constant value for iconst et al"); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::SaddSat + | Opcode::UaddSat + | Opcode::Isub + | Opcode::SsubSat + | Opcode::UsubSat + | Opcode::Imul + | Opcode::AvgRound + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + let ty = ty.unwrap(); + if ty.lane_count() > 1 { + let sse_op = match op { + Opcode::Iadd => match ty { + types::I8X16 => SseOpcode::Paddb, + types::I16X8 => SseOpcode::Paddw, + types::I32X4 => SseOpcode::Paddd, + types::I64X2 => SseOpcode::Paddq, + _ => panic!("Unsupported type for packed iadd instruction: {}", ty), + }, + Opcode::SaddSat => match ty { + types::I8X16 => SseOpcode::Paddsb, + types::I16X8 => SseOpcode::Paddsw, + _ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty), + }, + Opcode::UaddSat => match ty { + types::I8X16 => SseOpcode::Paddusb, + types::I16X8 => SseOpcode::Paddusw, + _ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty), + }, + Opcode::Isub => match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for packed isub instruction: {}", ty), + }, + Opcode::SsubSat => match ty { + types::I8X16 => SseOpcode::Psubsb, + types::I16X8 => SseOpcode::Psubsw, + _ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty), + }, + Opcode::UsubSat => match ty { + types::I8X16 => SseOpcode::Psubusb, + 
types::I16X8 => SseOpcode::Psubusw, + _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty), + }, + Opcode::Imul => match ty { + types::I16X8 => SseOpcode::Pmullw, + types::I32X4 => SseOpcode::Pmulld, + types::I64X2 => { + // Note for I64X2 we describe a lane A as being composed of a + // 32-bit upper half "Ah" and a 32-bit lower half "Al". + // The 32-bit long hand multiplication can then be written as: + // Ah Al + // * Bh Bl + // ----- + // Al * Bl + // + (Ah * Bl) << 32 + // + (Al * Bh) << 32 + // + // So for each lane we will compute: + // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 + // + // Note, the algorithm will use pmuldq which operates directly on + // the lower 32-bit (Al or Bl) of a lane and writes the result + // to the full 64-bits of the lane of the destination. For this + // reason we don't need shifts to isolate the lower 32-bits, however + // we will need to use shifts to isolate the high 32-bits when doing + // calculations, i.e. Ah == A >> 32 + // + // The full sequence then is as follows: + // A' = A + // A' = A' >> 32 + // A' = Ah' * Bl + // B' = B + // B' = B' >> 32 + // B' = Bh' * Al + // B' = B' + A' + // B' = B' << 32 + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + + // Get inputs rhs=A and lhs=B and the dst register + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // A' = A + let rhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + + // A' = A' >> 32 + // A' = Ah' * Bl + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + + // B' = B + let lhs_1 = ctx.alloc_tmp(RegClass::V128, types::I64X2); + ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); + + // B' = B' >> 32 + // B' = Bh' * Al + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + lhs_1, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + + // B' = B' + A' + // B' = B' << 32 + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(rhs_1.to_reg()), + lhs_1, + )); + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psllq, + RegMemImm::imm(32), + lhs_1, + )); + + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(lhs_1.to_reg()), + rhs_1, + )); + ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + return Ok(()); + } + _ => panic!("Unsupported type for packed imul instruction: {}", ty), + }, + Opcode::AvgRound => match ty { + types::I8X16 => SseOpcode::Pavgb, + types::I16X8 => SseOpcode::Pavgw, + _ => panic!("Unsupported type for packed avg_round instruction: {}", ty), + }, + Opcode::Band => match ty { + types::F32X4 => SseOpcode::Andps, + types::F64X2 => SseOpcode::Andpd, + _ => SseOpcode::Pand, + }, + Opcode::Bor => match ty { + types::F32X4 => SseOpcode::Orps, + types::F64X2 => SseOpcode::Orpd, + _ => SseOpcode::Por, + }, + Opcode::Bxor => match ty { + types::F32X4 => SseOpcode::Xorps, + types::F64X2 => SseOpcode::Xorpd, + _ => SseOpcode::Pxor, + }, + _ => panic!("Unsupported packed instruction: {}", op), + }; + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move the `lhs` to 
the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + let is_64 = ty == types::I64; + let alu_op = match op { + Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add, + Opcode::Isub => AluRmiROpcode::Sub, + Opcode::Imul => AluRmiROpcode::Mul, + Opcode::Band => AluRmiROpcode::And, + Opcode::Bor => AluRmiROpcode::Or, + Opcode::Bxor => AluRmiROpcode::Xor, + _ => unreachable!(), + }; + + let (lhs, rhs) = match op { + Opcode::Iadd + | Opcode::IaddIfcout + | Opcode::Imul + | Opcode::Band + | Opcode::Bor + | Opcode::Bxor => { + // For commutative operations, try to commute operands if one is an + // immediate. + if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) { + (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm)) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ) + } + } + Opcode::Isub => ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem_imm(ctx, inputs[1]), + ), + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst)); + } + } + + Opcode::BandNot => { + let ty = ty.unwrap(); + debug_assert!(ty.is_vector() && ty.bytes() == 16); + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let sse_op = match ty { + types::F32X4 => SseOpcode::Andnps, + types::F64X2 => SseOpcode::Andnpd, + _ => SseOpcode::Pandn, + }; + // Note the flipping of operands: the `rhs` operand is used as the destination instead + // of the `lhs` as in the other bit operations above (e.g. `band`). + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst)); + } + + Opcode::Iabs => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let opcode = match ty { + types::I8X16 => SseOpcode::Pabsb, + types::I16X8 => SseOpcode::Pabsw, + types::I32X4 => SseOpcode::Pabsd, + _ => panic!("Unsupported type for packed iabs instruction: {}", ty), + }; + ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst)); + } else { + unimplemented!("iabs is unimplemented for non-vector type: {}", ty); + } + } + + Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + if ty.is_vector() { + let sse_op = match op { + Opcode::Imax => match ty { + types::I8X16 => SseOpcode::Pmaxsb, + types::I16X8 => SseOpcode::Pmaxsw, + types::I32X4 => SseOpcode::Pmaxsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umax => match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Imin => match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + Opcode::Umin => match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!("Unsupported type for packed {} instruction: {}", op, ty), + }, + _ => unreachable!("This is a bug: the external and internal `match op` should be over 
the same opcodes."), + }; + + // Move the `lhs` to the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else { + panic!("Unsupported type for {} instruction: {}", op, ty); + } + } + + Opcode::Bnot => { + let ty = ty.unwrap(); + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + + if ty.is_vector() { + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); + ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty.is_bool() { + unimplemented!("bool bnot") + } else { + ctx.emit(Inst::not(size, dst)); + } + } + + Opcode::Bitselect => { + let ty = ty.unwrap(); + let condition = put_input_in_reg(ctx, inputs[0]); + let if_true = put_input_in_reg(ctx, inputs[1]); + let if_false = input_to_reg_mem(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + if ty.is_vector() { + let tmp1 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp1, if_true, ty)); + ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1)); + + let tmp2 = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::gen_move(tmp2, condition, ty)); + ctx.emit(Inst::and_not(ty, if_false, tmp2)); + + ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty)); + ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst)); + } else { + unimplemented!("scalar bitselect") + } + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => { + let dst_ty = ctx.output_ty(insn, 0); + debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); + + let (size, lhs) = match dst_ty { + types::I8 | types::I16 => match op { + Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])), + Opcode::Ushr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), + ), + Opcode::Sshr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32), + ), + Opcode::Rotl | Opcode::Rotr => { + (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) + } + _ => unreachable!(), + }, + types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])), + _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), + }; + + let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { + // Mask count, according to Cranelift's semantics. + let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); + (Some(cst), None) + } else { + (None, Some(put_input_in_reg(ctx, inputs[1]))) + }; + + let dst = get_output_reg(ctx, outputs[0]); + + let shift_kind = match op { + Opcode::Ishl => ShiftKind::ShiftLeft, + Opcode::Ushr => ShiftKind::ShiftRightLogical, + Opcode::Sshr => ShiftKind::ShiftRightArithmetic, + Opcode::Rotl => ShiftKind::RotateLeft, + Opcode::Rotr => ShiftKind::RotateRight, + _ => unreachable!(), + }; + + let w_rcx = Writable::from_reg(regs::rcx()); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + if count.is_none() { + ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); + } + ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } + + Opcode::Ineg => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + if ty.is_vector() { + // Zero's out a register and then does a packed subtraction + // of the input from the register. 
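+ // In other words, each lane of dst becomes 0 - src, computed in a scratch register first.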
+ + let src = input_to_reg_mem(ctx, inputs[0]); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + + let subtract_opcode = match ty { + types::I8X16 => SseOpcode::Psubb, + types::I16X8 => SseOpcode::Psubw, + types::I32X4 => SseOpcode::Psubd, + types::I64X2 => SseOpcode::Psubq, + _ => panic!("Unsupported type for Ineg instruction, found {}", ty), + }; + + // Note we must zero out a tmp instead of using the destination register since + // the desitnation could be an alias for the source input register + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp)); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else { + let size = ty.bytes() as u8; + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::neg(size, dst)); + } + } + + Opcode::Clz => { + // TODO when the x86 flags have use_lzcnt, we can use LZCNT. + + // General formula using bit-scan reverse (BSR): + // mov -1, %dst + // bsr %src, %tmp + // cmovz %dst, %tmp + // mov $(size_bits - 1), %dst + // sub %tmp, %dst + + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Ctz => { + // TODO when the x86 flags have use_bmi1, we can use TZCNT. + + // General formula using bit-scan forward (BSF): + // bsf %src, %dst + // mov $(size_bits), %tmp + // cmovz %tmp, %dst + let ty = ctx.input_ty(insn, 0); + let ty = if ty.bits() < 32 { types::I32 } else { ty }; + debug_assert!(ty == types::I32 || ty == types::I64); + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let tmp = ctx.alloc_tmp(RegClass::I64, ty); + ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + + Opcode::Popcnt => { + // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction. 
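+ // Scalar view of the sequence emitted below (64-bit case; the 32-bit case uses the same steps with 32-bit masks and a final shift of 24):
+ //   t = (x >> 1) & 0x7777_7777_7777_7777;  x -= t;
+ //   t = (t >> 1) & 0x7777_7777_7777_7777;  x -= t;
+ //   t = (t >> 1) & 0x7777_7777_7777_7777;  x -= t;   // each nibble now holds its own bit count
+ //   x = (x + (x >> 4)) & 0x0F0F_0F0F_0F0F_0F0F;      // fold nibble counts into byte counts
+ //   x = (x * 0x0101_0101_0101_0101) >> 56;           // sum the byte counts into the top byte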
+ + let (ext_spec, ty) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + + let src = if let Some(ext_spec) = ext_spec { + RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + } else { + input_to_reg_mem(ctx, inputs[0]) + }; + let dst = get_output_reg(ctx, outputs[0]); + + if ty == types::I64 { + let is_64 = true; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + let cst = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); + + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 
4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + Opcode::IsNull | Opcode::IsInvalid => { + // Null references are represented by the constant value 0; invalid references are + // represented by the constant value -1. See `define_reftypes()` in + // `meta/src/isa/x86/encodings.rs` to confirm. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + let imm = match op { + Opcode::IsNull => { + // TODO could use tst src, src for IsNull + 0 + } + Opcode::IsInvalid => { + // We can do a 32-bit comparison even in 64-bits mode, as the constant is then + // sign-extended. + 0xffffffff + } + _ => unreachable!(), + }; + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src)); + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::Uextend + | Opcode::Sextend + | Opcode::Bint + | Opcode::Breduce + | Opcode::Bextend + | Opcode::Ireduce => { + let src_ty = ctx.input_ty(insn, 0); + let dst_ty = ctx.output_ty(insn, 0); + + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
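+ // e.g. `uextend.i64(iadd.i32 x, y)` can reuse the iadd result directly, since the 32-bit ADD already zeroed bits 63:32.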
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } + } + + Opcode::Icmp => { + let condcode = ctx.data(insn).cond_code().unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + emit_cmp(ctx, insn); + let cc = CC::from_intcc(condcode); + ctx.emit(Inst::setcc(cc, dst)); + } else { + assert_eq!(ty.bits(), 128); + let eq = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpeqb, + types::I16X8 => SseOpcode::Pcmpeqw, + types::I32X4 => SseOpcode::Pcmpeqd, + types::I64X2 => SseOpcode::Pcmpeqq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let gt = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpgtb, + types::I16X8 => SseOpcode::Pcmpgtw, + types::I32X4 => SseOpcode::Pcmpgtd, + types::I64X2 => SseOpcode::Pcmpgtq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let maxu = |ty| match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let mins = |ty| match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let minu = |ty| match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + + // Here we decide which operand to use as the read/write `dst` (ModRM reg field) + // and which to use as the read `input` (ModRM r/m field). In the normal case we + // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for + // the less-than cases so that we can reuse the greater-than implementation. + let input = match condcode { + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + lhs + } + _ => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, lhs, ty)); + rhs + } + }; + + match condcode { + IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)), + IntCC::NotEqual => { + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. 
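+ // Comparing `tmp` with itself via PCMPEQ* sets every bit; XORing `dst` with that all-ones mask turns the equality result into not-equal.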
+ let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::SignedGreaterThan | IntCC::SignedLessThan => { + ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst)) + } + IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => { + ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode), + } + } + } + + Opcode::Fcmp => { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + // Unordered is returned by setting ZF, PF, CF <- 111 + // Greater than by ZF, PF, CF <- 000 + // Less than by ZF, PF, CF <- 001 + // Equal by ZF, PF, CF <- 100 + // + // Checking the result of comiss is somewhat annoying because you don't have setcc + // instructions that explicitly check simultaneously for the condition (i.e. eq, le, + // gt, etc) *and* orderedness. + // + // So that might mean we need more than one setcc check and then a logical "and" or + // "or" to determine both, in some cases. However knowing that if the parity bit is + // set, then the result was considered unordered and knowing that if the parity bit is + // set, then both the ZF and CF flag bits must also be set we can get away with using + // one setcc for most condition codes. + + let dst = get_output_reg(ctx, outputs[0]); + + match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::setcc(cc, dst)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, dst)); + ctx.emit(Inst::alu_rmi_r( + false, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.to_reg()), + dst, + )); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else { + let op = match input_ty { + types::F32X4 => SseOpcode::Cmpps, + types::F64X2 => SseOpcode::Cmppd, + _ => panic!("Bad input type to fcmp: {}", input_ty), + }; + + // Since some packed comparisons are not available, some of the condition codes + // must be inverted, with a corresponding `flip` of the operands. 
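+ // e.g. `a > b` is evaluated as `b < a`: the operands are swapped and the LessThan immediate is used, since CMPPS/CMPPD have no direct greater-than predicate.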
+ let (imm, flip) = match cond_code { + FloatCC::GreaterThan => (FcmpImm::LessThan, true), + FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true), + FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true), + FloatCC::UnorderedOrLessThanOrEqual => { + (FcmpImm::UnorderedOrGreaterThanOrEqual, true) + } + FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => { + panic!("unsupported float condition code: {}", cond_code) + } + _ => (FcmpImm::from(cond_code), false), + }; + + // Determine the operands of the comparison, possibly by flipping them. + let (lhs, rhs) = if flip { + ( + put_input_in_reg(ctx, inputs[1]), + input_to_reg_mem(ctx, inputs[0]), + ) + } else { + ( + put_input_in_reg(ctx, inputs[0]), + input_to_reg_mem(ctx, inputs[1]), + ) + }; + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, lhs, input_ty)); + + // Emit the comparison. + ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false)); + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for i in 0..ctx.num_inputs(insn) { + let src_reg = put_input_in_reg(ctx, inputs[i]); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, src_reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Call | Opcode::CallIndirect => { + let caller_conv = ctx.abi().call_conv(); + let (mut abi, inputs) = match op { + Opcode::Call => { + let (extname, dist) = ctx.call_target(insn).unwrap(); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len(), sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_func(sig, &extname, dist, caller_conv)?, + &inputs[..], + ) + } + + Opcode::CallIndirect => { + let ptr = put_input_in_reg(ctx, inputs[0]); + let sig = ctx.call_sig(insn).unwrap(); + assert_eq!(inputs.len() - 1, sig.params.len()); + assert_eq!(outputs.len(), sig.returns.len()); + ( + X64ABICaller::from_ptr(sig, ptr, op, caller_conv)?, + &inputs[1..], + ) + } + + _ => unreachable!(), + }; + + abi.emit_stack_pre_adjust(ctx); + assert_eq!(inputs.len(), abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = put_input_in_reg(ctx, *input); + abi.emit_copy_reg_to_arg(ctx, i, arg_reg); + } + abi.emit_call(ctx); + for (i, output) in outputs.iter().enumerate() { + let retval_reg = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_reg(ctx, i, retval_reg); + } + abi.emit_stack_post_adjust(ctx); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Hlt); + } + + Opcode::Trap | Opcode::ResumableTrap => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + ctx.emit_safepoint(Inst::Ud2 { trap_code }); + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_code = ctx.data(insn).trap_code().unwrap(); + + if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() { + let cond_code = ctx.data(insn).cond_code().unwrap(); + // The flags must not have been clobbered by any other instruction between the + // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can + // simply use the flags here. 
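+ // The ADD emitted for `iadd_ifcout` already set the flags, so only the conditional trap itself needs to be emitted here.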
+ let cc = CC::from_intcc(cond_code); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else if op == Opcode::Trapif { + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cc = CC::from_intcc(cond_code); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + emit_cmp(ctx, ifcmp); + + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); + } else { + let cond_code = ctx.data(insn).fp_cond_code().unwrap(); + + // Verification ensures that the input is always a single-def ffcmp. + let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + + match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }) + } + FcmpCondResult::AndConditions(cc1, cc2) => { + // A bit unfortunate, but materialize the flags in their own register, and + // check against this. + let tmp = ctx.alloc_tmp(RegClass::I64, types::I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::setcc(cc1, tmp)); + ctx.emit(Inst::setcc(cc2, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, /* is_64 */ + AluRmiROpcode::And, + RegMemImm::reg(tmp.to_reg()), + tmp2, + )); + ctx.emit_safepoint(Inst::TrapIf { + trap_code, + cc: CC::NZ, + }); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 }); + ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 }); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + }; + }; + } + + Opcode::F64const => { + // TODO use cmpeqpd for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::F32const => { + // TODO use cmpeqps for all 1s. + let value = ctx.get_constant(insn).unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + + // Note: min and max can't be handled here, because of the way Cranelift defines them: + // if any operand is a NaN, they must return the NaN operand, while the x86 machine + // instruction will return the second operand if either operand is a NaN. 
+ let sse_op = match ty { + types::F32 => match op { + Opcode::Fadd => SseOpcode::Addss, + Opcode::Fsub => SseOpcode::Subss, + Opcode::Fmul => SseOpcode::Mulss, + Opcode::Fdiv => SseOpcode::Divss, + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fadd => SseOpcode::Addsd, + Opcode::Fsub => SseOpcode::Subsd, + Opcode::Fmul => SseOpcode::Mulsd, + Opcode::Fdiv => SseOpcode::Divsd, + _ => unreachable!(), + }, + types::F32X4 => match op { + Opcode::Fadd => SseOpcode::Addps, + Opcode::Fsub => SseOpcode::Subps, + Opcode::Fmul => SseOpcode::Mulps, + Opcode::Fdiv => SseOpcode::Divps, + _ => unreachable!(), + }, + types::F64X2 => match op { + Opcode::Fadd => SseOpcode::Addpd, + Opcode::Fsub => SseOpcode::Subpd, + Opcode::Fmul => SseOpcode::Mulpd, + Opcode::Fdiv => SseOpcode::Divpd, + _ => unreachable!(), + }, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), + }; + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } + + Opcode::Fmin | Opcode::Fmax => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let is_min = op == Opcode::Fmin; + let output_ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, output_ty)); + if !output_ty.is_vector() { + let op_size = match output_ty { + types::F32 => OperandSize::Size32, + types::F64 => OperandSize::Size64, + _ => panic!("unexpected type {:?} for fmin/fmax", output_ty), + }; + ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst)); + } else { + // X64's implementation of floating point min and floating point max does not + // propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the + // scalar approach we use jumps to handle cases where NaN and +0 propagation is + // not consistent with what is needed. However for packed floating point min and + // floating point max we implement a different approach to avoid the sequence + // of jumps that would be required on a per lane basis. Because we do not need to + // lower labels and jumps but do need ctx for creating temporaries we implement + // the lowering here in lower.rs instead of emit.rs as is done in the case for scalars. + // The outline of approach is as follows: + // + // First we preform the Min/Max in both directions. This is because in the + // case of an operand's lane containing a NaN or in the case of the lanes of the + // two operands containing 0 but with mismatched signs, x64 will return the second + // operand regardless of its contents. So in order to make sure we capture NaNs and + // normalize NaNs and 0 values we capture the operation in both directions and merge the + // results. Then we normalize the results through operations that create a mask for the + // lanes containing NaNs, we use that mask to adjust NaNs to quite NaNs and normalize + // 0s. 
+ // + // The following sequence is generated for min: + // + // movap{s,d} %lhs, %tmp + // minp{s,d} %dst, %tmp + // minp,{s,d} %lhs, %dst + // orp{s,d} %dst, %tmp + // cmpp{s,d} %tmp, %dst, $3 + // orps{s,d} %dst, %tmp + // psrl{s,d} {$10, $13}, %dst + // andnp{s,d} %tmp, %dst + // + // and for max the sequence is: + // + // movap{s,d} %lhs, %tmp + // minp{s,d} %dst, %tmp + // minp,{s,d} %lhs, %dst + // xorp{s,d} %tmp, %dst + // orp{s,d} %dst, %tmp + // subp{s,d} %dst, %tmp + // cmpp{s,d} %tmp, %dst, $3 + // psrl{s,d} {$10, $13}, %dst + // andnp{s,d} %tmp, %dst + + if is_min { + let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) = + match output_ty { + types::F32X4 => ( + SseOpcode::Movaps, + SseOpcode::Minps, + SseOpcode::Orps, + SseOpcode::Cmpps, + SseOpcode::Psrld, + 10, + SseOpcode::Andnps, + ), + types::F64X2 => ( + SseOpcode::Movapd, + SseOpcode::Minpd, + SseOpcode::Orpd, + SseOpcode::Cmppd, + SseOpcode::Psrlq, + 13, + SseOpcode::Andnpd, + ), + _ => unimplemented!("unsupported op type {:?}", output_ty), + }; + + // Copy lhs into tmp + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, output_ty); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1)); + + // Perform min in reverse direction + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1)); + + // Perform min in original direction + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst)); + + // X64 handles propagation of -0's and Nans differently between left and right + // operands. After doing the min in both directions, this OR will + // guarrentee capture of -0's and Nan in our tmp register + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1)); + + // Compare unordered to create mask for lanes containing NaNs and then use + // that mask to saturate the NaN containing lanes in the tmp register with 1s. + // TODO: Would a check for NaN and then a jump be better here in the + // common case than continuing on to normalize NaNs that might not exist? + let cond = FcmpImm::from(FloatCC::Unordered); + ctx.emit(Inst::xmm_rm_r_imm( + cmp_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + cond.encode(), + false, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); + + // The dst register holds a mask for lanes containing NaNs. + // We take that mask and shift in preparation for creating a different mask + // to normalize NaNs (create a quite NaN) by zeroing out the appropriate + // number of least signficant bits. We shift right each lane by 10 bits + // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign + + // 11 exp. + 1 MSB sig.) for F64X2. + ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst)); + + // Finally we do a nand with the tmp register to produce the final results + // in the dst. + ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + } else { + let ( + mov_op, + max_op, + xor_op, + or_op, + sub_op, + cmp_op, + shift_op, + shift_by, + andn_op, + ) = match output_ty { + types::F32X4 => ( + SseOpcode::Movaps, + SseOpcode::Maxps, + SseOpcode::Xorps, + SseOpcode::Orps, + SseOpcode::Subps, + SseOpcode::Cmpps, + SseOpcode::Psrld, + 10, + SseOpcode::Andnps, + ), + types::F64X2 => ( + SseOpcode::Movapd, + SseOpcode::Maxpd, + SseOpcode::Xorpd, + SseOpcode::Orpd, + SseOpcode::Subpd, + SseOpcode::Cmppd, + SseOpcode::Psrlq, + 13, + SseOpcode::Andnpd, + ), + _ => unimplemented!("unsupported op type {:?}", output_ty), + }; + + // Copy lhs into tmp. 
+ let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
+ ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
+
+ // Perform max in reverse direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Perform max in original direction.
+ ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+
+ // Get the difference between the two results and store it in dst.
+ // Max uses a different approach than min to account for potential
+ // discrepancies with plus/minus 0.
+ ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+
+ // X64 handles propagation of -0's and NaNs differently between left and right
+ // operands. After doing the max in both directions, this OR will
+ // guarantee capture of -0's and NaNs in our tmp register.
+ ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Capture NaNs and sign discrepancies.
+ ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+
+ // Compare unordered to create a mask for lanes containing NaNs, and then use
+ // that mask to saturate the NaN-containing lanes in the tmp register with 1s.
+ let cond = FcmpImm::from(FloatCC::Unordered);
+ ctx.emit(Inst::xmm_rm_r_imm(
+ cmp_op,
+ RegMem::reg(tmp_xmm1.to_reg()),
+ dst,
+ cond.encode(),
+ false,
+ ));
+
+ // The dst register holds a mask for lanes containing NaNs.
+ // We take that mask and shift it in preparation for creating a different mask
+ // to normalize NaNs (create a quiet NaN) by zeroing out the appropriate
+ // number of least significant bits. We shift right each lane by 10 bits
+ // (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
+ // 11 exp. + 1 MSB sig.) for F64X2.
+ ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
+
+ // Finally we do an AND-NOT with the tmp register to produce the final results
+ // in the dst.
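+ // (ANDN computes `!dst & tmp`: after the shift, `!dst` keeps only the sign,
+ // exponent and top significand bit in NaN lanes, turning them into quiet NaNs,
+ // while letting non-NaN lanes of tmp pass through unchanged.)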
+ ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + } + } + } + + Opcode::FminPseudo | Opcode::FmaxPseudo => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + let sse_opcode = match (ty, op) { + (types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps, + (types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps, + (types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd, + (types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd, + _ => unimplemented!("unsupported type {} for {}", ty, op), + }; + ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst)); + } + + Opcode::Sqrt => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let sse_op = match ty { + types::F32 => SseOpcode::Sqrtss, + types::F64 => SseOpcode::Sqrtsd, + types::F32X4 => SseOpcode::Sqrtps, + types::F64X2 => SseOpcode::Sqrtpd, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), + }; + + ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst)); + } + + Opcode::Fpromote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); + } + + Opcode::Fdemote => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); + } + + Opcode::FcvtFromSint => { + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (ext_spec, src_size) = match ctx.input_ty(insn, 0) { + types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32), + types::I32 => (None, OperandSize::Size32), + types::I64 => (None, OperandSize::Size64), + _ => unreachable!(), + }; + + let src = match ext_spec { + Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), + None => input_to_reg_mem(ctx, inputs[0]), + }; + + let opcode = if output_ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(output_ty, types::F64); + SseOpcode::Cvtsi2sd + }; + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst)); + } else { + let ty = ty.unwrap(); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let opcode = match ctx.input_ty(insn, 0) { + types::I32X4 => SseOpcode::Cvtdq2ps, + _ => { + unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op) + } + }; + ctx.emit(Inst::gen_move(dst, src, ty)); + ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst)); + } + } + + Opcode::FcvtFromUint => { + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + let input_ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + match input_ty { + types::I8 | types::I16 | types::I32 => { + // Conversion from an unsigned int smaller than 64-bit is easy: zero-extend + + // do a signed conversion (which won't overflow). 
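+ // For example, a u32 input of 0xffff_ffff zero-extends to
+ // 0x0000_0000_ffff_ffff, which CVTSI2SS/CVTSI2SD convert exactly when read
+ // as a signed 64-bit integer.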
+ let opcode = if ty == types::F32 { + SseOpcode::Cvtsi2ss + } else { + assert_eq!(ty, types::F64); + SseOpcode::Cvtsi2sd + }; + + let src = RegMem::reg(extend_input_to_reg( + ctx, + inputs[0], + ExtSpec::ZeroExtendTo64, + )); + ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst)); + } + + types::I64 => { + let src = put_input_in_reg(ctx, inputs[0]); + + let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(src_copy, src, types::I64)); + + let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64); + let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::cvt_u64_to_float_seq( + ty == types::F64, + src_copy, + tmp_gpr1, + tmp_gpr2, + dst, + )); + } + _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty), + }; + } else { + // Converting packed unsigned integers to packed floats requires a few steps. + // There is no single instruction lowering for converting unsigned floats but there + // is for converting packed signed integers to float (cvtdq2ps). In the steps below + // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and + // then we convert each half separately using cvtdq2ps meant for signed integers. + // In order for this to work for the upper half bits we must shift right by 1 + // (divide by 2) these bits in order to ensure the most significant bit is 0 not + // signed, and then after the conversion we double the value. Finally we add the + // converted values where addition will correctly round. + // + // Sequence: + // -> A = 0xffffffff + // -> Ah = 0xffff0000 + // -> Al = 0x0000ffff + // -> Convert(Al) // Convert int to float + // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed + // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift + // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. + // -> dst = Ah + Al // Add the two floats together + + assert_eq!(ctx.input_ty(insn, 0), types::I32X4); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // Create a temporary register + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Get the low 16 bits + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp)); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp)); + + // Get the high 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst)); + + // Convert the low 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp)); + + // Shift the high bits by 1, convert, and double to get the correct value. + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Add together the two converted values. 
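+ // At this point `tmp` holds the conversion of the low halves and `dst` holds
+ // twice the conversion of (high halves >> 1), i.e. the value of the high
+ // halves; the final add below combines them.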
+ ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(tmp.to_reg()), + dst, + )); + } + } + + Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + let input_ty = ctx.input_ty(insn, 0); + if !input_ty.is_vector() { + let src_size = if input_ty == types::F32 { + OperandSize::Size32 + } else { + assert_eq!(input_ty, types::F64); + OperandSize::Size64 + }; + + let output_ty = ty.unwrap(); + let dst_size = if output_ty == types::I32 { + OperandSize::Size32 + } else { + assert_eq!(output_ty, types::I64); + OperandSize::Size64 + }; + + let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; + let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; + + let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty); + ctx.emit(Inst::gen_move(src_copy, src, input_ty)); + + let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); + let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); + + if to_signed { + ctx.emit(Inst::cvt_float_to_sint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } else { + ctx.emit(Inst::cvt_float_to_uint_seq( + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, + )); + } + } else { + if op == Opcode::FcvtToSintSat { + // Sets destination to zero if float is NaN + let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, input_ty)); + let cond = FcmpImm::from(FloatCC::Equal); + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Andps, + RegMem::reg(tmp.to_reg()), + dst, + )); + + // Sets top bit of tmp if float is positive + // Setting up to set top bit on negative float values + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(dst.to_reg()), + tmp, + )); + + // Convert the packed float to packed doubleword. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Cvttps2dq, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Set top bit only if < 0 + // Saturate lane with sign (top) bit. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pand, + RegMem::reg(dst.to_reg()), + tmp, + )); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp)); + + // On overflow 0x80000000 is returned to a lane. + // Below sets positive overflow lanes to 0x7FFFFFFF + // Keeps negative overflow lanes as is. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(tmp.to_reg()), + dst, + )); + } else if op == Opcode::FcvtToUintSat { + unimplemented!("f32x4.convert_i32x4_u"); + } else { + // Since this branch is also guarded by a check for vector types + // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here + // due to vector varients not existing. The first two branches will + // cover all reachable cases. 
+ unreachable!(); + } + } + } + + Opcode::Bitcast => { + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + match (input_ty, output_ty) { + (types::F32, types::I32) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movd, + src, + dst, + OperandSize::Size32, + )); + } + (types::I32, types::F32) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movd, + src, + OperandSize::Size32, + dst, + )); + } + (types::F64, types::I64) => { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Movq, + src, + dst, + OperandSize::Size64, + )); + } + (types::I64, types::F64) => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movq, + src, + OperandSize::Size64, + dst, + )); + } + _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty), + } + } + + Opcode::Fabs | Opcode::Fneg => { + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + // In both cases, generate a constant and apply a single binary instruction: + // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the + // src with it. + // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the + // src with it. + let output_ty = ty.unwrap(); + if !output_ty.is_vector() { + let (val, opcode) = match output_ty { + types::F32 => match op { + Opcode::Fabs => (0x7fffffff, SseOpcode::Andps), + Opcode::Fneg => (0x80000000, SseOpcode::Xorps), + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd), + Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd), + _ => unreachable!(), + }, + _ => panic!("unexpected type {:?} for Fabs", output_ty), + }; + + for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + + ctx.emit(Inst::xmm_rm_r(opcode, src, dst)); + } else { + // Eventually vector constants should be available in `gen_constant` and this block + // can be merged with the one above (TODO). + if output_ty.bits() == 128 { + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move(dst, src, output_ty)); + + // Generate an all 1s constant in an XMM register. This uses CMPPS but could + // have used CMPPD with the same effect. + let tmp = ctx.alloc_tmp(RegClass::V128, output_ty); + let cond = FcmpImm::from(FloatCC::Equal); + let cmpps = Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + RegMem::reg(tmp.to_reg()), + tmp, + cond.encode(), + false, + ); + ctx.emit(cmpps); + + // Shift the all 1s constant to generate the mask. 
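+ // For example, a vector Fneg on F32X4 shifts the all-ones lanes left by 31
+ // (PSLLD), leaving 0x80000000 in each lane, and the XORPS below then flips
+ // each sign bit.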
+ let lane_bits = output_ty.lane_bits(); + let (shift_opcode, opcode, shift_by) = match (op, lane_bits) { + (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1), + (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1), + (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31), + (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63), + _ => unreachable!( + "unexpected opcode and lane size: {:?}, {} bits", + op, lane_bits + ), + }; + let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp); + ctx.emit(shift); + + // Apply shifted mask (XOR or AND). + let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst); + ctx.emit(mask); + } else { + panic!("unexpected type {:?} for Fabs", output_ty); + } + } + } + + Opcode::Fcopysign => { + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + + let ty = ty.unwrap(); + + // We're going to generate the following sequence: + // + // movabs $INT_MIN, tmp_gpr1 + // mov{d,q} tmp_gpr1, tmp_xmm1 + // movap{s,d} tmp_xmm1, dst + // andnp{s,d} src_1, dst + // movap{s,d} src_2, tmp_xmm2 + // andp{s,d} tmp_xmm1, tmp_xmm2 + // orp{s,d} tmp_xmm2, dst + + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32); + let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32); + + let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty { + types::F32 => ( + 0x8000_0000, + SseOpcode::Movaps, + SseOpcode::Andnps, + SseOpcode::Andps, + SseOpcode::Orps, + ), + types::F64 => ( + 0x8000_0000_0000_0000, + SseOpcode::Movapd, + SseOpcode::Andnpd, + SseOpcode::Andpd, + SseOpcode::Orpd, + ), + _ => { + panic!("unexpected type {:?} for copysign", ty); + } + }; + + for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); + } + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2)); + ctx.emit(Inst::xmm_rm_r( + and_op, + RegMem::reg(tmp_xmm1.to_reg()), + tmp_xmm2, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst)); + } + + Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => { + // TODO use ROUNDSS/ROUNDSD after sse4.1. + + // Lower to VM calls when there's no access to SSE4.1. 
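+ // For example, `ceil` on an F32 value becomes a call to `LibCall::CeilF32`
+ // through `emit_vm_call` below.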
+ let ty = ty.unwrap(); + let libcall = match (ty, op) { + (types::F32, Opcode::Ceil) => LibCall::CeilF32, + (types::F64, Opcode::Ceil) => LibCall::CeilF64, + (types::F32, Opcode::Floor) => LibCall::FloorF32, + (types::F64, Opcode::Floor) => LibCall::FloorF64, + (types::F32, Opcode::Nearest) => LibCall::NearestF32, + (types::F64, Opcode::Nearest) => LibCall::NearestF64, + (types::F32, Opcode::Trunc) => LibCall::TruncF32, + (types::F64, Opcode::Trunc) => LibCall::TruncF64, + _ => panic!( + "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc", + ty, op + ), + }; + + emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?; + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + types::I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => types::I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => types::I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + _ => unimplemented!(), + }; + + let ext_mode = ExtMode::new(elem_ty.bits(), 64); + + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + + let amode = match op { + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 => { + assert_eq!(inputs.len(), 1, "only one input for load operands"); + lower_to_amode(ctx, inputs[0], offset) + } + + Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + assert_eq!( + inputs.len(), + 2, + "can't handle more than two inputs in complex load" + ); + let base = put_input_in_reg(ctx, inputs[0]); + let index = put_input_in_reg(ctx, inputs[1]); + let shift = 0; + let flags = ctx.memflags(insn).expect("load should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let dst = get_output_reg(ctx, outputs[0]); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. + ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); + } + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. 
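+ // (MOVZX writes the whole destination register, so the narrow load leaves no
+ // stale upper bits behind and avoids a partial-register dependency.)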
+ ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!("unexpected type for load: {:?}", elem_ty), + }); + } + } + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => types::I8, + Opcode::Istore16 | Opcode::Istore16Complex => types::I16, + Opcode::Istore32 | Opcode::Istore32Complex => types::I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + + let addr = match op { + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => { + assert_eq!(inputs.len(), 2, "only one input for store memory operands"); + lower_to_amode(ctx, inputs[1], offset) + } + + Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + assert_eq!( + inputs.len(), + 3, + "can't handle more than two inputs in complex store" + ); + let base = put_input_in_reg(ctx, inputs[1]); + let index = put_input_in_reg(ctx, inputs[2]); + let shift = 0; + let flags = ctx.memflags(insn).expect("store should have memflags"); + Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags) + } + + _ => unreachable!(), + }; + + let src = put_input_in_reg(ctx, inputs[0]); + + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. + Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } + + Opcode::AtomicRmw => { + // This is a simple, general-case atomic update, based on a loop involving + // `cmpxchg`. Note that we could do much better than this in the case where the old + // value at the location (that is to say, the SSA `Value` computed by this CLIF + // instruction) is not required. In that case, we could instead implement this + // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in + // the case where the old value is required, for the `add` and `sub` cases, we can + // use the single instruction `lock xadd`. However, those improvements have been + // left for another day. + // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 + let dst = get_output_reg(ctx, outputs[0]); + let mut addr = put_input_in_reg(ctx, inputs[0]); + let mut arg2 = put_input_in_reg(ctx, inputs[1]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Make sure that both args are in virtual regs, since in effect we have to do a + // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not + // guaranteed safe if either is in a real reg. 
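+ // The emitted sequence is roughly:
+ //
+ // mov addr, %r9
+ // mov arg2, %r10
+ // AtomicRmwSeq (a cmpxchg-based loop; leaves the old value in %rax)
+ // mov %rax, dst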
+ addr = ctx.ensure_in_vreg(addr, types::I64); + arg2 = ctx.ensure_in_vreg(arg2, types::I64); + + // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq` + // operates at whatever width is specified by `ty`, so there's no need to + // zero-extend `arg2` in the case of `ty` being I8/I16/I32. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r9()), + addr, + types::I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r10()), + arg2, + types::I64, + )); + + // Now the AtomicRmwSeq (pseudo-) instruction itself + let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap()); + ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op }); + + // And finally, copy the preordained AtomicRmwSeq output reg to its destination. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the `AtomicRmw` case. As with + // `AtomicRmw`, there's no need to zero-extend narrow values here. + let dst = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let expected = put_input_in_reg(ctx, inputs[1]); + let replacement = put_input_in_reg(ctx, inputs[2]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + // Move the expected value into %rax. Because there's only one fixed register on + // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the + // `AtomicRmw` case. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + expected, + types::I64, + )); + ctx.emit(Inst::LockCmpxchg { + ty: ty_access, + src: replacement, + dst: addr.into(), + }); + // And finally, copy the old value at the location to its destination reg. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicLoad => { + // This is a normal load. The x86-TSO memory model provides sufficient sequencing + // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the + // need for any fence instructions. + let data = get_output_reg(ctx, outputs[0]); + let addr = lower_to_amode(ctx, inputs[0], 0); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + let rm = RegMem::mem(addr); + if ty_access == types::I64 { + ctx.emit(Inst::mov64_rm_r(rm, data)); + } else { + let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!( + "invalid extension during AtomicLoad: {} -> {}", + ty_access.bits(), + 64 + )); + ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data)); + } + } + + Opcode::AtomicStore => { + // This is a normal store, followed by an `mfence` instruction. 
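+ // Under x86-TSO a plain MOV store may be reordered with later loads, so the
+ // trailing MFENCE is what provides the ordering CLIF requires for
+ // `AtomicStore`.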
+ let data = put_input_in_reg(ctx, inputs[0]); + let addr = lower_to_amode(ctx, inputs[1], 0); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + + ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr)); + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::FuncAddr => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _) = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset: 0, + }); + } + + Opcode::SymbolValue => { + let dst = get_output_reg(ctx, outputs[0]); + let (extname, _, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + ctx.emit(Inst::LoadExtName { + dst, + name: Box::new(extname), + offset, + }); + } + + Opcode::StackAddr => { + let (stack_slot, offset) = match *ctx.data(insn) { + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot, + offset, + } => (stack_slot, offset), + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + let offset: i32 = offset.into(); + let inst = ctx + .abi() + .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst); + ctx.emit(inst); + } + + Opcode::Select => { + let flag_input = inputs[0]; + if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + + // For equal, we flip the operands, because we can't test a conjunction of + // CPU flags with a single cmove; see InvertedEqualOrConditions doc comment. + let (lhs_input, rhs_input) = match cond_code { + FloatCC::Equal => (inputs[2], inputs[1]), + _ => (inputs[1], inputs[2]), + }; + + let ty = ctx.output_ty(insn, 0); + let rhs = put_input_in_reg(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + // Sign-extend operands to 32, then do a cmove of size 4. + RegMem::reg(put_input_in_reg(ctx, lhs_input)) + } else { + input_to_reg_mem(ctx, lhs_input) + }; + + // We request inversion of Equal to NotEqual here: taking LHS if equal would mean + // take it if both CC::NP and CC::Z are set, the conjunction of which can't be + // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the + // select operation, and invert the equal to a not-equal here. + let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual); + + if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results { + // Keep this sync'd with the lowering of the select inputs above. 
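+ // (An `InvertedEqualOrConditions` result can only come back when we asked
+ // `emit_fcmp` to invert an Equal comparison, hence the assert below.)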
+ assert_eq!(cond_code, FloatCC::Equal); + } + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + match fcmp_results { + FcmpCondResult::Condition(cc) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + FcmpCondResult::AndConditions(_, _) => { + unreachable!( + "can't AND with select; see above comment about inverting equal" + ); + } + FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) + | FcmpCondResult::OrConditions(cc1, cc2) => { + if is_int_or_ref_ty(ty) { + let size = u8::max(ty.bytes() as u8, 4); + ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); + ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + } else { + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + } + } + } + } else { + let ty = ty.unwrap(); + + let mut size = ty.bytes() as u8; + let lhs = if is_int_or_ref_ty(ty) { + if size < 4 { + // Special case: since the higher bits are undefined per CLIF semantics, we + // can just apply a 32-bit cmove here. Force inputs into registers, to + // avoid partial spilling out-of-bounds with memory accesses, though. + size = 4; + RegMem::reg(put_input_in_reg(ctx, inputs[1])) + } else { + input_to_reg_mem(ctx, inputs[1]) + } + } else { + input_to_reg_mem(ctx, inputs[1]) + }; + + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + + let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + let cond_code = ctx.data(icmp).cond_code().unwrap(); + CC::from_intcc(cond_code) + } else { + // The input is a boolean value, compare it against zero. + let size = ctx.input_ty(insn, 0).bytes() as u8; + let test = put_input_in_reg(ctx, flag_input); + ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test)); + CC::NZ + }; + + // This doesn't affect the flags. + ctx.emit(Inst::gen_move(dst, rhs, ty)); + + if is_int_or_ref_ty(ty) { + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + } + + Opcode::Selectif | Opcode::SelectifSpectreGuard => { + let lhs = input_to_reg_mem(ctx, inputs[1]); + let rhs = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ctx.output_ty(insn, 0); + + // Verification ensures that the input is always a single-def ifcmp. + let cmp_insn = ctx + .get_input(inputs[0].insn, inputs[0].input) + .inst + .unwrap() + .0; + debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); + emit_cmp(ctx, cmp_insn); + + let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + + if is_int_or_ref_ty(ty) { + let size = ty.bytes() as u8; + if size == 1 { + // Sign-extend operands to 32, then do a cmove of size 4. 
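+ // (CMOVcc has no 8-bit form, so byte-sized selects are widened to 32 bits
+ // before the conditional move.)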
+ let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); + ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); + ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); + } else { + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::cmove(size, cc, lhs, dst)); + } + } else { + debug_assert!(ty == types::F32 || ty == types::F64); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + } + } + + Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => { + let kind = match op { + Opcode::Udiv => DivOrRemKind::UnsignedDiv, + Opcode::Sdiv => DivOrRemKind::SignedDiv, + Opcode::Urem => DivOrRemKind::UnsignedRem, + Opcode::Srem => DivOrRemKind::SignedRem, + _ => unreachable!(), + }; + let is_div = kind.is_div(); + + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let dividend = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + dividend, + input_ty, + )); + + if flags.avoid_div_traps() { + // A vcode meta-instruction is used to lower the inline checks, since they embed + // pc-relative offsets that must not change, thus requiring regalloc to not + // interfere by introducing spills and reloads. + // + // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that + // regalloc is aware of the coalescing opportunity between rax/rdx and the + // destination register. + let divisor = put_input_in_reg(ctx, inputs[1]); + + let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64); + ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64)); + + let tmp = if op == Opcode::Sdiv && size == 8 { + Some(ctx.alloc_tmp(RegClass::I64, types::I64)) + } else { + None + }; + // TODO use xor + ctx.emit(Inst::imm( + OperandSize::Size32, + 0, + Writable::from_reg(regs::rdx()), + )); + ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp)); + } else { + let divisor = input_to_reg_mem(ctx, inputs[1]); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for + // signed opcodes. + ctx.emit(Inst::sign_extend_data(size)); + } else if input_ty == types::I8 { + ctx.emit(Inst::movzx_rm_r( + ExtMode::BL, + RegMem::reg(regs::rax()), + Writable::from_reg(regs::rax()), + )); + } else { + // zero for unsigned opcodes. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0, + Writable::from_reg(regs::rdx()), + )); + } + + // Emit the actual idiv. + ctx.emit(Inst::div(size, kind.is_signed(), divisor)); + } + + // Move the result back into the destination reg. + if is_div { + // The quotient is in rax. + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + } + + Opcode::Umulhi | Opcode::Smulhi => { + let input_ty = ctx.input_ty(insn, 0); + let size = input_ty.bytes() as u8; + + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // Move lhs in %rax. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs, + input_ty, + )); + + // Emit the actual mul or imul. + let signed = op == Opcode::Smulhi; + ctx.emit(Inst::mul_hi(size, signed, rhs)); + + // Read the result from the high part (stored in %rdx). 
+ ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } + + Opcode::GetPinnedReg => { + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64)); + } + + Opcode::SetPinnedReg => { + let src = put_input_in_reg(ctx, inputs[0]); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::pinned_reg()), + src, + types::I64, + )); + } + + Opcode::Vconst => { + let used_constant = if let &InstructionData::UnaryConst { + constant_handle, .. + } = ctx.data(insn) + { + ctx.use_constant(VCodeConstantData::Pool( + constant_handle, + ctx.get_constant_data(constant_handle).clone(), + )) + } else { + unreachable!("vconst should always have unary_const format") + }; + // TODO use Inst::gen_constant() instead. + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::xmm_load_const(used_constant, dst, ty)); + } + + Opcode::RawBitcast => { + // A raw_bitcast is just a mechanism for correcting the type of V128 values (see + // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR + // instruction should emit no machine code but a move is necessary to give the register + // allocator a definition for the output virtual register. + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); + } + + Opcode::Shuffle => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let lhs_ty = ctx.input_ty(insn, 0); + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let mask = match ctx.get_immediate(insn) { + Some(DataValue::V128(bytes)) => bytes.to_vec(), + _ => unreachable!("shuffle should always have a 16-byte immediate"), + }; + + // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a + // 1 in the most significant position zeroes the lane. + let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; + + ctx.emit(Inst::gen_move(dst, rhs, ty)); + if rhs == lhs { + // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM + // register. We statically build `constructed_mask` to zero out any unknown lane + // indices (may not be completely necessary: verification could fail incorrect mask + // values) and fix the indexes to all point to the `dst` vector. + let constructed_mask = mask + .iter() + // If the mask is greater than 15 it still may be referring to a lane in b. + .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp, ty)); + // After loading the constructed mask in a temporary register, we use this to + // shuffle the `dst` register (remember that, in this case, it is the same as + // `src` so we disregard this register). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); + } else { + // If `lhs` and `rhs` are different, we must shuffle each separately and then OR + // them together. This is necessary due to PSHUFB semantics. As in the case above, + // we build the `constructed_mask` for each case statically. + + // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. 
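+ // Mask bytes 0..15 select lanes of `lhs`; bytes 16..31 refer to `rhs` and are
+ // zeroed in this first PSHUFB, to be filled in by the second PSHUFB below.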
+ let tmp0 = ctx.alloc_tmp(RegClass::V128, lhs_ty); + ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); + let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); + + // PSHUFB the second argument, placing zeroes for unused lanes. + let constructed_mask = mask + .iter() + .map(|b| b.wrapping_sub(16)) + .map(zero_unknown_lane_index) + .collect(); + let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); + let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16); + ctx.emit(Inst::xmm_load_const(constant, tmp2, ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); + + // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers + // is not important). + ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); + + // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB + } + } + + Opcode::Swizzle => { + // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec + // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For + // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF + // semantics match the Wasm SIMD semantics for this instruction. + // The instruction format maps to variables like: %dst = swizzle %src, %mask + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src = put_input_in_reg(ctx, inputs[0]); + let swizzle_mask = put_input_in_reg(ctx, inputs[1]); + + // Inform the register allocator that `src` and `dst` should be in the same register. + ctx.emit(Inst::gen_move(dst, src, ty)); + + // Create a mask for zeroing out-of-bounds lanes of the swizzle mask. + let zero_mask = ctx.alloc_tmp(RegClass::V128, types::I8X16); + static ZERO_MASK_VALUE: [u8; 16] = [ + 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x70, + ]; + let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE)); + ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty)); + + // Use the `zero_mask` on a writable `swizzle_mask`. + let swizzle_mask = Writable::from_reg(swizzle_mask); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddusb, + RegMem::from(zero_mask), + swizzle_mask, + )); + + // Shuffle `dst` using the fixed-up `swizzle_mask`. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(swizzle_mask), + dst, + )); + } + + Opcode::Insertlane => { + // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let in_vec = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 1); + debug_assert!(!src_ty.is_vector()); + let src = input_to_reg_mem(ctx, inputs[1]); + let lane = if let InstructionData::TernaryImm8 { imm, .. 
} = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < ty.lane_count() as u8); + + ctx.emit(Inst::gen_move(dst, in_vec, ty)); + emit_insert_lane(ctx, src, dst, lane, ty.lane_type()); + } + + Opcode::Extractlane => { + // The instruction format maps to variables like: %dst = extractlane %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < src_ty.lane_count() as u8); + + if !ty.is_float() { + let (sse_op, w_bit) = match ty.lane_bits() { + 8 => (SseOpcode::Pextrb, false), + 16 => (SseOpcode::Pextrw, false), + 32 => (SseOpcode::Pextrd, false), + 64 => (SseOpcode::Pextrd, true), + _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + } else { + if lane == 0 { + // Remove the extractlane instruction, leaving the float where it is. The upper + // bits will remain unchanged; for correctness, this relies on Cranelift type + // checking to avoid using those bits. + ctx.emit(Inst::gen_move(dst, src, ty)); + } else { + // Otherwise, shuffle the bits in `lane` to the lowest lane. + let sse_op = SseOpcode::Pshufd; + let mask = match src_ty { + // Move the value at `lane` to lane 0, copying existing value at lane 0 to + // other lanes. Again, this relies on Cranelift type checking to avoid + // using those bits. + types::F32X4 => 0b00_00_00_00 | lane, + // Move the value at `lane` 1 (we know it must be 1 because of the `if` + // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type + // checking assumption also applies here. + types::F64X2 => 0b11_10_11_10, + _ => unreachable!(), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false)); + } + } + } + + Opcode::Splat | Opcode::LoadSplat => { + let ty = ty.unwrap(); + assert_eq!(ty.bits(), 128); + let src_ty = ctx.input_ty(insn, 0); + assert!(src_ty.bits() < 128); + + let src = match op { + Opcode::Splat => input_to_reg_mem(ctx, inputs[0]), + Opcode::LoadSplat => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + let amode = lower_to_amode(ctx, inputs[0], offset); + RegMem::mem(amode) + } + _ => unreachable!(), + }; + let dst = get_output_reg(ctx, outputs[0]); + + // We know that splat will overwrite all of the lanes of `dst` but it takes several + // instructions to do so. Because of the multiple instructions, there is no good way to + // declare `dst` a `def` except with the following pseudo-instruction. + ctx.emit(Inst::xmm_uninit_value(dst)); + + // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST* + // and VPBROADCAST*. + match ty.lane_bits() { + 8 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Shuffle the lowest byte lane to all other lanes. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) + } + 16 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + // Shuffle the lowest two lanes to all other lanes. 
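+ // (PSHUFD with an immediate of 0 broadcasts the low 32-bit lane, which now
+ // holds two copies of the 16-bit value, to every lane of `dst`.)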
+ ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 32 => { + emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + // Shuffle the lowest lane to all other lanes. + ctx.emit(Inst::xmm_rm_r_imm( + SseOpcode::Pshufd, + RegMem::from(dst), + dst, + 0, + false, + )) + } + 64 => { + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + } + _ => panic!("Invalid type to splat: {}", ty), + } + } + + Opcode::VanyTrue => { + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src)); + // If the ZF is not set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::NZ, dst)); + } + + Opcode::VallTrue => { + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = input_to_reg_mem(ctx, inputs[0]); + + let eq = |ty: Type| match ty.lane_bits() { + 8 => SseOpcode::Pcmpeqb, + 16 => SseOpcode::Pcmpeqw, + 32 => SseOpcode::Pcmpeqd, + 64 => SseOpcode::Pcmpeqq, + _ => panic!("Unable to find an instruction for {} for type: {}", op, ty), + }; + + // Initialize a register with all 0s. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Compare to see what lanes are filled with all 1s. + ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); + // Set the ZF if the result is all zeroes. + ctx.emit(Inst::xmm_cmp_rm_r( + SseOpcode::Ptest, + RegMem::from(tmp), + tmp.to_reg(), + )); + // If the ZF is set, place a 1 in `dst`. + ctx.emit(Inst::setcc(CC::Z, dst)); + } + + Opcode::VhighBits => { + let src = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 0); + debug_assert!(src_ty.is_vector() && src_ty.bits() == 128); + let dst = get_output_reg(ctx, outputs[0]); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + + // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for + // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode, + // the instruction can access additional registers when used with a REX.R prefix. The + // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development + // Manual, vol. 2). This being the case, we will always clear REX.W since its use is + // unnecessary (`OperandSize` is used for setting/clearing REX.W). + let size = OperandSize::Size32; + + match src_ty { + types::I8X16 | types::B8X16 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size)) + } + types::I32X4 | types::B32X4 | types::F32X4 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size)) + } + types::I64X2 | types::B64X2 | types::F64X2 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size)) + } + types::I16X8 | types::B16X8 => { + // There is no x86 instruction for extracting the high bit of 16-bit lanes so + // here we: + // - duplicate the 16-bit lanes of `src` into 8-bit lanes: + // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] + // - use PMOVMSKB to gather the high bits; now we have duplicates, though + // - shift away the bottom 8 high bits to remove the duplicates. 
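+ // For example, PMOVMSKB then yields 16 mask bits in which bits 8..15
+ // duplicate bits 0..7, so the final shift right by 8 discards one copy.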
+ let tmp = ctx.alloc_tmp(RegClass::V128, src_ty); + ctx.emit(Inst::gen_move(tmp, src, src_ty)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp)); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Pmovmskb, + tmp.to_reg(), + dst, + size, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst)); + } + _ => unimplemented!("unknown input type {} for {}", src_ty, op), + } + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + _ => unimplemented!("unimplemented lowering for opcode {:?}", op), + } + + Ok(()) +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for X64Backend { + type MInst = Inst; + + fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple) + } + + fn lower_branch_group<C: LowerCtx<I = Inst>>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + trace!( + "lowering two-branch group: opcodes are {:?} and {:?}", + op0, + op1 + ); + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + + let taken = targets[0]; + let not_taken = match op1 { + Opcode::Jump => targets[1], + Opcode::Fallthrough => fallthrough.unwrap(), + _ => unreachable!(), // assert above. 
+ }; + + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + + let src_ty = ctx.input_ty(branches[0], 0); + + if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { + emit_cmp(ctx, icmp); + + let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + + let cc = CC::from_intcc(cond_code); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) { + let cond_code = ctx.data(fcmp).fp_cond_code().unwrap(); + let cond_code = if op0 == Opcode::Brz { + cond_code.inverse() + } else { + cond_code + }; + match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) { + FcmpCondResult::Condition(cc) => { + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } + FcmpCondResult::AndConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1.invert(), not_taken)); + ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken)); + } + FcmpCondResult::OrConditions(cc1, cc2) => { + ctx.emit(Inst::jmp_if(cc1, taken)); + ctx.emit(Inst::jmp_cond(cc2, taken, not_taken)); + } + FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), + } + } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let src = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let cc = match op0 { + Opcode::Brz => CC::Z, + Opcode::Brnz => CC::NZ, + _ => unreachable!(), + }; + let size_bytes = src_ty.bytes() as u8; + ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("brz/brnz with non-int type {:?}", src_ty); + } + } + + Opcode::BrIcmp => { + let src_ty = ctx.input_ty(branches[0], 0); + if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { + let lhs = put_input_in_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let rhs = input_to_reg_mem_imm( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + ); + let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap()); + let byte_size = src_ty.bytes() as u8; + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs)); + ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); + } else { + unimplemented!("bricmp with non-int type {:?}", src_ty); + } + } + + _ => panic!("unexpected branch opcode: {:?}", op0), + } + } else { + assert_eq!(branches.len(), 1); + + // Must be an unconditional branch or trap. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + ctx.emit(Inst::jmp_known(targets[0])); + } + + Opcode::BrTable => { + let jt_size = targets.len() - 1; + assert!(jt_size <= u32::max_value() as usize); + let jt_size = jt_size as u32; + + let idx = extend_input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ExtSpec::ZeroExtendTo32, + ); + + // Bounds-check (compute flags from idx - jt_size) and branch to default. 
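+ // (Only the flag-setting CMP is emitted here; the conditional jump to
+ // `default_target` for out-of-range indices is presumably emitted as part of
+ // `JmpTableSeq` below, which receives that target.)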
+ ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx)); + + // Emit the compound instruction that does: + // + // lea $jt, %rA + // movsbl [%rA, %rIndex, 2], %rB + // add %rB, %rA + // j *%rA + // [jt entries] + // + // This must be *one* instruction in the vcode because we cannot allow regalloc + // to insert any spills/fills in the middle of the sequence; otherwise, the + // lea PC-rel offset to the jumptable would be incorrect. (The alternative + // is to introduce a relocation pass for inlined jumptables, which is much + // worse.) + + // This temporary is used as a signed integer of 64-bits (to hold addresses). + let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64); + // This temporary is used as a signed integer of 32-bits (for the wasm-table + // index) and then 64-bits (address addend). The small lie about the I64 type + // is benign, since the temporary is dead after this instruction (and its + // Cranelift type is thus unused). + let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64); + + let targets_for_term: Vec<MachLabel> = targets.to_vec(); + let default_target = targets[0]; + + let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect(); + + ctx.emit(Inst::JmpTableSeq { + idx, + tmp1, + tmp2, + default_target, + targets: jt_targets, + targets_for_term, + }); + } + + _ => panic!("Unknown branch type {:?}", op), + } + } + + Ok(()) + } + + fn maybe_pinned_reg(&self) -> Option<Reg> { + Some(regs::pinned_reg()) + } +} |