Diffstat (limited to 'third_party/rust/cranelift-codegen/src/machinst')
11 files changed, 6727 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/machinst/abi.rs b/third_party/rust/cranelift-codegen/src/machinst/abi.rs new file mode 100644 index 0000000000..c72a81dcc2 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/abi.rs @@ -0,0 +1,233 @@ +//! ABI definitions. + +use crate::binemit::StackMap; +use crate::ir::StackSlot; +use crate::isa::CallConv; +use crate::machinst::*; +use crate::settings; + +use regalloc::{Reg, Set, SpillSlot, Writable}; + +/// Trait implemented by an object that tracks ABI-related state (e.g., stack +/// layout) and can generate code while emitting the *body* of a function. +pub trait ABICallee { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Does the ABI-body code need a temp reg? One will be provided to `init()` + /// as the `maybe_tmp` arg if so. + fn temp_needed(&self) -> bool; + + /// Initialize. This is called after the ABICallee is constructed because it + /// may be provided with a temp vreg, which can only be allocated once the + /// lowering context exists. + fn init(&mut self, maybe_tmp: Option<Writable<Reg>>); + + /// Accumulate outgoing arguments. This ensures that at least SIZE bytes + /// are allocated in the prologue to be available for use in function calls + /// to hold arguments and/or return values. If this function is called + /// multiple times, the maximum of all SIZE values will be available. + fn accumulate_outgoing_args_size(&mut self, size: u32); + + /// Get the settings controlling this function's compilation. + fn flags(&self) -> &settings::Flags; + + /// Get the calling convention implemented by this ABI object. + fn call_conv(&self) -> CallConv; + + /// Get the liveins of the function. + fn liveins(&self) -> Set<RealReg>; + + /// Get the liveouts of the function. + fn liveouts(&self) -> Set<RealReg>; + + /// Number of arguments. + fn num_args(&self) -> usize; + + /// Number of return values. + fn num_retvals(&self) -> usize; + + /// Number of stack slots (not spill slots). + fn num_stackslots(&self) -> usize; + + /// Generate an instruction which copies an argument to a destination + /// register. + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I; + + /// Is the given argument needed in the body (as opposed to, e.g., serving + /// only as a special ABI-specific placeholder)? This controls whether + /// lowering will copy it to a virtual reg for use by CLIF instructions. + fn arg_is_needed_in_body(&self, idx: usize) -> bool; + + /// Generate any setup instruction needed to save values to the + /// return-value area. This is usually used when there are multiple return + /// values or an otherwise large return value that must be passed on the + /// stack; typically the ABI specifies an extra hidden argument that is a + /// pointer to that memory. + fn gen_retval_area_setup(&self) -> Option<Self::I>; + + /// Generate an instruction which copies a source register to a return value slot. + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable<Reg>) -> Vec<Self::I>; + + /// Generate a return instruction. + fn gen_ret(&self) -> Self::I; + + /// Generate an epilogue placeholder. The returned instruction should return `true` from + /// `is_epilogue_placeholder()`; this is used to indicate to the lowering driver when + /// the epilogue should be inserted.
+ fn gen_epilogue_placeholder(&self) -> Self::I; + + // ----------------------------------------------------------------- + // Every function above this line may only be called pre-regalloc. + // Every function below this line may only be called post-regalloc. + // `spillslots()` must be called before any other post-regalloc + // function. + // ---------------------------------------------------------------- + + /// Update with the number of spillslots, post-regalloc. + fn set_num_spillslots(&mut self, slots: usize); + + /// Update with the clobbered registers, post-regalloc. + fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>); + + /// Get the address of a stackslot. + fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I; + + /// Load from a stackslot. + fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable<Reg>, + ) -> Self::I; + + /// Store to a stackslot. + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I; + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Self::I; + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I; + + /// Generate a stack map, given a list of spillslots and the emission state + /// at a given program point (prior to emission fo the safepointing + /// instruction). + fn spillslots_to_stack_map( + &self, + slots: &[SpillSlot], + state: &<Self::I as MachInstEmit>::State, + ) -> StackMap; + + /// Generate a prologue, post-regalloc. This should include any stack + /// frame or other setup necessary to use the other methods (`load_arg`, + /// `store_retval`, and spillslot accesses.) `self` is mutable so that we + /// can store information in it which will be useful when creating the + /// epilogue. + fn gen_prologue(&mut self) -> Vec<Self::I>; + + /// Generate an epilogue, post-regalloc. Note that this must generate the + /// actual return instruction (rather than emitting this in the lowering + /// logic), because the epilogue code comes before the return and the two are + /// likely closely related. + fn gen_epilogue(&self) -> Vec<Self::I>; + + /// Returns the full frame size for the given function, after prologue + /// emission has run. This comprises the spill slots and stack-storage slots + /// (but not storage for clobbered callee-save registers, arguments pushed + /// at callsites within this function, or other ephemeral pushes). This is + /// used for ABI variants where the client generates prologue/epilogue code, + /// as in Baldrdash (SpiderMonkey integration). + fn frame_size(&self) -> u32; + + /// Returns the size of arguments expected on the stack. + fn stack_args_size(&self) -> u32; + + /// Get the spill-slot size. + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32; + + /// Generate a spill. The type, if known, is given; this can be used to + /// generate a store instruction optimized for the particular type rather + /// than the RegClass (e.g., only F64 that resides in a V128 register). If + /// no type is given, the implementation should spill the whole register. + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Option<Type>) -> Self::I; + + /// Generate a reload (fill). As for spills, the type may be given to allow + /// a more optimized load instruction to be generated. 
+ fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + ty: Option<Type>, + ) -> Self::I; + + /// Desired unwind info type. + fn unwind_info_kind(&self) -> UnwindInfoKind; +} + +/// Trait implemented by an object that tracks ABI-related state and can +/// generate code while emitting a *call* to a function. +/// +/// An instance of this trait returns information for a *particular* +/// callsite. It will usually be computed from the called function's +/// signature. +/// +/// Unlike `ABICallee` above, methods on this trait are not invoked directly +/// by the machine-independent code. Rather, the machine-specific lowering +/// code will typically create an `ABICaller` when creating machine instructions +/// for an IR call instruction inside `lower()`, directly emit the arg and +/// retval copies, and attach the register use/def info to the call. +/// +/// This trait is thus provided for convenience to the backends. +pub trait ABICaller { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Get the number of arguments expected. + fn num_args(&self) -> usize; + + /// Emit a copy of an argument value from a source register, prior to the call. + fn emit_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + from_reg: Reg, + ); + + /// Emit a copy of a return value into a destination register, after the call returns. + fn emit_copy_retval_to_reg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + into_reg: Writable<Reg>, + ); + + /// Emit code to pre-adjust the stack, prior to argument copies and call. + fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Emit code to post-adjust the stack, after call return and return-value copies. + fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Accumulate outgoing arguments. This ensures that the caller (as + /// identified via the CTX argument) allocates enough space in the + /// prologue to hold all arguments and return values for this call. + /// There is no code emitted at the call site, everything is done + /// in the caller's function prologue. + fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C); + + /// Emit the call itself. + /// + /// The returned instruction should have proper use- and def-sets according + /// to the argument registers, return-value registers, and clobbered + /// registers for this function signature in this ABI. + /// + /// (Arg registers are uses, and retval registers are defs. Clobbered + /// registers are also logically defs, but should never be read; their + /// values are "defined" (to the regalloc) but "undefined" in every other + /// sense.) + /// + /// This function should only be called once, as it is allowed to re-use + /// parts of the ABICaller object in emitting instructions. + fn emit_call<C: LowerCtx<I = Self::I>>(&mut self, ctx: &mut C); +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs b/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs new file mode 100644 index 0000000000..e47379da37 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/abi_impl.rs @@ -0,0 +1,1320 @@ +//! Implementation of a vanilla ABI, shared between several machines. The +//! implementation here assumes that arguments will be passed in registers +//! first, then additional args on the stack; that the stack grows downward, +//!
contains a standard frame (return address and frame pointer), and the +//! compiler is otherwise free to allocate space below that with its choice of +//! layout; and that the machine has some notion of caller- and callee-save +//! registers. Most modern machines, e.g. x86-64 and AArch64, should fit this +//! mold and thus both of these backends use this shared implementation. +//! +//! See the documentation in specific machine backends for the "instantiation" +//! of this generic ABI, i.e., which registers are caller/callee-save, arguments +//! and return values, and any other special requirements. +//! +//! For now the implementation here assumes a 64-bit machine, but we intend to +//! make this 32/64-bit-generic shortly. +//! +//! # Vanilla ABI +//! +//! First, arguments and return values are passed in registers up to a certain +//! fixed count, after which they overflow onto the stack. Multiple return +//! values either fit in registers, or are returned in a separate return-value +//! area on the stack, given by a hidden extra parameter. +//! +//! Note that the exact stack layout is up to us. We settled on the +//! below design based on several requirements. In particular, we need to be +//! able to generate instructions (or instruction sequences) to access +//! arguments, stack slots, and spill slots before we know how many spill slots +//! or clobber-saves there will be, because of our pass structure. We also +//! prefer positive offsets to negative offsets because of an asymmetry in +//! some machines' addressing modes (e.g., on AArch64, positive offsets have a +//! larger possible range without a long-form sequence to synthesize an +//! arbitrary offset). Finally, it is not allowed to access memory below the +//! current SP value. +//! +//! We assume that a prologue first pushes the frame pointer (and return address +//! above that, if the machine does not do that in hardware). We set FP to point +//! to this two-word frame record. We store all other frame slots below this +//! two-word frame record, with the stack pointer remaining at or below this +//! fixed frame storage for the rest of the function. We can then access frame +//! storage slots using positive offsets from SP. In order to allow codegen for +//! the latter before knowing how many clobber-saves we have, and also allow it +//! while SP is being adjusted to set up a call, we implement a "nominal SP" +//! tracking feature by which a fixup (distance between actual SP and a +//! "nominal" SP) is known at each instruction. +//! +//! # Stack Layout +//! +//! The stack looks like: +//! +//! ```plain +//! (high address) +//! +//! +---------------------------+ +//! | ... | +//! | stack args | +//! | (accessed via FP) | +//! +---------------------------+ +//! SP at function entry -----> | return address | +//! +---------------------------+ +//! FP after prologue --------> | FP (pushed by prologue) | +//! +---------------------------+ +//! | ... | +//! | spill slots | +//! | (accessed via nominal SP) | +//! | ... | +//! | stack slots | +//! | (accessed via nominal SP) | +//! nominal SP ---------------> | (alloc'd by prologue) | +//! +---------------------------+ +//! | ... | +//! | clobbered callee-saves | +//! SP at end of prologue ----> | (pushed by prologue) | +//! +---------------------------+ +//! | [alignment as needed] | +//! | ... | +//! | args for call | +//! SP before making a call --> | (pushed at callsite) | +//! +---------------------------+ +//! +//! (low address) +//! ``` +//! +//! # Multi-value Returns +//! +//! 
Note that we support multi-value returns in two ways. First, we allow for +//! multiple return-value registers. Second, if the appropriate flag is set, we +//! support the SpiderMonkey Wasm ABI. For details of the multi-value return +//! ABI, see: +//! +//! https://searchfox.org/mozilla-central/rev/bc3600def806859c31b2c7ac06e3d69271052a89/js/src/wasm/WasmStubs.h#134 +//! +//! In brief: +//! - Return values are processed in *reverse* order. +//! - The first return value in this order (so the last return) goes into the +//! ordinary return register. +//! - Any further returns go in a struct-return area, allocated upwards (in +//! address order) during the reverse traversal. +//! - This struct-return area is provided by the caller, and a pointer to its +//! start is passed as an invisible last (extra) argument. Normally the caller +//! will allocate this area on the stack. When we generate calls, we place it +//! just above the on-stack argument area. +//! - So, for example, a function returning 4 i64's (v0, v1, v2, v3), with no +//! formal arguments, would: +//! - Accept a pointer `P` to the struct return area as a hidden argument in the +//! first argument register on entry. +//! - Return v3 in the one and only return-value register. +//! - Return v2 in memory at `[P]`. +//! - Return v1 in memory at `[P+8]`. +//! - Return v0 in memory at `[P+16]`. + +use super::abi::*; +use crate::binemit::StackMap; +use crate::ir::types::*; +use crate::ir::{ArgumentExtension, StackSlot}; +use crate::machinst::*; +use crate::settings; +use crate::CodegenResult; +use crate::{ir, isa}; +use alloc::vec::Vec; +use log::{debug, trace}; +use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; +use std::convert::TryFrom; +use std::marker::PhantomData; +use std::mem; + +/// A location for an argument or return value. +#[derive(Clone, Copy, Debug)] +pub enum ABIArg { + /// In a real register. + Reg( + RealReg, + ir::Type, + ir::ArgumentExtension, + ir::ArgumentPurpose, + ), + /// Arguments only: on stack, at given offset from SP at entry. + Stack(i64, ir::Type, ir::ArgumentExtension, ir::ArgumentPurpose), +} + +impl ABIArg { + /// Get the purpose of this arg. + fn get_purpose(self) -> ir::ArgumentPurpose { + match self { + ABIArg::Reg(_, _, _, purpose) => purpose, + ABIArg::Stack(_, _, _, purpose) => purpose, + } + } +} + +/// Are we computing information about arguments or return values? Much of the +/// handling is factored out into common routines; this enum allows us to +/// distinguish which case we're handling. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ArgsOrRets { + /// Arguments. + Args, + /// Return values. + Rets, +} + +/// Is an instruction returned by an ABI machine-specific backend a safepoint, +/// or not? +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InstIsSafepoint { + /// The instruction is a safepoint. + Yes, + /// The instruction is not a safepoint. + No, +} + +/// Abstract location for a machine-specific ABI impl to translate into the +/// appropriate addressing mode. +#[derive(Clone, Copy, Debug)] +pub enum StackAMode { + /// Offset from the frame pointer, possibly making use of a specific type + /// for a scaled indexing operation. + FPOffset(i64, ir::Type), + /// Offset from the nominal stack pointer, possibly making use of a specific + /// type for a scaled indexing operation. + NominalSPOffset(i64, ir::Type), + /// Offset from the real stack pointer, possibly making use of a specific + /// type for a scaled indexing operation.
+ SPOffset(i64, ir::Type), +} + +/// Trait implemented by machine-specific backend to provide information about +/// register assignments and to allow generating the specific instructions for +/// stack loads/saves, prologues/epilogues, etc. +pub trait ABIMachineSpec { + /// The instruction type. + type I: VCodeInst; + + /// Returns the number of bits in a word, that is 32/64 for 32/64-bit architecture. + fn word_bits() -> u32; + + /// Returns the number of bytes in a word. + fn word_bytes() -> u32 { + return Self::word_bits() / 8; + } + + /// Returns word-size integer type. + fn word_type() -> Type { + match Self::word_bits() { + 32 => I32, + 64 => I64, + _ => unreachable!(), + } + } + + /// Returns word register class. + fn word_reg_class() -> RegClass { + match Self::word_bits() { + 32 => RegClass::I32, + 64 => RegClass::I64, + _ => unreachable!(), + } + } + + /// Returns required stack alignment in bytes. + fn stack_align(call_conv: isa::CallConv) -> u32; + + /// Process a list of parameters or return values and allocate them to registers + /// and stack slots. + /// + /// Returns the list of argument locations, the stack-space used (rounded up + /// to as alignment requires), and if `add_ret_area_ptr` was passed, the + /// index of the extra synthetic arg that was added. + fn compute_arg_locs( + call_conv: isa::CallConv, + params: &[ir::AbiParam], + args_or_rets: ArgsOrRets, + add_ret_area_ptr: bool, + ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)>; + + /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return + /// address, and maybe other standard elements depending on ABI (e.g. Wasm TLS reg). + fn fp_to_arg_offset(call_conv: isa::CallConv, flags: &settings::Flags) -> i64; + + /// Generate a load from the stack. + fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I; + + /// Generate a store to the stack. + fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I; + + /// Generate a move. + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I; + + /// Generate an integer-extend operation. + fn gen_extend( + to_reg: Writable<Reg>, + from_reg: Reg, + is_signed: bool, + from_bits: u8, + to_bits: u8, + ) -> Self::I; + + /// Generate a return instruction. + fn gen_ret() -> Self::I; + + /// Generate an "epilogue placeholder" instruction, recognized by lowering + /// when using the Baldrdash ABI. + fn gen_epilogue_placeholder() -> Self::I; + + /// Generate an add-with-immediate. Note that even if this uses a scratch + /// register, it must satisfy two requirements: + /// + /// - The add-imm sequence must only clobber caller-save registers, because + /// it will be placed in the prologue before the clobbered callee-save + /// registers are saved. + /// + /// - The add-imm sequence must work correctly when `from_reg` and/or + /// `into_reg` are the register returned by `get_stacklimit_reg()`. + fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallVec<[Self::I; 4]>; + + /// Generate a sequence that traps with a `TrapCode::StackOverflow` code if + /// the stack pointer is less than the given limit register (assuming the + /// stack grows downward). + fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallVec<[Self::I; 2]>; + + /// Generate an instruction to compute an address of a stack slot (FP- or + /// SP-based offset). + fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I; + + /// Get a fixed register to use to compute a stack limit. 
This is needed for + /// certain sequences generated after the register allocator has already + /// run. This must satisfy two requirements: + /// + /// - It must be a caller-save register, because it will be clobbered in the + /// prologue before the clobbered callee-save registers are saved. + /// + /// - It must be safe to pass as an argument and/or destination to + /// `gen_add_imm()`. This is relevant when an addition with a large + /// immediate needs its own temporary; it cannot use the same fixed + /// temporary as this one. + fn get_stacklimit_reg() -> Reg; + + /// Generate a load from the given [base+offset] address. + fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Self::I; + + /// Generate a store to the given [base+offset] address. + fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I; + + /// Adjust the stack pointer up or down. + fn gen_sp_reg_adjust(amount: i32) -> SmallVec<[Self::I; 2]>; + + /// Generate a meta-instruction that adjusts the nominal SP offset. + fn gen_nominal_sp_adj(amount: i32) -> Self::I; + + /// Generate the usual frame-setup sequence for this architecture: e.g., + /// `push rbp / mov rbp, rsp` on x86-64, or `stp fp, lr, [sp, #-16]!` on + /// AArch64. + fn gen_prologue_frame_setup() -> SmallVec<[Self::I; 2]>; + + /// Generate the usual frame-restore sequence for this architecture. + fn gen_epilogue_frame_restore() -> SmallVec<[Self::I; 2]>; + + /// Generate a clobber-save sequence. This takes the list of *all* registers + /// written/modified by the function body. The implementation here is + /// responsible for determining which of these are callee-saved according to + /// the ABI. It should return a sequence of instructions that "push" or + /// otherwise save these values to the stack. The sequence of instructions + /// should adjust the stack pointer downward, and should align as necessary + /// according to ABI requirements. + /// + /// Returns stack bytes used as well as instructions. Does not adjust + /// nominal SP offset; caller will do that. + fn gen_clobber_save( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + outgoing_args_size: u32, + ) -> (u64, SmallVec<[Self::I; 16]>); + + /// Generate a clobber-restore sequence. This sequence should perform the + /// opposite of the clobber-save sequence generated above, assuming that SP + /// going into the sequence is at the same point that it was left when the + /// clobber-save sequence finished. + fn gen_clobber_restore( + call_conv: isa::CallConv, + flags: &settings::Flags, + clobbers: &Set<Writable<RealReg>>, + fixed_frame_storage_size: u32, + outgoing_args_size: u32, + ) -> SmallVec<[Self::I; 16]>; + + /// Generate a call instruction/sequence. This method is provided one + /// temporary register to use to synthesize the called address, if needed. + fn gen_call( + dest: &CallDest, + uses: Vec<Reg>, + defs: Vec<Writable<Reg>>, + opcode: ir::Opcode, + tmp: Writable<Reg>, + callee_conv: isa::CallConv, + caller_conv: isa::CallConv, + ) -> SmallVec<[(InstIsSafepoint, Self::I); 2]>; + + /// Get the number of spillslots required for the given register-class and + /// type. + fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32; + + /// Get the current virtual-SP offset from an instruction-emission state.
+ fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64; + + /// Get the "nominal SP to FP" offset from an instruction-emission state. + fn get_nominal_sp_to_fp(s: &<Self::I as MachInstEmit>::State) -> i64; + + /// Get all caller-save registers, that is, registers that we expect + /// not to be saved across a call to a callee with the given ABI. + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>>; +} + +/// ABI information shared between body (callee) and caller. +struct ABISig { + /// Argument locations (regs or stack slots). Stack offsets are relative to + /// SP on entry to function. + args: Vec<ABIArg>, + /// Return-value locations. Stack offsets are relative to the return-area + /// pointer. + rets: Vec<ABIArg>, + /// Space on stack used to store arguments. + stack_arg_space: i64, + /// Space on stack used to store return values. + stack_ret_space: i64, + /// Index in `args` of the stack-return-value-area argument. + stack_ret_arg: Option<usize>, + /// Calling convention used. + call_conv: isa::CallConv, +} + +impl ABISig { + fn from_func_sig<M: ABIMachineSpec>(sig: &ir::Signature) -> CodegenResult<ABISig> { + // Compute args and retvals from signature. Handle retvals first, + // because we may need to add a return-area arg to the args. + let (rets, stack_ret_space, _) = M::compute_arg_locs( + sig.call_conv, + &sig.returns, + ArgsOrRets::Rets, + /* extra ret-area ptr = */ false, + )?; + let need_stack_return_area = stack_ret_space > 0; + let (args, stack_arg_space, stack_ret_arg) = M::compute_arg_locs( + sig.call_conv, + &sig.params, + ArgsOrRets::Args, + need_stack_return_area, + )?; + + trace!( + "ABISig: sig {:?} => args = {:?} rets = {:?} arg stack = {} ret stack = {} stack_ret_arg = {:?}", + sig, + args, + rets, + stack_arg_space, + stack_ret_space, + stack_ret_arg + ); + + Ok(ABISig { + args, + rets, + stack_arg_space, + stack_ret_space, + stack_ret_arg, + call_conv: sig.call_conv, + }) + } +} + +/// ABI object for a function body. +pub struct ABICalleeImpl<M: ABIMachineSpec> { + /// Signature: arg and retval regs. + sig: ABISig, + /// Offsets to each stackslot. + stackslots: Vec<u32>, + /// Total stack size of all stackslots. + stackslots_size: u32, + /// Stack size to be reserved for outgoing arguments. + outgoing_args_size: u32, + /// Clobbered registers, from regalloc. + clobbered: Set<Writable<RealReg>>, + /// Total number of spillslots, from regalloc. + spillslots: Option<usize>, + /// Storage allocated for the fixed part of the stack frame. This is + /// usually the same as the total frame size below, except in the case + /// of the baldrdash calling convention. + fixed_frame_storage_size: u32, + /// "Total frame size", as defined by "distance between FP and nominal SP". + /// Some items are pushed below nominal SP, so the function may actually use + /// more stack than this would otherwise imply. It is simply the initial + /// frame/allocation size needed for stackslots and spillslots. + total_frame_size: Option<u32>, + /// The register holding the return-area pointer, if needed. + ret_area_ptr: Option<Writable<Reg>>, + /// Calling convention this function expects. + call_conv: isa::CallConv, + /// The settings controlling this function's compilation. 
+ flags: settings::Flags, + /// Whether or not this function is a "leaf", meaning it calls no other + /// functions + is_leaf: bool, + /// If this function has a stack limit specified, then `Reg` is where the + /// stack limit will be located after the instructions specified have been + /// executed. + /// + /// Note that this is intended for insertion into the prologue, if + /// present. Also note that because the instructions here execute in the + /// prologue this happens after legalization/register allocation/etc so we + /// need to be extremely careful with each instruction. The instructions are + /// manually register-allocated and carefully only use caller-saved + /// registers and keep nothing live after this sequence of instructions. + stack_limit: Option<(Reg, Vec<M::I>)>, + + _mach: PhantomData<M>, +} + +fn get_special_purpose_param_register( + f: &ir::Function, + abi: &ABISig, + purpose: ir::ArgumentPurpose, +) -> Option<Reg> { + let idx = f.signature.special_param_index(purpose)?; + match abi.args[idx] { + ABIArg::Reg(reg, ..) => Some(reg.to_reg()), + ABIArg::Stack(..) => None, + } +} + +impl<M: ABIMachineSpec> ABICalleeImpl<M> { + /// Create a new body ABI instance. + pub fn new(f: &ir::Function, flags: settings::Flags) -> CodegenResult<Self> { + debug!("ABI: func signature {:?}", f.signature); + + let sig = ABISig::from_func_sig::<M>(&f.signature)?; + + let call_conv = f.signature.call_conv; + // Only these calling conventions are supported. + debug_assert!( + call_conv == isa::CallConv::SystemV + || call_conv == isa::CallConv::Fast + || call_conv == isa::CallConv::Cold + || call_conv.extends_baldrdash(), + "Unsupported calling convention: {:?}", + call_conv + ); + + // Compute stackslot locations and total stackslot size. + let mut stack_offset: u32 = 0; + let mut stackslots = vec![]; + for (stackslot, data) in f.stack_slots.iter() { + let off = stack_offset; + stack_offset += data.size; + let mask = M::word_bytes() - 1; + stack_offset = (stack_offset + mask) & !mask; + debug_assert_eq!(stackslot.as_u32() as usize, stackslots.len()); + stackslots.push(off); + } + + // Figure out what instructions, if any, will be needed to check the + // stack limit. This can either be specified as a special-purpose + // argument or as a global value which often calculates the stack limit + // from the arguments. + let stack_limit = + get_special_purpose_param_register(f, &sig, ir::ArgumentPurpose::StackLimit) + .map(|reg| (reg, Vec::new())) + .or_else(|| f.stack_limit.map(|gv| gen_stack_limit::<M>(f, &sig, gv))); + + Ok(Self { + sig, + stackslots, + stackslots_size: stack_offset, + outgoing_args_size: 0, + clobbered: Set::empty(), + spillslots: None, + fixed_frame_storage_size: 0, + total_frame_size: None, + ret_area_ptr: None, + call_conv, + flags, + is_leaf: f.is_leaf(), + stack_limit, + _mach: PhantomData, + }) + } + + /// Inserts instructions necessary for checking the stack limit into the + /// prologue. + /// + /// This function will generate instructions necessary for perform a stack + /// check at the header of a function. The stack check is intended to trap + /// if the stack pointer goes below a particular threshold, preventing stack + /// overflow in wasm or other code. The `stack_limit` argument here is the + /// register which holds the threshold below which we're supposed to trap. + /// This function is known to allocate `stack_size` bytes and we'll push + /// instructions onto `insts`. 
+ /// + /// Note that the instructions generated here are special because this is + /// happening so late in the pipeline (e.g. after register allocation). This + /// means that we need to do manual register allocation here and also be + /// careful to not clobber any callee-saved or argument registers. For now + /// this routine makes do with the `spilltmp_reg` as one temporary + /// register, and a second register of `tmp2` which is caller-saved. This + /// should be fine for us since no spills should happen in this sequence of + /// instructions, so our register won't get accidentally clobbered. + /// + /// No values can be live after the prologue, but in this case that's ok + /// because we just need to perform a stack check before progressing with + /// the rest of the function. + fn insert_stack_check(&self, stack_limit: Reg, stack_size: u32, insts: &mut Vec<M::I>) { + // With no explicit stack allocated we can just emit the simple check of + // the stack registers against the stack limit register, and trap if + // it's out of bounds. + if stack_size == 0 { + insts.extend(M::gen_stack_lower_bound_trap(stack_limit)); + return; + } + + // Note that the 32k stack size here is pretty special. See the + // documentation in x86/abi.rs for why this is here. The general idea is + // that we're protecting against overflow in the addition that happens + // below. + if stack_size >= 32 * 1024 { + insts.extend(M::gen_stack_lower_bound_trap(stack_limit)); + } + + // Add the `stack_size` to `stack_limit`, placing the result in + // `scratch`. + // + // Note though that `stack_limit`'s register may be the same as + // `scratch`. If our stack size doesn't fit into an immediate this + // means we need a second scratch register for loading the stack size + // into a register. + let scratch = Writable::from_reg(M::get_stacklimit_reg()); + insts.extend(M::gen_add_imm(scratch, stack_limit, stack_size).into_iter()); + insts.extend(M::gen_stack_lower_bound_trap(scratch.to_reg())); + } +} + +/// Generates the instructions necessary for the `gv` to be materialized into a +/// register. +/// +/// This function will return a register that will contain the result of +/// evaluating `gv`. It will also return any instructions necessary to calculate +/// the value of the register. +/// +/// Note that global values are typically lowered to instructions via the +/// standard legalization pass. Unfortunately though prologue generation happens +/// so late in the pipeline that we can't use these legalization passes to +/// generate the instructions for `gv`. As a result we duplicate some lowering +/// of `gv` here and support only some global values. This is similar to what +/// the x86 backend does for now, and hopefully this can be somewhat cleaned up +/// in the future too! +/// +/// Also note that this function will make use of `writable_spilltmp_reg()` as a +/// temporary register to store values in if necessary. Currently after we write +/// to this register there's guaranteed to be no spilled values between where +/// it's used, because we're not participating in register allocation anyway! 
+fn gen_stack_limit<M: ABIMachineSpec>( + f: &ir::Function, + abi: &ABISig, + gv: ir::GlobalValue, +) -> (Reg, Vec<M::I>) { + let mut insts = Vec::new(); + let reg = generate_gv::<M>(f, abi, gv, &mut insts); + return (reg, insts); +} + +fn generate_gv<M: ABIMachineSpec>( + f: &ir::Function, + abi: &ABISig, + gv: ir::GlobalValue, + insts: &mut Vec<M::I>, +) -> Reg { + match f.global_values[gv] { + // Return the direct register the vmcontext is in + ir::GlobalValueData::VMContext => { + get_special_purpose_param_register(f, abi, ir::ArgumentPurpose::VMContext) + .expect("no vmcontext parameter found") + } + // Load our base value into a register, then load from that register + // in to a temporary register. + ir::GlobalValueData::Load { + base, + offset, + global_type: _, + readonly: _, + } => { + let base = generate_gv::<M>(f, abi, base, insts); + let into_reg = Writable::from_reg(M::get_stacklimit_reg()); + insts.push(M::gen_load_base_offset( + into_reg, + base, + offset.into(), + M::word_type(), + )); + return into_reg.to_reg(); + } + ref other => panic!("global value for stack limit not supported: {}", other), + } +} + +/// Return a type either from an optional type hint, or if not, from the default +/// type associated with the given register's class. This is used to generate +/// loads/spills appropriately given the type of value loaded/stored (which may +/// be narrower than the spillslot). We usually have the type because the +/// regalloc usually provides the vreg being spilled/reloaded, and we know every +/// vreg's type. However, the regalloc *can* request a spill/reload without an +/// associated vreg when needed to satisfy a safepoint (which requires all +/// ref-typed values, even those in real registers in the original vcode, to be +/// in spillslots). +fn ty_from_ty_hint_or_reg_class<M: ABIMachineSpec>(r: Reg, ty: Option<Type>) -> Type { + match (ty, r.get_class()) { + // If the type is provided + (Some(t), _) => t, + // If no type is provided, this should be a register spill for a + // safepoint, so we only expect I32/I64 (integer) registers. + (None, rc) if rc == M::word_reg_class() => M::word_type(), + _ => panic!("Unexpected register class!"), + } +} + +impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> { + type I = M::I; + + fn temp_needed(&self) -> bool { + self.sig.stack_ret_arg.is_some() + } + + fn init(&mut self, maybe_tmp: Option<Writable<Reg>>) { + if self.sig.stack_ret_arg.is_some() { + assert!(maybe_tmp.is_some()); + self.ret_area_ptr = maybe_tmp; + } + } + + fn accumulate_outgoing_args_size(&mut self, size: u32) { + if size > self.outgoing_args_size { + self.outgoing_args_size = size; + } + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn call_conv(&self) -> isa::CallConv { + self.sig.call_conv + } + + fn liveins(&self) -> Set<RealReg> { + let mut set: Set<RealReg> = Set::empty(); + for &arg in &self.sig.args { + if let ABIArg::Reg(r, ..) = arg { + set.insert(r); + } + } + set + } + + fn liveouts(&self) -> Set<RealReg> { + let mut set: Set<RealReg> = Set::empty(); + for &ret in &self.sig.rets { + if let ABIArg::Reg(r, ..) 
= ret { + set.insert(r); + } + } + set + } + + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn num_retvals(&self) -> usize { + self.sig.rets.len() + } + + fn num_stackslots(&self) -> usize { + self.stackslots.len() + } + + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable<Reg>) -> Self::I { + match &self.sig.args[idx] { + // Extension mode doesn't matter (we're copying out, not in; we + // ignore high bits by convention). + &ABIArg::Reg(r, ty, ..) => M::gen_move(into_reg, r.to_reg(), ty), + &ABIArg::Stack(off, ty, ..) => M::gen_load_stack( + StackAMode::FPOffset(M::fp_to_arg_offset(self.call_conv, &self.flags) + off, ty), + into_reg, + ty, + ), + } + } + + fn arg_is_needed_in_body(&self, idx: usize) -> bool { + match self.sig.args[idx].get_purpose() { + // Special Baldrdash-specific pseudo-args that are present only to + // fill stack slots. Won't ever be used as ordinary values in the + // body. + ir::ArgumentPurpose::CalleeTLS | ir::ArgumentPurpose::CallerTLS => false, + _ => true, + } + } + + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Writable<Reg>) -> Vec<Self::I> { + let mut ret = Vec::new(); + let word_bits = M::word_bits() as u8; + match &self.sig.rets[idx] { + &ABIArg::Reg(r, ty, ext, ..) => { + let from_bits = ty_bits(ty) as u8; + let dest_reg = Writable::from_reg(r.to_reg()); + match (ext, from_bits) { + (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) + if n < word_bits => + { + let signed = ext == ArgumentExtension::Sext; + ret.push(M::gen_extend( + dest_reg, + from_reg.to_reg(), + signed, + from_bits, + /* to_bits = */ word_bits, + )); + } + _ => ret.push(M::gen_move(dest_reg, from_reg.to_reg(), ty)), + }; + } + &ABIArg::Stack(off, mut ty, ext, ..) => { + let from_bits = ty_bits(ty) as u8; + // A machine ABI implementation should ensure that stack frames + // have "reasonable" size. All current ABIs for machinst + // backends (aarch64 and x64) enforce a 128MB limit. + let off = i32::try_from(off) + .expect("Argument stack offset greater than 2GB; should hit impl limit first"); + // Trash the from_reg; it should be its last use. + match (ext, from_bits) { + (ArgumentExtension::Uext, n) | (ArgumentExtension::Sext, n) + if n < word_bits => + { + assert_eq!(M::word_reg_class(), from_reg.to_reg().get_class()); + let signed = ext == ArgumentExtension::Sext; + ret.push(M::gen_extend( + from_reg, + from_reg.to_reg(), + signed, + from_bits, + /* to_bits = */ word_bits, + )); + // Store the extended version. + ty = M::word_type(); + } + _ => {} + }; + ret.push(M::gen_store_base_offset( + self.ret_area_ptr.unwrap().to_reg(), + off, + from_reg.to_reg(), + ty, + )); + } + } + ret + } + + fn gen_retval_area_setup(&self) -> Option<Self::I> { + if let Some(i) = self.sig.stack_ret_arg { + let inst = self.gen_copy_arg_to_reg(i, self.ret_area_ptr.unwrap()); + trace!( + "gen_retval_area_setup: inst {:?}; ptr reg is {:?}", + inst, + self.ret_area_ptr.unwrap().to_reg() + ); + Some(inst) + } else { + trace!("gen_retval_area_setup: not needed"); + None + } + } + + fn gen_ret(&self) -> Self::I { + M::gen_ret() + } + + fn gen_epilogue_placeholder(&self) -> Self::I { + M::gen_epilogue_placeholder() + } + + fn set_num_spillslots(&mut self, slots: usize) { + self.spillslots = Some(slots); + } + + fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>) { + self.clobbered = clobbered; + } + + /// Load from a stackslot. 
+ fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable<Reg>, + ) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + trace!("load_stackslot: slot {} -> sp_off {}", slot, sp_off); + M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + } + + /// Store to a stackslot. + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + trace!("store_stackslot: slot {} -> sp_off {}", slot, sp_off); + M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + } + + /// Produce an instruction that computes a stackslot address. + fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I { + // Offset from beginning of stackslot area, which is at nominal SP (see + // [MemArg::NominalSPOffset] for more details on nominal SP tracking). + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let sp_off: i64 = stack_off + (offset as i64); + M::gen_get_stack_addr(StackAMode::NominalSPOffset(sp_off, I8), into_reg, I8) + } + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable<Reg>) -> Self::I { + // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. + let islot = slot.get() as i64; + let spill_off = islot * M::word_bytes() as i64; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + M::gen_load_stack(StackAMode::NominalSPOffset(sp_off, ty), into_reg, ty) + } + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I { + // Offset from beginning of spillslot area, which is at nominal SP + stackslots_size. 
+ let islot = slot.get() as i64; + let spill_off = islot * M::word_bytes() as i64; + let sp_off = self.stackslots_size as i64 + spill_off; + trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off); + M::gen_store_stack(StackAMode::NominalSPOffset(sp_off, ty), from_reg, ty) + } + + fn spillslots_to_stack_map( + &self, + slots: &[SpillSlot], + state: &<Self::I as MachInstEmit>::State, + ) -> StackMap { + let virtual_sp_offset = M::get_virtual_sp_offset_from_state(state); + let nominal_sp_to_fp = M::get_nominal_sp_to_fp(state); + assert!(virtual_sp_offset >= 0); + trace!( + "spillslots_to_stackmap: slots = {:?}, state = {:?}", + slots, + state + ); + let map_size = (virtual_sp_offset + nominal_sp_to_fp) as u32; + let bytes = M::word_bytes(); + let map_words = (map_size + bytes - 1) / bytes; + let mut bits = std::iter::repeat(false) + .take(map_words as usize) + .collect::<Vec<bool>>(); + + let first_spillslot_word = + ((self.stackslots_size + virtual_sp_offset as u32) / bytes) as usize; + for &slot in slots { + let slot = slot.get() as usize; + bits[first_spillslot_word + slot] = true; + } + + StackMap::from_slice(&bits[..]) + } + + fn gen_prologue(&mut self) -> Vec<Self::I> { + let mut insts = vec![]; + if !self.call_conv.extends_baldrdash() { + // set up frame + insts.extend(M::gen_prologue_frame_setup().into_iter()); + } + + let bytes = M::word_bytes(); + let mut total_stacksize = self.stackslots_size + bytes * self.spillslots.unwrap() as u32; + if self.call_conv.extends_baldrdash() { + debug_assert!( + !self.flags.enable_probestack(), + "baldrdash does not expect cranelift to emit stack probes" + ); + total_stacksize += self.flags.baldrdash_prologue_words() as u32 * bytes; + } + let mask = M::stack_align(self.call_conv) - 1; + let total_stacksize = (total_stacksize + mask) & !mask; // 16-align the stack. + + if !self.call_conv.extends_baldrdash() { + // Leaf functions with zero stack don't need a stack check if one's + // specified, otherwise always insert the stack check. + if total_stacksize > 0 || !self.is_leaf { + if let Some((reg, stack_limit_load)) = &self.stack_limit { + insts.extend_from_slice(stack_limit_load); + self.insert_stack_check(*reg, total_stacksize, &mut insts); + } + } + if total_stacksize > 0 { + self.fixed_frame_storage_size += total_stacksize; + } + } + + // N.B.: "nominal SP", which we use to refer to stackslots and + // spillslots, is defined to be equal to the stack pointer at this point + // in the prologue. + // + // If we push any clobbers below, we emit a virtual-SP adjustment + // meta-instruction so that the nominal SP references behave as if SP + // were still at this point. See documentation for + // [crate::machinst::abi_impl](this module) for more details on + // stackframe layout and nominal SP maintenance. + + // Save clobbered registers. + let (clobber_size, clobber_insts) = M::gen_clobber_save( + self.call_conv, + &self.flags, + &self.clobbered, + self.fixed_frame_storage_size, + self.outgoing_args_size, + ); + insts.extend(clobber_insts); + + let sp_adj = self.outgoing_args_size as i32 + clobber_size as i32; + if sp_adj > 0 { + insts.push(M::gen_nominal_sp_adj(sp_adj)); + } + + self.total_frame_size = Some(total_stacksize); + insts + } + + fn gen_epilogue(&self) -> Vec<M::I> { + let mut insts = vec![]; + + // Restore clobbered registers. 
+ insts.extend(M::gen_clobber_restore( + self.call_conv, + &self.flags, + &self.clobbered, + self.fixed_frame_storage_size, + self.outgoing_args_size, + )); + + // N.B.: we do *not* emit a nominal SP adjustment here, because (i) there will be no + // references to nominal SP offsets before the return below, and (ii) the instruction + // emission tracks running SP offset linearly (in straight-line order), not according to + // the CFG, so early returns in the middle of function bodies would cause an incorrect + // offset for the rest of the body. + + if !self.call_conv.extends_baldrdash() { + insts.extend(M::gen_epilogue_frame_restore()); + insts.push(M::gen_ret()); + } + + debug!("Epilogue: {:?}", insts); + insts + } + + fn frame_size(&self) -> u32 { + self.total_frame_size + .expect("frame size not computed before prologue generation") + } + + fn stack_args_size(&self) -> u32 { + self.sig.stack_arg_space as u32 + } + + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 { + M::get_number_of_spillslots_for_value(rc, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Option<Type>) -> Self::I { + let ty = ty_from_ty_hint_or_reg_class::<M>(from_reg.to_reg(), ty); + self.store_spillslot(to_slot, ty, from_reg.to_reg()) + } + + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + ty: Option<Type>, + ) -> Self::I { + let ty = ty_from_ty_hint_or_reg_class::<M>(to_reg.to_reg().to_reg(), ty); + self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg())) + } + + fn unwind_info_kind(&self) -> UnwindInfoKind { + match self.sig.call_conv { + #[cfg(feature = "unwind")] + isa::CallConv::Fast | isa::CallConv::Cold | isa::CallConv::SystemV => { + UnwindInfoKind::SystemV + } + #[cfg(feature = "unwind")] + isa::CallConv::WindowsFastcall => UnwindInfoKind::Windows, + _ => UnwindInfoKind::None, + } + } +} + +fn abisig_to_uses_and_defs<M: ABIMachineSpec>(sig: &ABISig) -> (Vec<Reg>, Vec<Writable<Reg>>) { + // Compute uses: all arg regs. + let mut uses = Vec::new(); + for arg in &sig.args { + match arg { + &ABIArg::Reg(reg, ..) => uses.push(reg.to_reg()), + _ => {} + } + } + + // Compute defs: all retval regs, and all caller-save (clobbered) regs. + let mut defs = M::get_regs_clobbered_by_call(sig.call_conv); + for ret in &sig.rets { + match ret { + &ABIArg::Reg(reg, ..) => defs.push(Writable::from_reg(reg.to_reg())), + _ => {} + } + } + + (uses, defs) +} + +/// ABI object for a callsite. +pub struct ABICallerImpl<M: ABIMachineSpec> { + /// The called function's signature. + sig: ABISig, + /// All uses for the callsite, i.e., function args. + uses: Vec<Reg>, + /// All defs for the callsite, i.e., return values and caller-saves. + defs: Vec<Writable<Reg>>, + /// Call destination. + dest: CallDest, + /// Actual call opcode; used to distinguish various types of calls. + opcode: ir::Opcode, + /// Caller's calling convention. + caller_conv: isa::CallConv, + + _mach: PhantomData<M>, +} + +/// Destination for a call. +#[derive(Debug, Clone)] +pub enum CallDest { + /// Call to an ExtName (named function symbol). + ExtName(ir::ExternalName, RelocDistance), + /// Indirect call to a function pointer in a register. + Reg(Reg), +} + +impl<M: ABIMachineSpec> ABICallerImpl<M> { + /// Create a callsite ABI object for a call directly to the specified function. 
+ pub fn from_func( + sig: &ir::Signature, + extname: &ir::ExternalName, + dist: RelocDistance, + caller_conv: isa::CallConv, + ) -> CodegenResult<ABICallerImpl<M>> { + let sig = ABISig::from_func_sig::<M>(sig)?; + let (uses, defs) = abisig_to_uses_and_defs::<M>(&sig); + Ok(ABICallerImpl { + sig, + uses, + defs, + dest: CallDest::ExtName(extname.clone(), dist), + opcode: ir::Opcode::Call, + caller_conv, + _mach: PhantomData, + }) + } + + /// Create a callsite ABI object for a call to a function pointer with the + /// given signature. + pub fn from_ptr( + sig: &ir::Signature, + ptr: Reg, + opcode: ir::Opcode, + caller_conv: isa::CallConv, + ) -> CodegenResult<ABICallerImpl<M>> { + let sig = ABISig::from_func_sig::<M>(sig)?; + let (uses, defs) = abisig_to_uses_and_defs::<M>(&sig); + Ok(ABICallerImpl { + sig, + uses, + defs, + dest: CallDest::Reg(ptr), + opcode, + caller_conv, + _mach: PhantomData, + }) + } +} + +fn adjust_stack_and_nominal_sp<M: ABIMachineSpec, C: LowerCtx<I = M::I>>( + ctx: &mut C, + off: i32, + is_sub: bool, +) { + if off == 0 { + return; + } + let amt = if is_sub { -off } else { off }; + for inst in M::gen_sp_reg_adjust(amt) { + ctx.emit(inst); + } + ctx.emit(M::gen_nominal_sp_adj(-amt)); +} + +impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> { + type I = M::I; + + fn num_args(&self) -> usize { + if self.sig.stack_ret_arg.is_some() { + self.sig.args.len() - 1 + } else { + self.sig.args.len() + } + } + + fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + ctx.abi().accumulate_outgoing_args_size(off as u32); + } + + fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ true) + } + + fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) { + let off = self.sig.stack_arg_space + self.sig.stack_ret_space; + adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ false) + } + + fn emit_copy_reg_to_arg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + from_reg: Reg, + ) { + let word_rc = M::word_reg_class(); + let word_bits = M::word_bits() as usize; + match &self.sig.args[idx] { + &ABIArg::Reg(reg, ty, ext, _) + if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits => + { + assert_eq!(word_rc, reg.get_class()); + let signed = match ext { + ir::ArgumentExtension::Uext => false, + ir::ArgumentExtension::Sext => true, + _ => unreachable!(), + }; + ctx.emit(M::gen_extend( + Writable::from_reg(reg.to_reg()), + from_reg, + signed, + ty_bits(ty) as u8, + word_bits as u8, + )); + } + &ABIArg::Reg(reg, ty, _, _) => { + ctx.emit(M::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty)); + } + &ABIArg::Stack(off, mut ty, ext, _) => { + if ext != ir::ArgumentExtension::None && ty_bits(ty) < word_bits { + assert_eq!(word_rc, from_reg.get_class()); + let signed = match ext { + ir::ArgumentExtension::Uext => false, + ir::ArgumentExtension::Sext => true, + _ => unreachable!(), + }; + // Extend in place in the source register. Our convention is to + // treat high bits as undefined for values in registers, so this + // is safe, even for an argument that is nominally read-only. + ctx.emit(M::gen_extend( + Writable::from_reg(from_reg), + from_reg, + signed, + ty_bits(ty) as u8, + word_bits as u8, + )); + // Store the extended version. 
+ ty = M::word_type(); + } + ctx.emit(M::gen_store_stack( + StackAMode::SPOffset(off, ty), + from_reg, + ty, + )); + } + } + } + + fn emit_copy_retval_to_reg<C: LowerCtx<I = Self::I>>( + &self, + ctx: &mut C, + idx: usize, + into_reg: Writable<Reg>, + ) { + match &self.sig.rets[idx] { + // Extension mode doesn't matter because we're copying out, not in, + // and we ignore high bits in our own registers by convention. + &ABIArg::Reg(reg, ty, _, _) => ctx.emit(M::gen_move(into_reg, reg.to_reg(), ty)), + &ABIArg::Stack(off, ty, _, _) => { + let ret_area_base = self.sig.stack_arg_space; + ctx.emit(M::gen_load_stack( + StackAMode::SPOffset(off + ret_area_base, ty), + into_reg, + ty, + )); + } + } + } + + fn emit_call<C: LowerCtx<I = Self::I>>(&mut self, ctx: &mut C) { + let (uses, defs) = ( + mem::replace(&mut self.uses, Default::default()), + mem::replace(&mut self.defs, Default::default()), + ); + let word_rc = M::word_reg_class(); + let word_type = M::word_type(); + if let Some(i) = self.sig.stack_ret_arg { + let rd = ctx.alloc_tmp(word_rc, word_type); + let ret_area_base = self.sig.stack_arg_space; + ctx.emit(M::gen_get_stack_addr( + StackAMode::SPOffset(ret_area_base, I8), + rd, + I8, + )); + self.emit_copy_reg_to_arg(ctx, i, rd.to_reg()); + } + let tmp = ctx.alloc_tmp(word_rc, word_type); + for (is_safepoint, inst) in M::gen_call( + &self.dest, + uses, + defs, + self.opcode, + tmp, + self.sig.call_conv, + self.caller_conv, + ) + .into_iter() + { + match is_safepoint { + InstIsSafepoint::Yes => ctx.emit_safepoint(inst), + InstIsSafepoint::No => ctx.emit(inst), + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/adapter.rs b/third_party/rust/cranelift-codegen/src/machinst/adapter.rs new file mode 100644 index 0000000000..43ff9e51a4 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/adapter.rs @@ -0,0 +1,140 @@ +//! Adapter for a `MachBackend` to implement the `TargetIsa` trait. + +use crate::binemit; +use crate::ir; +use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa}; +use crate::machinst::*; +use crate::regalloc::RegisterSet; +use crate::settings::Flags; + +#[cfg(feature = "testing_hooks")] +use crate::regalloc::RegDiversions; + +use core::any::Any; +use std::borrow::Cow; +use std::fmt; +use target_lexicon::Triple; + +/// A wrapper around a `MachBackend` that provides a `TargetIsa` impl. +pub struct TargetIsaAdapter { + backend: Box<dyn MachBackend + Send + Sync + 'static>, + triple: Triple, +} + +impl TargetIsaAdapter { + /// Create a new `TargetIsa` wrapper around a `MachBackend`. + pub fn new<B: MachBackend + Send + Sync + 'static>(backend: B) -> TargetIsaAdapter { + let triple = backend.triple(); + TargetIsaAdapter { + backend: Box::new(backend), + triple, + } + } +} + +impl fmt::Display for TargetIsaAdapter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("MachBackend") + .field("name", &self.backend.name()) + .field("triple", &self.backend.triple()) + .field("flags", &format!("{}", self.backend.flags())) + .finish() + } +} + +impl TargetIsa for TargetIsaAdapter { + fn name(&self) -> &'static str { + self.backend.name() + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &Flags { + self.backend.flags() + } + + fn register_info(&self) -> RegInfo { + // Called from function's Display impl, so we need a stub here. 
+ RegInfo { + banks: &[], + classes: &[], + } + } + + fn legal_encodings<'a>( + &'a self, + _func: &'a ir::Function, + _inst: &'a ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + panic!("Should not be called when new-style backend is available!") + } + + fn encode( + &self, + _func: &ir::Function, + _inst: &ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Result<Encoding, Legalize> { + panic!("Should not be called when new-style backend is available!") + } + + fn encoding_info(&self) -> EncInfo { + panic!("Should not be called when new-style backend is available!") + } + + fn legalize_signature(&self, _sig: &mut Cow<ir::Signature>, _current: bool) { + panic!("Should not be called when new-style backend is available!") + } + + fn regclass_for_abi_type(&self, _ty: ir::Type) -> RegClass { + panic!("Should not be called when new-style backend is available!") + } + + fn allocatable_registers(&self, _func: &ir::Function) -> RegisterSet { + panic!("Should not be called when new-style backend is available!") + } + + fn prologue_epilogue(&self, _func: &mut ir::Function) -> CodegenResult<()> { + panic!("Should not be called when new-style backend is available!") + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + _func: &ir::Function, + _inst: ir::Inst, + _divert: &mut RegDiversions, + _sink: &mut dyn binemit::CodeSink, + ) { + panic!("Should not be called when new-style backend is available!") + } + + /// Emit a whole function into memory. + fn emit_function_to_memory(&self, _func: &ir::Function, _sink: &mut binemit::MemoryCodeSink) { + panic!("Should not be called when new-style backend is available!") + } + + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + Some(&*self.backend) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_add_overflow_condition() + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_sub_overflow_condition() + } + + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + self.backend.create_systemv_cie() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs b/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs new file mode 100644 index 0000000000..1052d83858 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/blockorder.rs @@ -0,0 +1,624 @@ +//! Computation of basic block order in emitted code. +//! +//! This module handles the translation from CLIF BBs to VCode BBs. +//! +//! The basic idea is that we compute a sequence of "lowered blocks" that +//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit +//! block on *every* edge). Conceptually, the lowering pipeline wants to insert +//! moves for phi-nodes on every block-to-block transfer; these blocks always +//! conceptually exist, but may be merged with an "original" CLIF block (and +//! hence not actually exist; this is equivalent to inserting the blocks only on +//! critical edges). +//! +//! In other words, starting from a CFG like this (where each "CLIF block" and +//! "(edge N->M)" is a separate basic block): +//! +//! ```plain +//! +//! CLIF block 0 +//! / \ +//! (edge 0->1) (edge 0->2) +//! | | +//! CLIF block 1 CLIF block 2 +//! \ / +//! (edge 1->3) (edge 2->3) +//! \ / +//! CLIF block 3 +//! ``` +//! +//! We can produce a CFG of lowered blocks like so: +//! +//! ```plain +//! 
+--------------+ +//! | CLIF block 0 | +//! +--------------+ +//! / \ +//! +--------------+ +--------------+ +//! | (edge 0->1) | |(edge 0->2) | +//! | CLIF block 1 | | CLIF block 2 | +//! +--------------+ +--------------+ +//! \ / +//! +-----------+ +-----------+ +//! |(edge 1->3)| |(edge 2->3)| +//! +-----------+ +-----------+ +//! \ / +//! +------------+ +//! |CLIF block 3| +//! +------------+ +//! ``` +//! +//! (note that the edges into CLIF blocks 1 and 2 could be merged with those +//! blocks' original bodies, but the out-edges could not because for simplicity +//! in the successor-function definition, we only ever merge an edge onto one +//! side of an original CLIF block.) +//! +//! Each `LoweredBlock` names just an original CLIF block, an original CLIF +//! block prepended or appended with an edge block (never both, though), or just +//! an edge block. +//! +//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph +//! (never actually materialized, just defined by a "successors" function), and +//! compute the reverse postorder. +//! +//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for +//! example, consider any information about whether edge blocks will actually +//! have content, because this computation happens as part of lowering *before* +//! regalloc, and regalloc may or may not insert moves/spills/reloads on any +//! particular edge. But it works relatively well and is conceptually simple. +//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like +//! branch editing that in practice elides empty blocks and simplifies some of +//! the other redundancies that this scheme produces. + +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::ir::{Block, Function, Inst, Opcode}; +use crate::machinst::lower::visit_block_succs; +use crate::machinst::*; + +use log::debug; +use smallvec::SmallVec; + +/// Mapping from CLIF BBs to VCode BBs. +#[derive(Debug)] +pub struct BlockLoweringOrder { + /// Lowered blocks, in BlockIndex order. Each block is some combination of + /// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after; + /// see [LoweredBlock] for details. + lowered_order: Vec<LoweredBlock>, + /// Successors for all lowered blocks, in one serialized vector. Indexed by + /// the ranges in `lowered_succ_ranges`. + lowered_succs: Vec<(Inst, LoweredBlock)>, + /// BlockIndex values for successors for all lowered blocks, in the same + /// order as `lowered_succs`. + lowered_succ_indices: Vec<(Inst, BlockIndex)>, + /// Ranges in `lowered_succs` giving the successor lists for each lowered + /// block. Indexed by lowering-order index (`BlockIndex`). + lowered_succ_ranges: Vec<(usize, usize)>, + /// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that + /// some CLIF BBs may not be lowered; in particular, we skip unreachable + /// blocks. + orig_map: SecondaryMap<Block, Option<BlockIndex>>, +} + +/// The origin of a block in the lowered block-order: either an original CLIF +/// block, or an inserted edge-block, or a combination of the two if an edge is +/// non-critical. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum LoweredBlock { + /// Block in original CLIF, with no merged edge-blocks. + Orig { + /// Original CLIF block. + block: Block, + }, + /// Block in the original CLIF, plus edge-block to one succ (which is the + /// one successor of the original block). + OrigAndEdge { + /// The original CLIF block contained in this lowered block. 
+ block: Block, + /// The edge (jump) instruction transitioning from this block + /// to the next, i.e., corresponding to the included edge-block. This + /// will be an instruction in `block`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, + /// Block in the original CLIF, preceded by edge-block from one pred (which + /// is the one pred of the original block). + EdgeAndOrig { + /// The previous CLIF block, i.e., the edge block's predecessor. + pred: Block, + /// The edge (jump) instruction corresponding to the included + /// edge-block. This will be an instruction in `pred`. + edge_inst: Inst, + /// The original CLIF block included in this lowered block. + block: Block, + }, + /// Split critical edge between two CLIF blocks. This lowered block does not + /// correspond to any original CLIF blocks; it only serves as an insertion + /// point for work to happen on the transition from `pred` to `succ`. + Edge { + /// The predecessor CLIF block. + pred: Block, + /// The edge (jump) instruction corresponding to this edge's transition. + /// This will be an instruction in `pred`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, +} + +impl LoweredBlock { + /// The associated original (CLIF) block included in this lowered block, if + /// any. + pub fn orig_block(self) -> Option<Block> { + match self { + LoweredBlock::Orig { block, .. } + | LoweredBlock::OrigAndEdge { block, .. } + | LoweredBlock::EdgeAndOrig { block, .. } => Some(block), + LoweredBlock::Edge { .. } => None, + } + } + + /// The associated in-edge, if any. + pub fn in_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::EdgeAndOrig { + pred, + edge_inst, + block, + } => Some((pred, edge_inst, block)), + _ => None, + } + } + + /// the associated out-edge, if any. Also includes edge-only blocks. + pub fn out_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::OrigAndEdge { + block, + edge_inst, + succ, + } => Some((block, edge_inst, succ)), + LoweredBlock::Edge { + pred, + edge_inst, + succ, + } => Some((pred, edge_inst, succ)), + _ => None, + } + } +} + +impl BlockLoweringOrder { + /// Compute and return a lowered block order for `f`. + pub fn new(f: &Function) -> BlockLoweringOrder { + debug!("BlockLoweringOrder: function body {:?}", f); + + // Step 1: compute the in-edge and out-edge count of every block. + let mut block_in_count = SecondaryMap::with_default(0); + let mut block_out_count = SecondaryMap::with_default(0); + + // Cache the block successors to avoid re-examining branches below. + let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new(); + let mut block_succ_range = SecondaryMap::with_default((0, 0)); + let mut fallthrough_return_block = None; + for block in f.layout.blocks() { + let block_succ_start = block_succs.len(); + visit_block_succs(f, block, |inst, succ| { + block_out_count[block] += 1; + block_in_count[succ] += 1; + block_succs.push((inst, succ)); + }); + let block_succ_end = block_succs.len(); + block_succ_range[block] = (block_succ_start, block_succ_end); + + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode() == Opcode::Return { + // Implicit output edge for any return. + block_out_count[block] += 1; + } + if f.dfg[inst].opcode() == Opcode::FallthroughReturn { + // Fallthrough return block must come last. + debug_assert!(fallthrough_return_block == None); + fallthrough_return_block = Some(block); + } + } + } + // Implicit input edge for entry block. 
+ if let Some(entry) = f.layout.entry_block() { + block_in_count[entry] += 1; + } + + // Here we define the implicit CLIF-plus-edges graph. There are + // conceptually two such graphs: the original, with every edge explicit, + // and the merged one, with blocks (represented by `LoweredBlock` + // values) that contain original CLIF blocks, edges, or both. This + // function returns a lowered block's successors as per the latter, with + // consideration to edge-block merging. + // + // Note that there is a property of the block-merging rules below + // that is very important to ensure we don't miss any lowered blocks: + // any block in the implicit CLIF-plus-edges graph will *only* be + // included in one block in the merged graph. + // + // This, combined with the property that every edge block is reachable + // only from one predecessor (and hence cannot be reached by a DFS + // backedge), means that it is sufficient in our DFS below to track + // visited-bits per original CLIF block only, not per edge. This greatly + // simplifies the data structures (no need to keep a sparse hash-set of + // (block, block) tuples). + let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| { + let start_idx = ret.len(); + match block { + LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => { + // At an orig block; successors are always edge blocks, + // possibly with orig blocks following. + let range = block_succ_range[block]; + for &(edge_inst, succ) in &block_succs[range.0..range.1] { + if block_in_count[succ] == 1 { + ret.push(( + edge_inst, + LoweredBlock::EdgeAndOrig { + pred: block, + edge_inst, + block: succ, + }, + )); + } else { + ret.push(( + edge_inst, + LoweredBlock::Edge { + pred: block, + edge_inst, + succ, + }, + )); + } + } + } + LoweredBlock::Edge { + succ, edge_inst, .. + } + | LoweredBlock::OrigAndEdge { + succ, edge_inst, .. + } => { + // At an edge block; successors are always orig blocks, + // possibly with edge blocks following. + if block_out_count[succ] == 1 { + let range = block_succ_range[succ]; + // check if the one succ is a real CFG edge (vs. + // implicit return succ). + if range.1 - range.0 > 0 { + debug_assert!(range.1 - range.0 == 1); + let (succ_edge_inst, succ_succ) = block_succs[range.0]; + ret.push(( + edge_inst, + LoweredBlock::OrigAndEdge { + block: succ, + edge_inst: succ_edge_inst, + succ: succ_succ, + }, + )); + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } + } + let end_idx = ret.len(); + (start_idx, end_idx) + }; + + // Build the explicit LoweredBlock-to-LoweredBlock successors list. + let mut lowered_succs = vec![]; + let mut lowered_succ_indices = vec![]; + + // Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an + // explicit stack so we don't overflow the real stack with a deep DFS. + #[derive(Debug)] + struct StackEntry { + this: LoweredBlock, + succs: (usize, usize), // range in lowered_succs + cur_succ: usize, // index in lowered_succs + } + + let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new(); + let mut visited = FxHashSet::default(); + let mut postorder = vec![]; + if let Some(entry) = f.layout.entry_block() { + // FIXME(cfallin): we might be able to use OrigAndEdge. Find a way + // to not special-case the entry block here. 
+ let block = LoweredBlock::Orig { block: entry }; + visited.insert(block); + let range = compute_lowered_succs(&mut lowered_succs, block); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: block, + succs: range, + cur_succ: range.1, + }); + } + + let mut deferred_last = None; + while !stack.is_empty() { + let stack_entry = stack.last_mut().unwrap(); + let range = stack_entry.succs; + if stack_entry.cur_succ == range.0 { + let orig_block = stack_entry.this.orig_block(); + if orig_block.is_some() && orig_block == fallthrough_return_block { + deferred_last = Some((stack_entry.this, range)); + } else { + postorder.push((stack_entry.this, range)); + } + stack.pop(); + } else { + // Heuristic: chase the children in reverse. This puts the first + // successor block first in RPO, all other things being equal, + // which tends to prioritize loop backedges over out-edges, + // putting the edge-block closer to the loop body and minimizing + // live-ranges in linear instruction space. + let next = lowered_succs[stack_entry.cur_succ - 1].1; + stack_entry.cur_succ -= 1; + if visited.contains(&next) { + continue; + } + visited.insert(next); + let range = compute_lowered_succs(&mut lowered_succs, next); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: next, + succs: range, + cur_succ: range.1, + }); + } + } + + postorder.reverse(); + let mut rpo = postorder; + if let Some(d) = deferred_last { + rpo.push(d); + } + + // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps. + let mut lowered_order = vec![]; + let mut lowered_succ_ranges = vec![]; + let mut lb_to_bindex = FxHashMap::default(); + for (block, succ_range) in rpo.into_iter() { + lb_to_bindex.insert(block, lowered_order.len() as BlockIndex); + lowered_order.push(block); + lowered_succ_ranges.push(succ_range); + } + + let lowered_succ_indices = lowered_succs + .iter() + .map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap())) + .collect(); + + let mut orig_map = SecondaryMap::with_default(None); + for (i, lb) in lowered_order.iter().enumerate() { + let i = i as BlockIndex; + if let Some(b) = lb.orig_block() { + orig_map[b] = Some(i); + } + } + + let result = BlockLoweringOrder { + lowered_order, + lowered_succs, + lowered_succ_indices, + lowered_succ_ranges, + orig_map, + }; + debug!("BlockLoweringOrder: {:?}", result); + result + } + + /// Get the lowered order of blocks. + pub fn lowered_order(&self) -> &[LoweredBlock] { + &self.lowered_order[..] + } + + /// Get the successors for a lowered block, by index in `lowered_order()`'s + /// returned slice. Each successsor is paired with the edge-instruction + /// (branch) corresponding to this edge. + pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succs[range.0..range.1] + } + + /// Get the successor indices for a lowered block. + pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succ_indices[range.0..range.1] + } + + /// Get the lowered block index containing a CLIF block, if any. (May not be + /// present if the original CLIF block was unreachable.) 
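+ ///
+ /// A minimal usage sketch (illustrative only; `func` is assumed to be an
+ /// `ir::Function` built elsewhere, and the block is not compiled as a
+ /// doctest):
+ ///
+ /// ```ignore
+ /// let order = BlockLoweringOrder::new(&func);
+ /// for (i, lb) in order.lowered_order().iter().enumerate() {
+ ///     // Every reachable CLIF block is included in exactly one lowered block.
+ ///     if let Some(bb) = lb.orig_block() {
+ ///         assert_eq!(order.lowered_block_for_bb(bb), Some(i as BlockIndex));
+ ///     }
+ /// }
+ /// ```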
+ pub fn lowered_block_for_bb(&self, bb: Block) -> Option<BlockIndex> { + self.orig_map[bb] + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + + fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function { + assert!(n_blocks > 0); + + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + let blocks = (0..n_blocks) + .map(|i| { + let bb = func.dfg.make_block(); + assert!(bb.as_u32() == i as u32); + bb + }) + .collect::<Vec<_>>(); + + let arg0 = func.dfg.append_block_param(blocks[0], I32); + + let mut pos = FuncCursor::new(&mut func); + + let mut edge = 0; + for i in 0..n_blocks { + pos.insert_block(blocks[i]); + let mut succs = vec![]; + while edge < edges.len() && edges[edge].0 == i { + succs.push(edges[edge].1); + edge += 1; + } + if succs.len() == 0 { + pos.ins().return_(&[arg0]); + } else if succs.len() == 1 { + pos.ins().jump(blocks[succs[0]], &[]); + } else if succs.len() == 2 { + pos.ins().brnz(arg0, blocks[succs[0]], &[]); + pos.ins().jump(blocks[succs[1]], &[]); + } else { + panic!("Too many successors"); + } + } + + func + } + + #[test] + fn test_blockorder_diamond() { + let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 6); + + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + + assert!(order.lowered_order[2].orig_block().is_none()); + assert!(order.lowered_order[2].in_edge().is_none()); + assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[3].out_edge().is_none()); + + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[5].in_edge().is_none()); + assert!(order.lowered_order[5].out_edge().is_none()); + } + + #[test] + fn test_blockorder_critedge() { + // 0 + // / \ + // 1 2 + // / \ \ + // 3 4 | + // |\ _|____| + // | \/ | + // | /\ | + // 5 6 + // + // (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split) + // + let func = build_test_func( + 7, + &[ + (0, 1), + (0, 2), + (1, 3), + (1, 4), + (2, 5), + (3, 5), + (3, 6), + (4, 6), + ], + ); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 11); + println!("ordered = {:?}", order.lowered_order); + + // block 0 + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + 
assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + // edge 0->1 + block 1 + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + assert!(order.lowered_order[1].out_edge().is_none()); + + // edge 1->3 + block 3 + assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3); + assert!(order.lowered_order[2].out_edge().is_none()); + + // edge 3->5 + assert!(order.lowered_order[3].orig_block().is_none()); + assert!(order.lowered_order[3].in_edge().is_none()); + assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5); + + // edge 3->6 + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6); + + // edge 1->4 + block 4 + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4); + assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4); + assert!(order.lowered_order[5].out_edge().is_none()); + + // edge 4->6 + assert!(order.lowered_order[6].orig_block().is_none()); + assert!(order.lowered_order[6].in_edge().is_none()); + assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4); + assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6); + + // block 6 + assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6); + assert!(order.lowered_order[7].in_edge().is_none()); + assert!(order.lowered_order[7].out_edge().is_none()); + + // edge 0->2 + block 2 + assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[8].out_edge().is_none()); + + // edge 2->5 + assert!(order.lowered_order[9].orig_block().is_none()); + assert!(order.lowered_order[9].in_edge().is_none()); + assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5); + + // block 5 + assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5); + assert!(order.lowered_order[10].in_edge().is_none()); + assert!(order.lowered_order[10].out_edge().is_none()); + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/buffer.rs b/third_party/rust/cranelift-codegen/src/machinst/buffer.rs new file mode 100644 index 0000000000..b2187a9b68 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/buffer.rs @@ -0,0 +1,1813 @@ +//! In-memory representation of compiled machine code, with labels and fixups to +//! refer to those labels. Handles constant-pool island insertion and also +//! veneer insertion for out-of-range jumps. +//! +//! This code exists to solve three problems: +//! +//! - Branch targets for forward branches are not known until later, when we +//! emit code in a single pass through the instruction structs. +//! +//! - On many architectures, address references or offsets have limited range. +//! 
For example, on AArch64, conditional branches can only target code +/- 1MB +//! from the branch itself. +//! +//! - The lowering of control flow from the CFG-with-edges produced by +//! [BlockLoweringOrder](super::BlockLoweringOrder), combined with many empty +//! edge blocks when the register allocator does not need to insert any +//! spills/reloads/moves in edge blocks, results in many suboptimal branch +//! patterns. The lowering also pays no attention to block order, and so +//! two-target conditional forms (cond-br followed by uncond-br) can often by +//! avoided because one of the targets is the fallthrough. There are several +//! cases here where we can simplify to use fewer branches. +//! +//! This "buffer" implements a single-pass code emission strategy (with a later +//! "fixup" pass, but only through recorded fixups, not all instructions). The +//! basic idea is: +//! +//! - Emit branches as they are, including two-target (cond/uncond) compound +//! forms, but with zero offsets and optimistically assuming the target will be +//! in range. Record the "fixup" for later. Targets are denoted instead by +//! symbolic "labels" that are then bound to certain offsets in the buffer as +//! we emit code. (Nominally, there is a label at the start of every basic +//! block.) +//! +//! - As we do this, track the offset in the buffer at which the first label +//! reference "goes out of range". We call this the "deadline". If we reach the +//! deadline and we still have not bound the label to which an unresolved branch +//! refers, we have a problem! +//! +//! - To solve this problem, we emit "islands" full of "veneers". An island is +//! simply a chunk of code inserted in the middle of the code actually produced +//! by the emitter (e.g., vcode iterating over instruction structs). The emitter +//! has some awareness of this: it either asks for an island between blocks, so +//! it is not accidentally executed, or else it emits a branch around the island +//! when all other options fail (see `Inst::EmitIsland` meta-instruction). +//! +//! - A "veneer" is an instruction (or sequence of instructions) in an "island" +//! that implements a longer-range reference to a label. The idea is that, for +//! example, a branch with a limited range can branch to a "veneer" instead, +//! which is simply a branch in a form that can use a longer-range reference. On +//! AArch64, for example, conditionals have a +/- 1 MB range, but a conditional +//! can branch to an unconditional branch which has a +/- 128 MB range. Hence, a +//! conditional branch's label reference can be fixed up with a "veneer" to +//! achieve a longer range. +//! +//! - To implement all of this, we require the backend to provide a `LabelUse` +//! type that implements a trait. This is nominally an enum that records one of +//! several kinds of references to an offset in code -- basically, a relocation +//! type -- and will usually correspond to different instruction formats. The +//! `LabelUse` implementation specifies the maximum range, how to patch in the +//! actual label location when known, and how to generate a veneer to extend the +//! range. +//! +//! That satisfies label references, but we still may have suboptimal branch +//! patterns. To clean up the branches, we do a simple "peephole"-style +//! optimization on the fly. To do so, the emitter (e.g., `Inst::emit()`) +//! informs the buffer of branches in the code and, in the case of conditionals, +//! the code that would have been emitted to invert this branch's condition. We +//! 
track the "latest branches": these are branches that are contiguous up to +//! the current offset. (If any code is emitted after a branch, that branch or +//! run of contiguous branches is no longer "latest".) The latest branches are +//! those that we can edit by simply truncating the buffer and doing something +//! else instead. +//! +//! To optimize branches, we implement several simple rules, and try to apply +//! them to the "latest branches" when possible: +//! +//! - A branch with a label target, when that label is bound to the ending +//! offset of the branch (the fallthrough location), can be removed altogether, +//! because the branch would have no effect). +//! +//! - An unconditional branch that starts at a label location, and branches to +//! another label, results in a "label alias": all references to the label bound +//! *to* this branch instruction are instead resolved to the *target* of the +//! branch instruction. This effectively removes empty blocks that just +//! unconditionally branch to the next block. We call this "branch threading". +//! +//! - A conditional followed by an unconditional, when the conditional branches +//! to the unconditional's fallthrough, results in (i) the truncation of the +//! unconditional, (ii) the inversion of the condition's condition, and (iii) +//! replacement of the conditional's target (using the original target of the +//! unconditional). This is a fancy way of saying "we can flip a two-target +//! conditional branch's taken/not-taken targets if it works better with our +//! fallthrough". To make this work, the emitter actually gives the buffer +//! *both* forms of every conditional branch: the true form is emitted into the +//! buffer, and the "inverted" machine-code bytes are provided as part of the +//! branch-fixup metadata. +//! +//! - An unconditional B preceded by another unconditional P, when B's label(s) have +//! been redirected to target(B), can be removed entirely. This is an extension +//! of the branch-threading optimization, and is valid because if we know there +//! will be no fallthrough into this branch instruction (the prior instruction +//! is an unconditional jump), and if we know we have successfully redirected +//! all labels, then this branch instruction is unreachable. Note that this +//! works because the redirection happens before the label is ever resolved +//! (fixups happen at island emission time, at which point latest-branches are +//! cleared, or at the end of emission), so we are sure to catch and redirect +//! all possible paths to this instruction. +//! +//! # Branch-optimization Correctness +//! +//! The branch-optimization mechanism depends on a few data structures with +//! invariants, which are always held outside the scope of top-level public +//! methods: +//! +//! - The latest-branches list. Each entry describes a span of the buffer +//! (start/end offsets), the label target, the corresponding fixup-list entry +//! index, and the bytes (must be the same length) for the inverted form, if +//! conditional. The list of labels that are bound to the start-offset of this +//! branch is *complete* (if any label has a resolved offset equal to `start` +//! and is not an alias, it must appear in this list) and *precise* (no label +//! in this list can be bound to another offset). No label in this list should +//! be an alias. No two branch ranges can overlap, and branches are in +//! ascending-offset order. +//! +//! - The labels-at-tail list. This contains all MachLabels that have been bound +//! 
to (whose resolved offsets are equal to) the tail offset of the buffer. +//! No label in this list should be an alias. +//! +//! - The label_offsets array, containing the bound offset of a label or +//! UNKNOWN. No label can be bound at an offset greater than the current +//! buffer tail. +//! +//! - The label_aliases array, containing another label to which a label is +//! bound or UNKNOWN. A label's resolved offset is the resolved offset +//! of the label it is aliased to, if this is set. +//! +//! We argue below, at each method, how the invariants in these data structures +//! are maintained (grep for "Post-invariant"). +//! +//! Given these invariants, we argue why each optimization preserves execution +//! semantics below (grep for "Preserves execution semantics"). + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc, StackMap}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; +use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeConstant, VCodeConstants, VCodeInst}; +use crate::timing; +use cranelift_entity::{entity_impl, SecondaryMap}; + +use log::trace; +use smallvec::SmallVec; +use std::mem; +use std::string::String; + +/// A buffer of output to be produced, fixed up, and then emitted to a CodeSink +/// in bulk. +/// +/// This struct uses `SmallVec`s to support small-ish function bodies without +/// any heap allocation. As such, it will be several kilobytes large. This is +/// likely fine as long as it is stack-allocated for function emission then +/// thrown away; but beware if many buffer objects are retained persistently. +pub struct MachBuffer<I: VCodeInst> { + /// The buffer contents, as raw bytes. + data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// Any stack maps referring to this code. + stack_maps: SmallVec<[MachStackMap; 8]>, + /// The current source location in progress (after `start_srcloc()` and + /// before `end_srcloc()`). This is a (start_offset, src_loc) tuple. + cur_srcloc: Option<(CodeOffset, SourceLoc)>, + /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. + label_offsets: SmallVec<[CodeOffset; 16]>, + /// Label aliases: when one label points to an unconditional jump, and that + /// jump points to another label, we can redirect references to the first + /// label immediately to the second. + /// + /// Invariant: we don't have label-alias cycles. We ensure this by, + /// before setting label A to alias label B, resolving B's alias + /// target (iteratively until a non-aliased label); if B is already + /// aliased to A, then we cannot alias A back to B. + label_aliases: SmallVec<[MachLabel; 16]>, + /// Constants that must be emitted at some point. + pending_constants: SmallVec<[MachLabelConstant; 16]>, + /// Fixups that must be performed after all code is emitted. + fixup_records: SmallVec<[MachLabelFixup<I>; 16]>, + /// Current deadline at which all constants are flushed and all code labels + /// are extended by emitting long-range jumps in an island. 
This flush + /// should be rare (e.g., on AArch64, the shortest-range PC-rel references + /// are +/- 1MB for conditional jumps and load-literal instructions), so + /// it's acceptable to track a minimum and flush-all rather than doing more + /// detailed "current minimum" / sort-by-deadline trickery. + island_deadline: CodeOffset, + /// How many bytes are needed in the worst case for an island, given all + /// pending constants and fixups. + island_worst_case_size: CodeOffset, + /// Latest branches, to facilitate in-place editing for better fallthrough + /// behavior and empty-block removal. + latest_branches: SmallVec<[MachBranch; 4]>, + /// All labels at the current offset (emission tail). This is lazily + /// cleared: it is actually accurate as long as the current offset is + /// `labels_at_tail_off`, but if `cur_offset()` has grown larger, it should + /// be considered as empty. + /// + /// For correctness, this *must* be complete (i.e., the vector must contain + /// all labels whose offsets are resolved to the current tail), because we + /// rely on it to update labels when we truncate branches. + labels_at_tail: SmallVec<[MachLabel; 4]>, + /// The last offset at which `labels_at_tail` is valid. It is conceptually + /// always describing the tail of the buffer, but we do not clear + /// `labels_at_tail` eagerly when the tail grows, rather we lazily clear it + /// when the offset has grown past this (`labels_at_tail_off`) point. + /// Always <= `cur_offset()`. + labels_at_tail_off: CodeOffset, + /// Map used constants to their [MachLabel]. + constant_labels: SecondaryMap<VCodeConstant, MachLabel>, +} + +/// A `MachBuffer` once emission is completed: holds generated code and records, +/// without fixups. This allows the type to be independent of the backend. +pub struct MachBufferFinalized { + /// The buffer contents, as raw bytes. + pub data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// Any stack maps referring to this code. + stack_maps: SmallVec<[MachStackMap; 8]>, +} + +static UNKNOWN_LABEL_OFFSET: CodeOffset = 0xffff_ffff; +static UNKNOWN_LABEL: MachLabel = MachLabel(0xffff_ffff); + +/// A label refers to some offset in a `MachBuffer`. It may not be resolved at +/// the point at which it is used by emitted code; the buffer records "fixups" +/// for references to the label, and will come back and patch the code +/// appropriately when the label's location is eventually known. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MachLabel(u32); +entity_impl!(MachLabel); + +impl MachLabel { + /// Get a label for a block. (The first N MachLabels are always reseved for + /// the N blocks in the vcode.) + pub fn from_block(bindex: BlockIndex) -> MachLabel { + MachLabel(bindex) + } + + /// Get the numeric label index. + pub fn get(self) -> u32 { + self.0 + } + + /// Creates a string representing this label, for convenience. 
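+ ///
+ /// For example (purely illustrative), `MachLabel::from_block(3).to_string()`
+ /// produces `"label3"`.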
+ pub fn to_string(&self) -> String { + format!("label{}", self.0) + } +} + +impl Default for MachLabel { + fn default() -> Self { + UNKNOWN_LABEL + } +} + +/// A stack map extent, when creating a stack map. +pub enum StackMapExtent { + /// The stack map starts at this instruction, and ends after the number of upcoming bytes + /// (note: this is a code offset diff). + UpcomingBytes(CodeOffset), + + /// The stack map started at the given offset and ends at the current one. This helps + /// architectures where the instruction size has not a fixed length. + StartedAtOffset(CodeOffset), +} + +impl<I: VCodeInst> MachBuffer<I> { + /// Create a new section, known to start at `start_offset` and with a size limited to + /// `length_limit`. + pub fn new() -> MachBuffer<I> { + MachBuffer { + data: SmallVec::new(), + relocs: SmallVec::new(), + traps: SmallVec::new(), + call_sites: SmallVec::new(), + srclocs: SmallVec::new(), + stack_maps: SmallVec::new(), + cur_srcloc: None, + label_offsets: SmallVec::new(), + label_aliases: SmallVec::new(), + pending_constants: SmallVec::new(), + fixup_records: SmallVec::new(), + island_deadline: UNKNOWN_LABEL_OFFSET, + island_worst_case_size: 0, + latest_branches: SmallVec::new(), + labels_at_tail: SmallVec::new(), + labels_at_tail_off: 0, + constant_labels: SecondaryMap::new(), + } + } + + /// Debug-only: check invariants of labels and branch-records described + /// under "Branch-optimization Correctness" above. + #[cfg(debug)] + fn check_label_branch_invariants(&self) { + let cur_off = self.cur_offset(); + // Check that every entry in latest_branches has *correct* + // labels_at_this_branch lists. We do not check completeness because + // that would require building a reverse index, which is too slow even + // for a debug invariant check. + let mut last_end = 0; + for b in &self.latest_branches { + debug_assert!(b.start < b.end); + debug_assert!(b.end <= cur_off); + debug_assert!(b.start >= last_end); + last_end = b.end; + for &l in &b.labels_at_this_branch { + debug_assert_eq!(self.resolve_label_offset(l), b.start); + debug_assert_eq!(self.label_aliases[l.0 as usize], UNKNOWN_LABEL); + } + } + + // Check that every label is unresolved, or resolved at or before + // cur_offset. If at cur_offset, must be in `labels_at_tail`. + for (i, &off) in self.label_offsets.iter().enumerate() { + let label = MachLabel(i as u32); + debug_assert!(off == UNKNOWN_LABEL_OFFSET || off <= cur_off); + if off == cur_off { + debug_assert!( + self.labels_at_tail_off == cur_off && self.labels_at_tail.contains(&label) + ); + } + } + + // Check that every label in `labels_at_tail_off` is precise, i.e., + // resolves to the cur offset. + debug_assert!(self.labels_at_tail_off <= cur_off); + if self.labels_at_tail_off == cur_off { + for &l in &self.labels_at_tail { + debug_assert_eq!(self.resolve_label_offset(l), cur_off); + debug_assert_eq!(self.label_aliases[l.0 as usize], UNKNOWN_LABEL); + } + } + } + + #[cfg(not(debug))] + fn check_label_branch_invariants(&self) { + // Nothing. + } + + /// Current offset from start of buffer. + pub fn cur_offset(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Add a byte. + pub fn put1(&mut self, value: u8) { + trace!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + self.data.push(value); + + // Post-invariant: conceptual-labels_at_tail contains a complete and + // precise list of labels bound at `cur_offset()`. 
We have advanced + // `cur_offset()`, hence if it had been equal to `labels_at_tail_off` + // before, it is not anymore (and it cannot become equal, because + // `labels_at_tail_off` is always <= `cur_offset()`). Thus the list is + // conceptually empty (even though it is only lazily cleared). No labels + // can be bound at this new offset (by invariant on `label_offsets`). + // Hence the invariant holds. + } + + /// Add 2 bytes. + pub fn put2(&mut self, value: u16) { + trace!( + "MachBuffer: put 16-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add 4 bytes. + pub fn put4(&mut self, value: u32) { + trace!( + "MachBuffer: put 32-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add 8 bytes. + pub fn put8(&mut self, value: u64) { + trace!( + "MachBuffer: put 64-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + + // Post-invariant: as for `put1()`. + } + + /// Add a slice of bytes. + pub fn put_data(&mut self, data: &[u8]) { + trace!( + "MachBuffer: put data @ {}: len {}", + self.cur_offset(), + data.len() + ); + self.data.extend_from_slice(data); + + // Post-invariant: as for `put1()`. + } + + /// Reserve appended space and return a mutable slice referring to it. + pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { + trace!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + let off = self.data.len(); + let new_len = self.data.len() + len; + self.data.resize(new_len, 0); + &mut self.data[off..] + + // Post-invariant: as for `put1()`. + } + + /// Align up to the given alignment. + pub fn align_to(&mut self, align_to: CodeOffset) { + trace!("MachBuffer: align to {}", align_to); + assert!(align_to.is_power_of_two()); + while self.cur_offset() & (align_to - 1) != 0 { + self.put1(0); + } + + // Post-invariant: as for `put1()`. + } + + /// Allocate a `Label` to refer to some offset. May not be bound to a fixed + /// offset yet. + pub fn get_label(&mut self) -> MachLabel { + let l = self.label_offsets.len() as u32; + self.label_offsets.push(UNKNOWN_LABEL_OFFSET); + self.label_aliases.push(UNKNOWN_LABEL); + trace!("MachBuffer: new label -> {:?}", MachLabel(l)); + MachLabel(l) + + // Post-invariant: the only mutation is to add a new label; it has no + // bound offset yet, so it trivially satisfies all invariants. + } + + /// Reserve the first N MachLabels for blocks. + pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { + trace!("MachBuffer: first {} labels are for blocks", blocks); + debug_assert!(self.label_offsets.is_empty()); + self.label_offsets + .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); + self.label_aliases.resize(blocks as usize, UNKNOWN_LABEL); + + // Post-invariant: as for `get_label()`. + } + + /// Reserve the next N MachLabels for constants. + pub fn reserve_labels_for_constants(&mut self, constants: &VCodeConstants) { + trace!( + "MachBuffer: next {} labels are for constants", + constants.len() + ); + for c in constants.keys() { + self.constant_labels[c] = self.get_label(); + } + + // Post-invariant: as for `get_label()`. + } + + /// Retrieve the reserved label for a constant. 
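+ ///
+ /// A hedged usage sketch (illustrative only; `pcrel_kind` stands for some
+ /// backend-specific `I::LabelUse` value and `placeholder_word` for the
+ /// not-yet-patched instruction bytes; neither is defined in this module):
+ ///
+ /// ```ignore
+ /// let label = buffer.get_label_for_constant(constant);
+ /// // Record a reference to the constant's eventual location; the fixup is
+ /// // patched once the constant is emitted in an island or at finish time.
+ /// buffer.use_label_at_offset(buffer.cur_offset(), label, pcrel_kind);
+ /// buffer.put4(placeholder_word);
+ /// ```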
+ pub fn get_label_for_constant(&self, constant: VCodeConstant) -> MachLabel { + self.constant_labels[constant] + } + + /// Bind a label to the current offset. A label can only be bound once. + pub fn bind_label(&mut self, label: MachLabel) { + trace!( + "MachBuffer: bind label {:?} at offset {}", + label, + self.cur_offset() + ); + debug_assert_eq!(self.label_offsets[label.0 as usize], UNKNOWN_LABEL_OFFSET); + debug_assert_eq!(self.label_aliases[label.0 as usize], UNKNOWN_LABEL); + let offset = self.cur_offset(); + self.label_offsets[label.0 as usize] = offset; + self.lazily_clear_labels_at_tail(); + self.labels_at_tail.push(label); + + // Invariants hold: bound offset of label is <= cur_offset (in fact it + // is equal). If the `labels_at_tail` list was complete and precise + // before, it is still, because we have bound this label to the current + // offset and added it to the list (which contains all labels at the + // current offset). + + self.check_label_branch_invariants(); + self.optimize_branches(); + + // Post-invariant: by `optimize_branches()` (see argument there). + self.check_label_branch_invariants(); + } + + /// Lazily clear `labels_at_tail` if the tail offset has moved beyond the + /// offset that it applies to. + fn lazily_clear_labels_at_tail(&mut self) { + let offset = self.cur_offset(); + if offset > self.labels_at_tail_off { + self.labels_at_tail_off = offset; + self.labels_at_tail.clear(); + } + + // Post-invariant: either labels_at_tail_off was at cur_offset, and + // state is untouched, or was less than cur_offset, in which case the + // labels_at_tail list was conceptually empty, and is now actually + // empty. + } + + /// Resolve a label to an offset, if known. May return `UNKNOWN_LABEL_OFFSET`. + fn resolve_label_offset(&self, mut label: MachLabel) -> CodeOffset { + let mut iters = 0; + while self.label_aliases[label.0 as usize] != UNKNOWN_LABEL { + label = self.label_aliases[label.0 as usize]; + // To protect against an infinite loop (despite our assurances to + // ourselves that the invariants make this impossible), assert out + // after 1M iterations. The number of basic blocks is limited + // in most contexts anyway so this should be impossible to hit with + // a legitimate input. + iters += 1; + assert!(iters < 1_000_000, "Unexpected cycle in label aliases"); + } + self.label_offsets[label.0 as usize] + + // Post-invariant: no mutations. + } + + /// Emit a reference to the given label with the given reference type (i.e., + /// branch-instruction format) at the current offset. This is like a + /// relocation, but handled internally. + /// + /// This can be called before the branch is actually emitted; fixups will + /// not happen until an island is emitted or the buffer is finished. + pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { + trace!( + "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", + offset, + label, + kind + ); + + // Add the fixup, and update the worst-case island size based on a + // veneer for this label use. + self.fixup_records.push(MachLabelFixup { + label, + offset, + kind, + }); + if kind.supports_veneer() { + self.island_worst_case_size += kind.veneer_size(); + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + } + let deadline = offset + kind.max_pos_range(); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + + // Post-invariant: no mutations to branches/labels data structures. 
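+
+ // Illustrative note (numbers are examples only, not derived from any
+ // particular ISA definition in this file): with a label use whose
+ // `max_pos_range()` is 1 MB, a reference recorded at offset 0x1_0000
+ // yields `deadline = 0x1_0000 + 0x10_0000 = 0x11_0000`; if that is
+ // smaller than the current `island_deadline`, the deadline is tightened
+ // so that `island_needed()` will request an island (and, if necessary,
+ // a veneer) before the reference can go out of range.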
+ self.check_label_branch_invariants(); + } + + /// Inform the buffer of an unconditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + /// This must be called when the current offset is equal to `start`; i.e., + /// before actually emitting the branch. This implies that for a branch that + /// uses a label and is eligible for optimizations by the MachBuffer, the + /// proper sequence is: + /// + /// - Call `use_label_at_offset()` to emit the fixup record. + /// - Call `add_uncond_branch()` to make note of the branch. + /// - Emit the bytes for the branch's machine code. + /// + /// Additional requirement: no labels may be bound between `start` and `end` + /// (exclusive on both ends). + pub fn add_uncond_branch(&mut self, start: CodeOffset, end: CodeOffset, target: MachLabel) { + assert!(self.cur_offset() == start); + debug_assert!(end > start); + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + self.lazily_clear_labels_at_tail(); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted: None, + labels_at_this_branch: self.labels_at_tail.clone(), + }); + + // Post-invariant: we asserted branch start is current tail; the list of + // labels at branch is cloned from list of labels at current tail. + self.check_label_branch_invariants(); + } + + /// Inform the buffer of a conditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + /// + /// Additional requirement: no labels may be bound between `start` and `end` + /// (exclusive on both ends). + pub fn add_cond_branch( + &mut self, + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + inverted: &[u8], + ) { + assert!(self.cur_offset() == start); + debug_assert!(end > start); + assert!(!self.fixup_records.is_empty()); + debug_assert!(inverted.len() == (end - start) as usize); + let fixup = self.fixup_records.len() - 1; + let inverted = Some(SmallVec::from(inverted)); + self.lazily_clear_labels_at_tail(); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted, + labels_at_this_branch: self.labels_at_tail.clone(), + }); + + // Post-invariant: we asserted branch start is current tail; labels at + // branch list is cloned from list of labels at current tail. + self.check_label_branch_invariants(); + } + + fn truncate_last_branch(&mut self) { + self.lazily_clear_labels_at_tail(); + // Invariants hold at this point. + + let b = self.latest_branches.pop().unwrap(); + assert!(b.end == self.cur_offset()); + + // State: + // [PRE CODE] + // Offset b.start, b.labels_at_this_branch: + // [BRANCH CODE] + // cur_off, self.labels_at_tail --> + // (end of buffer) + self.data.truncate(b.start as usize); + self.fixup_records.truncate(b.fixup); + // State: + // [PRE CODE] + // cur_off, Offset b.start, b.labels_at_this_branch: + // (end of buffer) + // + // self.labels_at_tail --> (past end of buffer) + let cur_off = self.cur_offset(); + self.labels_at_tail_off = cur_off; + // State: + // [PRE CODE] + // cur_off, Offset b.start, b.labels_at_this_branch, + // self.labels_at_tail: + // (end of buffer) + // + // resolve_label_offset(l) for l in labels_at_tail: + // (past end of buffer) + + trace!( + "truncate_last_branch: truncated {:?}; off now {}", + b, + cur_off + ); + + // Fix up resolved label offsets for labels at tail. 
+ for &l in &self.labels_at_tail { + self.label_offsets[l.0 as usize] = cur_off; + } + // Old labels_at_this_branch are now at cur_off. + self.labels_at_tail + .extend(b.labels_at_this_branch.into_iter()); + + // Post-invariant: this operation is defined to truncate the buffer, + // which moves cur_off backward, and to move labels at the end of the + // buffer back to the start-of-branch offset. + // + // latest_branches satisfies all invariants: + // - it has no branches past the end of the buffer (branches are in + // order, we removed the last one, and we truncated the buffer to just + // before the start of that branch) + // - no labels were moved to lower offsets than the (new) cur_off, so + // the labels_at_this_branch list for any other branch need not change. + // + // labels_at_tail satisfies all invariants: + // - all labels that were at the tail after the truncated branch are + // moved backward to just before the branch, which becomes the new tail; + // thus every element in the list should remain (ensured by `.extend()` + // above). + // - all labels that refer to the new tail, which is the start-offset of + // the truncated branch, must be present. The `labels_at_this_branch` + // list in the truncated branch's record is a complete and precise list + // of exactly these labels; we append these to labels_at_tail. + // - labels_at_tail_off is at cur_off after truncation occurs, so the + // list is valid (not to be lazily cleared). + // + // The stated operation was performed: + // - For each label at the end of the buffer prior to this method, it + // now resolves to the new (truncated) end of the buffer: it must have + // been in `labels_at_tail` (this list is precise and complete, and + // the tail was at the end of the truncated branch on entry), and we + // iterate over this list and set `label_offsets` to the new tail. + // None of these labels could have been an alias (by invariant), so + // `label_offsets` is authoritative for each. + // - No other labels will be past the end of the buffer, because of the + // requirement that no labels be bound to the middle of branch ranges + // (see comments to `add_{cond,uncond}_branch()`). + // - The buffer is truncated to just before the last branch, and the + // fixup record referring to that last branch is removed. + self.check_label_branch_invariants(); + } + + fn optimize_branches(&mut self) { + self.lazily_clear_labels_at_tail(); + // Invariants valid at this point. + + trace!( + "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, + self.labels_at_tail, + self.fixup_records + ); + + // We continue to munch on branches at the tail of the buffer until no + // more rules apply. Note that the loop only continues if a branch is + // actually truncated (or if labels are redirected away from a branch), + // so this always makes progress. + while let Some(b) = self.latest_branches.last() { + let cur_off = self.cur_offset(); + trace!("optimize_branches: last branch {:?} at off {}", b, cur_off); + // If there has been any code emission since the end of the last branch or + // label definition, then there's nothing we can edit (because we + // don't move code once placed, only back up and overwrite), so + // clear the records and finish. + if b.end < cur_off { + break; + } + + // Invariant: we are looking at a branch that ends at the tail of + // the buffer. 
+
+ // For any branch, conditional or unconditional:
+ // - If the target is a label at the current offset, then remove
+ // the branch, and reset all labels that targeted
+ // the current offset (end of branch) to the truncated
+ // end-of-code.
+ //
+ // Preserves execution semantics: a branch to its own fallthrough
+ // address is equivalent to a no-op; in both cases, nextPC is the
+ // fallthrough.
+ if self.resolve_label_offset(b.target) == cur_off {
+ trace!("branch with target == cur off; truncating");
+ self.truncate_last_branch();
+ continue;
+ }
+
+ // If latest is an unconditional branch:
+ //
+ // - If the branch's target is not its own start address, then for
+ // each label at the start of the branch, make the label an alias of the
+ // branch target, and remove the label from the "labels at this
+ // branch" list.
+ //
+ // - Preserves execution semantics: an unconditional branch's
+ // only effect is to set PC to a new PC; this change simply
+ // collapses one step in the step-semantics.
+ //
+ // - Post-invariant: the labels that were bound to the start of
+ // this branch become aliases, so they must not be present in any
+ // labels-at-this-branch list or the labels-at-tail list. The
+ // labels are removed from the latest-branch record's
+ // labels-at-this-branch list, and are never placed in the
+ // labels-at-tail list. Furthermore, it is correct that they are
+ // not in either list, because they are now aliases, and labels
+ // that are aliases remain aliases forever.
+ //
+ // - If there is a prior unconditional branch that ends just before
+ // this one begins, and this branch has no labels bound to its
+ // start, then we can truncate this branch, because it is entirely
+ // unreachable (we have redirected all labels that make it
+ // reachable otherwise). Do so and continue around the loop.
+ //
+ // - Preserves execution semantics: the branch is unreachable,
+ // because execution can only flow into an instruction from the
+ // prior instruction's fallthrough or from a branch bound to that
+ // instruction's start offset. Unconditional branches have no
+ // fallthrough, so if the prior instruction is an unconditional
+ // branch, no fallthrough entry can happen. The
+ // labels-at-this-branch list is complete (by invariant), so if it
+ // is empty, then the instruction is entirely unreachable. Thus,
+ // it can be removed.
+ //
+ // - Post-invariant: ensured by truncate_last_branch().
+ //
+ // - If there is a prior conditional branch whose target label
+ // resolves to the current offset (branches around the
+ // unconditional branch), then remove the unconditional branch,
+ // and make the target of the unconditional the target of the
+ // conditional instead.
+ //
+ // - Preserves execution semantics: previously we had:
+ //
+ // L1:
+ // cond_br L2
+ // br L3
+ // L2:
+ // (end of buffer)
+ //
+ // by removing the last branch, we have:
+ //
+ // L1:
+ // cond_br L2
+ // L2:
+ // (end of buffer)
+ //
+ // we then fix up the records for the conditional branch to
+ // have:
+ //
+ // L1:
+ // cond_br.inverted L3
+ // L2:
+ //
+ // In the original code, control flow reaches L2 when the
+ // conditional branch's predicate is true, and L3 otherwise. In
+ // the optimized code, the same is true.
+ //
+ // - Post-invariant: all edits to latest_branches and
+ // labels_at_tail are performed by `truncate_last_branch()`,
+ // which maintains the invariants at each step.
+ + if b.is_uncond() { + // Set any label equal to current branch's start as an alias of + // the branch's target, if the target is not the branch itself + // (i.e., an infinite loop). + // + // We cannot perform this aliasing if the target of this branch + // ultimately aliases back here; if so, we need to keep this + // branch, so break out of this loop entirely (and clear the + // latest-branches list below). + // + // Note that this check is what prevents cycles from forming in + // `self.label_aliases`. To see why, consider an arbitrary start + // state: + // + // label_aliases[L1] = L2, label_aliases[L2] = L3, ..., up to + // Ln, which is not aliased. + // + // We would create a cycle if we assigned label_aliases[Ln] + // = L1. Note that the below assignment is the only write + // to label_aliases. + // + // By our other invariants, we have that Ln (`l` below) + // resolves to the offset `b.start`, because it is in the + // set `b.labels_at_this_branch`. + // + // If L1 were already aliased, through some arbitrarily deep + // chain, to Ln, then it must also resolve to this offset + // `b.start`. + // + // By checking the resolution of `L1` against this offset, + // and aborting this branch-simplification if they are + // equal, we prevent the below assignment from ever creating + // a cycle. + if self.resolve_label_offset(b.target) != b.start { + let redirected = b.labels_at_this_branch.len(); + for &l in &b.labels_at_this_branch { + trace!( + " -> label at start of branch {:?} redirected to target {:?}", + l, + b.target + ); + self.label_aliases[l.0 as usize] = b.target; + // NOTE: we continue to ensure the invariant that labels + // pointing to tail of buffer are in `labels_at_tail` + // because we already ensured above that the last branch + // cannot have a target of `cur_off`; so we never have + // to put the label into `labels_at_tail` when moving it + // here. + } + // Maintain invariant: all branches have been redirected + // and are no longer pointing at the start of this branch. + let mut_b = self.latest_branches.last_mut().unwrap(); + mut_b.labels_at_this_branch.clear(); + + if redirected > 0 { + trace!(" -> after label redirects, restarting loop"); + continue; + } + } else { + break; + } + + let b = self.latest_branches.last().unwrap(); + + // Examine any immediately preceding branch. + if self.latest_branches.len() > 1 { + let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; + trace!(" -> more than one branch; prev_b = {:?}", prev_b); + // This uncond is immediately after another uncond; we + // should have already redirected labels to this uncond away + // (but check to be sure); so we can truncate this uncond. + if prev_b.is_uncond() + && prev_b.end == b.start + && b.labels_at_this_branch.is_empty() + { + trace!(" -> uncond follows another uncond; truncating"); + self.truncate_last_branch(); + continue; + } + + // This uncond is immediately after a conditional, and the + // conditional's target is the end of this uncond, and we've + // already redirected labels to this uncond away; so we can + // truncate this uncond, flip the sense of the conditional, and + // set the conditional's target (in `latest_branches` and in + // `fixup_records`) to the uncond's target. 
+ if prev_b.is_cond() + && prev_b.end == b.start + && self.resolve_label_offset(prev_b.target) == cur_off + { + trace!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + // Save the target of the uncond (this becomes the + // target of the cond), and truncate the uncond. + let target = b.target; + let data = prev_b.inverted.clone().unwrap(); + self.truncate_last_branch(); + + // Mutate the code and cond branch. + let off_before_edit = self.cur_offset(); + let prev_b = self.latest_branches.last_mut().unwrap(); + let not_inverted = SmallVec::from( + &self.data[(prev_b.start as usize)..(prev_b.end as usize)], + ); + + // Low-level edit: replaces bytes of branch with + // inverted form. cur_off remains the same afterward, so + // we do not need to modify label data structures. + self.data.truncate(prev_b.start as usize); + self.data.extend_from_slice(&data[..]); + + // Save the original code as the inversion of the + // inverted branch, in case we later edit this branch + // again. + prev_b.inverted = Some(not_inverted); + self.fixup_records[prev_b.fixup].label = target; + trace!(" -> reassigning target of condbr to {:?}", target); + prev_b.target = target; + debug_assert_eq!(off_before_edit, self.cur_offset()); + continue; + } + } + } + + // If we couldn't do anything with the last branch, then break. + break; + } + + self.purge_latest_branches(); + + trace!( + "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, + self.labels_at_tail, + self.fixup_records + ); + } + + fn purge_latest_branches(&mut self) { + // All of our branch simplification rules work only if a branch ends at + // the tail of the buffer, with no following code; and branches are in + // order in latest_branches; so if the last entry ends prior to + // cur_offset, then clear all entries. + let cur_off = self.cur_offset(); + if let Some(l) = self.latest_branches.last() { + if l.end < cur_off { + trace!("purge_latest_branches: removing branch {:?}", l); + self.latest_branches.clear(); + } + } + + // Post-invariant: no invariant requires any branch to appear in + // `latest_branches`; it is always optional. The list-clear above thus + // preserves all semantics. + } + + /// Emit a constant at some point in the future, binding the given label to + /// its offset. The constant will be placed at most `max_distance` from the + /// current offset. + pub fn defer_constant( + &mut self, + label: MachLabel, + align: CodeOffset, + data: &[u8], + max_distance: CodeOffset, + ) { + trace!( + "defer_constant: eventually emit {} bytes aligned to {} at label {:?}", + data.len(), + align, + label + ); + let deadline = self.cur_offset().saturating_add(max_distance); + self.island_worst_case_size += data.len() as CodeOffset; + self.island_worst_case_size = + (self.island_worst_case_size + I::LabelUse::ALIGN - 1) & !(I::LabelUse::ALIGN - 1); + self.pending_constants.push(MachLabelConstant { + label, + align, + data: SmallVec::from(data), + }); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Is an island needed within the next N bytes? + pub fn island_needed(&self, distance: CodeOffset) -> bool { + let worst_case_end_of_island = self.cur_offset() + distance + self.island_worst_case_size; + worst_case_end_of_island > self.island_deadline + } + + /// Emit all pending constants and veneers. 
Should only be called if + /// `island_needed()` returns true, i.e., if we actually reach a deadline: + /// otherwise, unnecessary veneers may be inserted. + pub fn emit_island(&mut self) { + // We're going to purge fixups, so no latest-branch editing can happen + // anymore. + self.latest_branches.clear(); + + let pending_constants = mem::replace(&mut self.pending_constants, SmallVec::new()); + for MachLabelConstant { label, align, data } in pending_constants.into_iter() { + self.align_to(align); + self.bind_label(label); + self.put_data(&data[..]); + } + + let fixup_records = mem::replace(&mut self.fixup_records, SmallVec::new()); + let mut new_fixups = SmallVec::new(); + for MachLabelFixup { + label, + offset, + kind, + } in fixup_records.into_iter() + { + trace!( + "emit_island: fixup for label {:?} at offset {} kind {:?}", + label, + offset, + kind + ); + // We eagerly perform fixups whose label targets are known, if not out + // of range, to avoid unnecessary veneers. + let label_offset = self.resolve_label_offset(label); + let known = label_offset != UNKNOWN_LABEL_OFFSET; + let in_range = if known { + if label_offset >= offset { + (label_offset - offset) <= kind.max_pos_range() + } else { + (offset - label_offset) <= kind.max_neg_range() + } + } else { + false + }; + + trace!( + " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", + label_offset, + known, + in_range, + kind.max_pos_range(), + kind.max_neg_range() + ); + + let start = offset as usize; + let end = (offset + kind.patch_size()) as usize; + if in_range { + debug_assert!(known); // implied by in_range. + let slice = &mut self.data[start..end]; + trace!("patching in-range!"); + kind.patch(slice, offset, label_offset); + } else if !known && !kind.supports_veneer() { + // Nothing for now. Keep it for next round. + new_fixups.push(MachLabelFixup { + label, + offset, + kind, + }); + } else if !in_range && kind.supports_veneer() { + // Allocate space for a veneer in the island. + self.align_to(I::LabelUse::ALIGN); + let veneer_offset = self.cur_offset(); + trace!("making a veneer at {}", veneer_offset); + let slice = &mut self.data[start..end]; + // Patch the original label use to refer to the veneer. + trace!( + "patching original at offset {} to veneer offset {}", + offset, + veneer_offset + ); + kind.patch(slice, offset, veneer_offset); + // Generate the veneer. + let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); + let (veneer_fixup_off, veneer_label_use) = + kind.generate_veneer(veneer_slice, veneer_offset); + trace!( + "generated veneer; fixup offset {}, label_use {:?}", + veneer_fixup_off, + veneer_label_use + ); + // If the label is known (but was just out of range), do the + // veneer label-use fixup now too; otherwise, save it for later. + if known { + let start = veneer_fixup_off as usize; + let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; + let veneer_slice = &mut self.data[start..end]; + trace!("doing veneer fixup right away too"); + veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); + } else { + new_fixups.push(MachLabelFixup { + label, + offset: veneer_fixup_off, + kind: veneer_label_use, + }); + } + } else { + panic!( + "Cannot support label-use {:?} (known = {}, in-range = {})", + kind, known, in_range + ); + } + } + + self.fixup_records = new_fixups; + self.island_deadline = UNKNOWN_LABEL_OFFSET; + } + + /// Finish any deferred emissions and/or fixups. 
+ pub fn finish(mut self) -> MachBufferFinalized { + let _tt = timing::vcode_emit_finish(); + + while !self.pending_constants.is_empty() || !self.fixup_records.is_empty() { + // `emit_island()` will emit any pending veneers and constants, and + // as a side-effect, will also take care of any fixups with resolved + // labels eagerly. + self.emit_island(); + } + + // Ensure that all labels have been fixed up after the last island is emitted. This is a + // full (release-mode) assert because an unresolved label means the emitted code is + // incorrect. + assert!(self.fixup_records.is_empty()); + + MachBufferFinalized { + data: self.data, + relocs: self.relocs, + traps: self.traps, + call_sites: self.call_sites, + srclocs: self.srclocs, + stack_maps: self.stack_maps, + } + } + + /// Add an external relocation at the current offset. + pub fn add_reloc( + &mut self, + srcloc: SourceLoc, + kind: Reloc, + name: &ExternalName, + addend: Addend, + ) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + /// Add a trap record at the current offset. + pub fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + }); + } + + /// Add a call-site record at the current offset. + pub fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } + + /// Set the `SourceLoc` for code from this offset until the offset at the + /// next call to `end_srcloc()`. + pub fn start_srcloc(&mut self, loc: SourceLoc) { + self.cur_srcloc = Some((self.cur_offset(), loc)); + } + + /// Mark the end of the `SourceLoc` segment started at the last + /// `start_srcloc()` call. + pub fn end_srcloc(&mut self) { + let (start, loc) = self + .cur_srcloc + .take() + .expect("end_srcloc() called without start_srcloc()"); + let end = self.cur_offset(); + // Skip zero-length extends. + debug_assert!(end >= start); + if end > start { + self.srclocs.push(MachSrcLoc { start, end, loc }); + } + } + + /// Add stack map metadata for this program point: a set of stack offsets + /// (from SP upward) that contain live references. + /// + /// The `offset_to_fp` value is the offset from the nominal SP (at which the `stack_offsets` + /// are based) and the FP value. By subtracting `offset_to_fp` from each `stack_offsets` + /// element, one can obtain live-reference offsets from FP instead. + pub fn add_stack_map(&mut self, extent: StackMapExtent, stack_map: StackMap) { + let (start, end) = match extent { + StackMapExtent::UpcomingBytes(insn_len) => { + let start_offset = self.cur_offset(); + (start_offset, start_offset + insn_len) + } + StackMapExtent::StartedAtOffset(start_offset) => { + let end_offset = self.cur_offset(); + debug_assert!(end_offset >= start_offset); + (start_offset, end_offset) + } + }; + self.stack_maps.push(MachStackMap { + offset: start, + offset_end: end, + stack_map, + }); + } +} + +impl MachBufferFinalized { + /// Get a list of source location mapping tuples in sorted-by-start-offset order. + pub fn get_srclocs_sorted(&self) -> &[MachSrcLoc] { + &self.srclocs[..] + } + + /// Get the total required size for the code. + pub fn total_size(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Emit this buffer to the given CodeSink. 
+ pub fn emit<CS: CodeSink>(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. + + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } + + /// Get the stack map metadata for this code. + pub fn stack_maps(&self) -> &[MachStackMap] { + &self.stack_maps[..] + } +} + +/// A constant that is deferred to the next constant-pool opportunity. +struct MachLabelConstant { + /// This label will refer to the constant's offset. + label: MachLabel, + /// Required alignment. + align: CodeOffset, + /// This data will be emitted when able. + data: SmallVec<[u8; 16]>, +} + +/// A fixup to perform on the buffer once code is emitted. Fixups always refer +/// to labels and patch the code based on label offsets. Hence, they are like +/// relocations, but internal to one buffer. +#[derive(Debug)] +struct MachLabelFixup<I: VCodeInst> { + /// The label whose offset controls this fixup. + label: MachLabel, + /// The offset to fix up / patch to refer to this label. + offset: CodeOffset, + /// The kind of fixup. This is architecture-specific; each architecture may have, + /// e.g., several types of branch instructions, each with differently-sized + /// offset fields and different places within the instruction to place the + /// bits. + kind: I::LabelUse, +} + +/// A relocation resulting from a compilation. +struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The kind of relocation. + kind: Reloc, + /// The external symbol / name to which this relocation refers. + name: ExternalName, + /// The addend to add to the symbol value. + addend: i64, +} + +/// A trap record resulting from a compilation. +struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The trap code. + code: TrapCode, +} + +/// A call site record resulting from a compilation. 
+struct MachCallSite {
+    /// The offset of the call's return address, *relative to the containing section*.
+    ret_addr: CodeOffset,
+    /// The original source location.
+    srcloc: SourceLoc,
+    /// The call's opcode.
+    opcode: Opcode,
+}
+
+/// A source-location mapping resulting from a compilation.
+#[derive(Clone, Debug)]
+pub struct MachSrcLoc {
+    /// The start of the region of code corresponding to a source location.
+    /// This is relative to the start of the function, not to the start of the
+    /// section.
+    pub start: CodeOffset,
+    /// The end of the region of code corresponding to a source location.
+    /// This is relative to the start of the function, not to the start of the
+    /// section.
+    pub end: CodeOffset,
+    /// The source location.
+    pub loc: SourceLoc,
+}
+
+/// Record of stack map metadata: stack offsets containing references.
+#[derive(Clone, Debug)]
+pub struct MachStackMap {
+    /// The code offset at which this stack map applies.
+    pub offset: CodeOffset,
+    /// The code offset just past the "end" of the instruction: that is, the
+    /// offset of the first byte of the following instruction, or equivalently,
+    /// the start offset plus the instruction length.
+    pub offset_end: CodeOffset,
+    /// The stack map itself.
+    pub stack_map: StackMap,
+}
+
+/// Record of a branch instruction in the buffer, to facilitate editing.
+#[derive(Clone, Debug)]
+struct MachBranch {
+    /// The buffer offset at which the branch instruction starts.
+    start: CodeOffset,
+    /// The buffer offset just past the end of the branch instruction.
+    end: CodeOffset,
+    /// The label this branch targets.
+    target: MachLabel,
+    /// Index of this branch's entry in `fixup_records`.
+    fixup: usize,
+    /// For a conditional branch, the machine-code bytes of the same branch
+    /// with its condition inverted; `None` for an unconditional branch.
+    inverted: Option<SmallVec<[u8; 8]>>,
+    /// All labels pointing to the start of this branch. For correctness, this
+    /// *must* be complete (i.e., must contain all labels whose resolved offsets
+    /// are at the start of this branch): we rely on being able to redirect all
+    /// labels that could jump to this branch before removing it, if it is
+    /// otherwise unreachable.
+    labels_at_this_branch: SmallVec<[MachLabel; 4]>,
+}
+
+impl MachBranch {
+    fn is_cond(&self) -> bool {
+        self.inverted.is_some()
+    }
+    fn is_uncond(&self) -> bool {
+        self.inverted.is_none()
+    }
+}
+
+// We use an actual instruction definition to do tests, so we depend on the `arm64` feature here.
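+//
+// A rough sketch of how to run just these tests (assuming a standalone
+// checkout of the crate rather than this vendored copy, and assuming the
+// module path is `machinst::buffer`):
+//
+//     cargo test -p cranelift-codegen --features arm64 machinst::buffer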
+#[cfg(all(test, feature = "arm64"))] +mod test { + use super::*; + use crate::isa::aarch64::inst::xreg; + use crate::isa::aarch64::inst::{BranchTarget, CondBrKind, EmitInfo, Inst}; + use crate::machinst::MachInstEmit; + use crate::settings; + use std::default::Default; + + fn label(n: u32) -> MachLabel { + MachLabel::from_block(n) + } + fn target(n: u32) -> BranchTarget { + BranchTarget::Label(label(n)) + } + + #[test] + fn test_elide_jump_to_next() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(2); + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + buf.bind_label(label(1)); + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_elide_trivial_jump_blocks() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_flip_cond() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Udf { + trap_code: TrapCode::Interrupt, + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::TrapIf { + kind: CondBrKind::NotZero(xreg(0)), + trap_code: TrapCode::Interrupt, + }; + inst.emit(&mut buf2, &info, &mut state); + let inst = Inst::Nop4; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(buf.data, buf2.data); + } + + #[test] + fn test_island() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(2), + not_taken: target(3), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + while buf.cur_offset() < 2000000 { + if buf.island_needed(0) { + buf.emit_island(); + } + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + } + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 8, buf.total_size()); + + let mut buf2 = 
MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(1048576 - 4), + not_taken: BranchTarget::ResolvedOffset(2000000 + 4 - 4), + }; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[0..8], &buf2.data[..]); + } + + #[test] + fn test_island_backward() { + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + while buf.cur_offset() < 2000000 { + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + } + + buf.bind_label(label(3)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(0), + not_taken: target(1), + }; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 12, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(8), + not_taken: BranchTarget::ResolvedOffset(4 - (2000000 + 4)), + }; + inst.emit(&mut buf2, &info, &mut state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(-(2000000 + 8)), + }; + inst.emit(&mut buf2, &info, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[2000000..], &buf2.data[..]); + } + + #[test] + fn test_multiple_redirect() { + // label0: + // cbz x0, label1 + // b label2 + // label1: + // b label3 + // label2: + // nop + // nop + // b label0 + // label3: + // b label4 + // label4: + // b label5 + // label5: + // b label7 + // label6: + // nop + // label7: + // ret + // + // -- should become: + // + // label0: + // cbz x0, label7 + // label2: + // nop + // nop + // b label0 + // label6: + // nop + // label7: + // ret + + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(8); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::Zero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + inst.emit(&mut buf, &info, &mut state); + let inst = Inst::Jump { dest: target(0) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Jump { dest: target(4) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(4)); + let inst = Inst::Jump { dest: target(5) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(5)); + let inst = Inst::Jump { dest: target(7) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(6)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(7)); + let inst = Inst::Ret; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + let golden_data = vec![ + 0xa0, 0x00, 0x00, 0xb4, // cbz x0, 0x14 + 0x1f, 0x20, 0x03, 0xd5, // nop + 0x1f, 0x20, 0x03, 0xd5, // 
nop + 0xfd, 0xff, 0xff, 0x17, // b 0 + 0x1f, 0x20, 0x03, 0xd5, // nop + 0xc0, 0x03, 0x5f, 0xd6, // ret + ]; + + assert_eq!(&golden_data[..], &buf.data[..]); + } + + #[test] + fn test_handle_branch_cycle() { + // label0: + // b label1 + // label1: + // b label2 + // label2: + // b label3 + // label3: + // b label4 + // label4: + // b label1 // note: not label0 (to make it interesting). + // + // -- should become: + // + // label0, label1, ..., label4: + // b label0 + let info = EmitInfo::new(settings::Flags::new(settings::builder())); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(5); + + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(2) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Jump { dest: target(4) }; + inst.emit(&mut buf, &info, &mut state); + + buf.bind_label(label(4)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &info, &mut state); + + let buf = buf.finish(); + + let golden_data = vec![ + 0x00, 0x00, 0x00, 0x14, // b 0 + ]; + + assert_eq!(&golden_data[..], &buf.data[..]); + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/compile.rs b/third_party/rust/cranelift-codegen/src/machinst/compile.rs new file mode 100644 index 0000000000..9a00cee805 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/compile.rs @@ -0,0 +1,110 @@ +//! Compilation backend pipeline: optimized IR to VCode / binemit. + +use crate::ir::Function; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use log::debug; +use regalloc::{allocate_registers_with_opts, Algorithm, Options, PrettyPrint}; + +/// Compile the given function down to VCode with allocated registers, ready +/// for binary emission. +pub fn compile<B: LowerBackend + MachBackend>( + f: &Function, + b: &B, + abi: Box<dyn ABICallee<I = B::MInst>>, + emit_info: <B::MInst as MachInstEmit>::Info, +) -> CodegenResult<VCode<B::MInst>> +where + B::MInst: PrettyPrint, +{ + // Compute lowered block order. + let block_order = BlockLoweringOrder::new(f); + // Build the lowering context. + let lower = Lower::new(f, abi, emit_info, block_order)?; + // Lower the IR. + let (mut vcode, stack_map_request_info) = { + let _tt = timing::vcode_lower(); + lower.lower(b)? + }; + + debug!( + "vcode from lowering: \n{}", + vcode.show_rru(Some(b.reg_universe())) + ); + + // Perform register allocation. 
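+    // For reference, the allocator variant matched on below is selected by
+    // the `regalloc` Cranelift setting; a caller could opt into the checked
+    // backtracking allocator roughly as follows (setting-value string assumed
+    // to mirror the variant name; `Configurable` must be in scope for `set`):
+    //
+    //     let mut builder = settings::builder();
+    //     builder.set("regalloc", "backtracking_checked").unwrap();
+    //     let flags = settings::Flags::new(builder);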
+ let (run_checker, algorithm) = match vcode.flags().regalloc() { + settings::Regalloc::Backtracking => (false, Algorithm::Backtracking(Default::default())), + settings::Regalloc::BacktrackingChecked => { + (true, Algorithm::Backtracking(Default::default())) + } + settings::Regalloc::ExperimentalLinearScan => { + (false, Algorithm::LinearScan(Default::default())) + } + settings::Regalloc::ExperimentalLinearScanChecked => { + (true, Algorithm::LinearScan(Default::default())) + } + }; + + #[cfg(feature = "regalloc-snapshot")] + { + use std::fs; + use std::path::Path; + if let Some(path) = std::env::var("SERIALIZE_REGALLOC").ok() { + let snapshot = regalloc::IRSnapshot::from_function(&vcode, b.reg_universe()); + let serialized = bincode::serialize(&snapshot).expect("couldn't serialize snapshot"); + + let file_path = Path::new(&path).join(Path::new(&format!("ir{}.bin", f.name))); + fs::write(file_path, &serialized).expect("couldn't write IR snapshot file"); + } + } + + // If either there are no reference-typed values, or else there are + // but there are no safepoints at which we need to know about them, + // then we don't need stack maps. + let sri = if stack_map_request_info.reftyped_vregs.len() > 0 + && stack_map_request_info.safepoint_insns.len() > 0 + { + Some(&stack_map_request_info) + } else { + None + }; + + let result = { + let _tt = timing::regalloc(); + allocate_registers_with_opts( + &mut vcode, + b.reg_universe(), + sri, + Options { + run_checker, + algorithm, + }, + ) + .map_err(|err| { + debug!( + "Register allocation error for vcode\n{}\nError: {:?}", + vcode.show_rru(Some(b.reg_universe())), + err + ); + err + }) + .expect("register allocation") + }; + + // Reorder vcode into final order and copy out final instruction sequence + // all at once. This also inserts prologues/epilogues. + { + let _tt = timing::vcode_post_ra(); + vcode.replace_insns_from_regalloc(result); + } + + debug!( + "vcode after regalloc: final version:\n{}", + vcode.show_rru(Some(b.reg_universe())) + ); + + Ok(vcode) +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/helpers.rs b/third_party/rust/cranelift-codegen/src/machinst/helpers.rs new file mode 100644 index 0000000000..0138da7670 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/helpers.rs @@ -0,0 +1,28 @@ +//! Miscellaneous helpers for machine backends. + +use super::{InsnOutput, LowerCtx, VCodeInst}; +use crate::ir::Type; +use regalloc::{Reg, Writable}; + +/// Returns the size (in bits) of a given type. +pub fn ty_bits(ty: Type) -> usize { + usize::from(ty.bits()) +} + +/// Is the type represented by an integer (not float) at the machine level? +pub(crate) fn ty_has_int_representation(ty: Type) -> bool { + ty.is_int() || ty.is_bool() || ty.is_ref() +} + +/// Is the type represented by a float or vector value at the machine level? +pub(crate) fn ty_has_float_or_vec_representation(ty: Type) -> bool { + ty.is_vector() || ty.is_float() +} + +/// Allocate a register for an instruction output and return it. +pub(crate) fn get_output_reg<I: VCodeInst, C: LowerCtx<I = I>>( + ctx: &mut C, + spec: InsnOutput, +) -> Writable<Reg> { + ctx.get_output(spec.insn, spec.output) +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs b/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs new file mode 100644 index 0000000000..1ff6faedd4 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/inst_common.rs @@ -0,0 +1,59 @@ +//! 
A place to park MachInst::Inst fragments which are common across multiple architectures. + +use crate::ir::{self, Inst as IRInst}; + +//============================================================================ +// Instruction input "slots". +// +// We use these types to refer to operand numbers, and result numbers, together +// with the associated instruction, in a type-safe way. + +/// Identifier for a particular input of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) struct InsnInput { + pub(crate) insn: IRInst, + pub(crate) input: usize, +} + +/// Identifier for a particular output of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) struct InsnOutput { + pub(crate) insn: IRInst, + pub(crate) output: usize, +} + +//============================================================================ +// Atomic instructions. + +/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64 +/// targets. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum AtomicRmwOp { + /// Add + Add, + /// Sub + Sub, + /// And + And, + /// Or + Or, + /// Exclusive Or + Xor, + /// Exchange (swap operands) + Xchg, +} + +impl AtomicRmwOp { + /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`. + pub fn from(ir_op: ir::AtomicRmwOp) -> Self { + match ir_op { + ir::AtomicRmwOp::Add => AtomicRmwOp::Add, + ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub, + ir::AtomicRmwOp::And => AtomicRmwOp::And, + ir::AtomicRmwOp::Or => AtomicRmwOp::Or, + ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor, + ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg, + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/lower.rs b/third_party/rust/cranelift-codegen/src/machinst/lower.rs new file mode 100644 index 0000000000..531085ae4e --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/lower.rs @@ -0,0 +1,1084 @@ +//! This module implements lowering (instruction selection) from Cranelift IR +//! to machine instructions with virtual registers. This is *almost* the final +//! machine code, except for register allocation. + +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit}; +use crate::ir::instructions::BranchInfo; +use crate::ir::types::I64; +use crate::ir::{ + ArgumentPurpose, Block, Constant, ConstantData, ExternalName, Function, GlobalValueData, Inst, + InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, +}; +use crate::machinst::{ + ABICallee, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder, + VCodeConstant, VCodeConstantData, VCodeConstants, VCodeInst, +}; +use crate::CodegenResult; + +use regalloc::{Reg, RegClass, StackmapRequestInfo, VirtualReg, Writable}; + +use crate::data_value::DataValue; +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::convert::TryInto; +use log::debug; +use smallvec::SmallVec; + +/// An "instruction color" partitions CLIF instructions by side-effecting ops. +/// All instructions with the same "color" are guaranteed not to be separated by +/// any side-effecting op (for this purpose, loads are also considered +/// side-effecting, to avoid subtle questions w.r.t. the memory model), and +/// furthermore, it is guaranteed that for any two instructions A and B such +/// that color(A) == color(B), either A dominates B and B postdominates A, or +/// vice-versa. 
(For now, in practice, only ops in the same basic block can ever +/// have the same color, trivially providing the second condition.) Intuitively, +/// this means that the ops of the same color must always execute "together", as +/// part of one atomic contiguous section of the dynamic execution trace, and +/// they can be freely permuted (modulo true dataflow dependencies) without +/// affecting program behavior. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct InstColor(u32); +impl InstColor { + fn new(n: u32) -> InstColor { + InstColor(n) + } + + /// Get an arbitrary index representing this color. The index is unique + /// *within a single function compilation*, but indices may be reused across + /// functions. + pub fn get(self) -> u32 { + self.0 + } +} + +/// A context that machine-specific lowering code can use to emit lowered +/// instructions. This is the view of the machine-independent per-function +/// lowering context that is seen by the machine backend. +pub trait LowerCtx { + /// The instruction type for which this lowering framework is instantiated. + type I: VCodeInst; + + // Function-level queries: + + /// Get the `ABICallee`. + fn abi(&mut self) -> &mut dyn ABICallee<I = Self::I>; + /// Get the (virtual) register that receives the return value. A return + /// instruction should lower into a sequence that fills this register. (Why + /// not allow the backend to specify its own result register for the return? + /// Because there may be multiple return points.) + fn retval(&self, idx: usize) -> Writable<Reg>; + /// Returns the vreg containing the VmContext parameter, if there's one. + fn get_vm_context(&self) -> Option<Reg>; + + // General instruction queries: + + /// Get the instdata for a given IR instruction. + fn data(&self, ir_inst: Inst) -> &InstructionData; + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type; + /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple + /// providing this name and the "relocation distance", i.e., whether the backend + /// can assume the target will be "nearby" (within some small offset) or an + /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name, relocation distance estimate, and offset for a + /// symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option<MemFlags>; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Get the side-effect color of the given instruction (specifically, at the + /// program point just prior to the instruction). The "color" changes at + /// every side-effecting op; the backend should not try to merge across + /// side-effect colors unless the op being merged is known to be pure. + fn inst_color(&self, ir_inst: Inst) -> InstColor; + + // Instruction input/output queries: + + /// Get the number of inputs to the given IR instruction. + fn num_inputs(&self, ir_inst: Inst) -> usize; + /// Get the number of outputs to the given IR instruction. 
+ fn num_outputs(&self, ir_inst: Inst) -> usize; + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the value of a constant instruction (`iconst`, etc.) as a 64-bit + /// value, if possible. + fn get_constant(&self, ir_inst: Inst) -> Option<u64>; + /// Get the input in any combination of three forms: + /// + /// - An instruction, if the same color as this instruction or if the + /// producing instruction has no side effects (thus in both cases + /// mergeable); + /// - A constant, if the value is a constant; + /// - A register. + /// + /// The instruction input may be available in some or all of these + /// forms. More than one is possible: e.g., it may be produced by an + /// instruction in the same block, but may also have been forced into a + /// register already by an earlier op. It will *always* be available + /// in a register, at least. + /// + /// If the backend uses the register, rather than one of the other + /// forms (constant or merging of the producing op), it must call + /// `use_input_reg()` to ensure the producing inst is actually lowered + /// as well. Failing to do so may result in the instruction that generates + /// this value never being generated, thus resulting in incorrect execution. + /// For correctness, backends should thus wrap `get_input()` and + /// `use_input_regs()` with helpers that return a register only after + /// ensuring it is marked as used. + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; + /// Get the `idx`th output register of the given IR instruction. When + /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that + /// the backend will write results to these output register(s). This + /// register will always be "fresh"; it is guaranteed not to overlap with + /// any of the inputs, and can be freely used as a scratch register within + /// the lowered instruction sequence, as long as its final value is the + /// result of the computation. + fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg>; + + // Codegen primitives: allocate temps, emit instructions, set result registers, + // ask for an input to be gen'd into a register. + + /// Get a new temp. + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg>; + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: Self::I); + /// Emit a machine instruction that is a safepoint. + fn emit_safepoint(&mut self, mach_inst: Self::I); + /// Indicate that the given input uses the register returned by + /// `get_input()`. Codegen may not happen otherwise for the producing + /// instruction if it has no side effects and no uses. + fn use_input_reg(&mut self, input: LowerInput); + /// Is the given register output needed after the given instruction? Allows + /// instructions with multiple outputs to make fine-grained decisions on + /// which outputs to actually generate. + fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool; + /// Retrieve constant data given a handle. + fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData; + /// Indicate that a constant should be emitted. + fn use_constant(&mut self, constant: VCodeConstantData) -> VCodeConstant; + /// Retrieve the value immediate from an instruction. This will perform necessary lookups on the + /// `DataFlowGraph` to retrieve even large immediates. 
+ fn get_immediate(&self, ir_inst: Inst) -> Option<DataValue>; + /// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg + /// if `reg` is a real reg. `ty` describes the type of the value in `reg`. + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg; +} + +/// A representation of all of the ways in which an instruction input is +/// available: as a producing instruction (in the same color-partition), as a +/// constant, and/or in an existing register. See [LowerCtx::get_input] for more +/// details. +#[derive(Clone, Copy, Debug)] +pub struct LowerInput { + /// The value is live in a register. This option is always available. Call + /// [LowerCtx::use_input_reg()] if the register is used. + pub reg: Reg, + /// An instruction produces this value; the instruction's result index that + /// produces this value is given. + pub inst: Option<(Inst, usize)>, + /// The value is a known constant. + pub constant: Option<u64>, +} + +/// A machine backend. +pub trait LowerBackend { + /// The machine instruction type. + type MInst: VCodeInst; + + /// Lower a single instruction. + /// + /// For a branch, this function should not generate the actual branch + /// instruction. However, it must force any values it needs for the branch + /// edge (block-param actuals) into registers, because the actual branch + /// generation (`lower_branch_group()`) happens *after* any possible merged + /// out-edge. + fn lower<C: LowerCtx<I = Self::MInst>>(&self, ctx: &mut C, inst: Inst) -> CodegenResult<()>; + + /// Lower a block-terminating group of branches (which together can be seen + /// as one N-way branch), given a vcode MachLabel for each target. + fn lower_branch_group<C: LowerCtx<I = Self::MInst>>( + &self, + ctx: &mut C, + insts: &[Inst], + targets: &[MachLabel], + fallthrough: Option<MachLabel>, + ) -> CodegenResult<()>; + + /// A bit of a hack: give a fixed register that always holds the result of a + /// `get_pinned_reg` instruction, if known. This allows elision of moves + /// into the associated vreg, instead using the real reg directly. + fn maybe_pinned_reg(&self) -> Option<Reg> { + None + } +} + +/// A pending instruction to insert and auxiliary information about it: its source location and +/// whether it is a safepoint. +struct InstTuple<I: VCodeInst> { + loc: SourceLoc, + is_safepoint: bool, + inst: I, +} + +/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence +/// from original Inst to MachInsts. +pub struct Lower<'func, I: VCodeInst> { + /// The function to lower. + f: &'func Function, + + /// Lowered machine instructions. + vcode: VCodeBuilder<I>, + + /// Mapping from `Value` (SSA value in IR) to virtual register. + value_regs: SecondaryMap<Value, Reg>, + + /// Return-value vregs. + retval_regs: Vec<Reg>, + + /// Instruction colors. + inst_colors: SecondaryMap<Inst, InstColor>, + + /// Instruction constant values, if known. + inst_constants: FxHashMap<Inst, u64>, + + /// Instruction has a side-effect and must be codegen'd. + inst_needed: SecondaryMap<Inst, bool>, + + /// Value (vreg) is needed and producer must be codegen'd. + vreg_needed: Vec<bool>, + + /// Next virtual register number to allocate. + next_vreg: u32, + + /// Insts in reverse block order, before final copy to vcode. + block_insts: Vec<InstTuple<I>>, + + /// Ranges in `block_insts` constituting BBs. + block_ranges: Vec<(usize, usize)>, + + /// Instructions collected for the BB in progress, in reverse order, with + /// source-locs attached. 
+ bb_insts: Vec<InstTuple<I>>, + + /// Instructions collected for the CLIF inst in progress, in forward order. + ir_insts: Vec<InstTuple<I>>, + + /// The register to use for GetPinnedReg, if any, on this architecture. + pinned_reg: Option<Reg>, + + /// The vreg containing the special VmContext parameter, if it is present in the current + /// function's signature. + vm_context: Option<Reg>, +} + +/// Notion of "relocation distance". This gives an estimate of how far away a symbol will be from a +/// reference. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RelocDistance { + /// Target of relocation is "nearby". The threshold for this is fuzzy but should be interpreted + /// as approximately "within the compiled output of one module"; e.g., within AArch64's +/- + /// 128MB offset. If unsure, use `Far` instead. + Near, + /// Target of relocation could be anywhere in the address space. + Far, +} + +fn alloc_vreg( + value_regs: &mut SecondaryMap<Value, Reg>, + regclass: RegClass, + value: Value, + next_vreg: &mut u32, +) -> VirtualReg { + if value_regs[value].is_invalid() { + // default value in map. + let v = *next_vreg; + *next_vreg += 1; + value_regs[value] = Reg::new_virtual(regclass, v); + debug!("value {} gets vreg {:?}", value, v); + } + value_regs[value].as_virtual_reg().unwrap() +} + +enum GenerateReturn { + Yes, + No, +} + +impl<'func, I: VCodeInst> Lower<'func, I> { + /// Prepare a new lowering context for the given IR function. + pub fn new( + f: &'func Function, + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + ) -> CodegenResult<Lower<'func, I>> { + let constants = VCodeConstants::with_capacity(f.dfg.constants.len()); + let mut vcode = VCodeBuilder::new(abi, emit_info, block_order, constants); + + let mut next_vreg: u32 = 0; + + let mut value_regs = SecondaryMap::with_default(Reg::invalid()); + + // Assign a vreg to each block param and each inst result. + for bb in f.layout.blocks() { + for ¶m in f.dfg.block_params(bb) { + let ty = f.dfg.value_type(param); + let vreg = alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, param, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!("bb {} param {}: vreg {:?}", bb, param, vreg); + } + for inst in f.layout.block_insts(bb) { + for &result in f.dfg.inst_results(inst) { + let ty = f.dfg.value_type(result); + let vreg = + alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, result, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!( + "bb {} inst {} ({:?}): result vreg {:?}", + bb, inst, f.dfg[inst], vreg + ); + } + } + } + + let vm_context = f + .signature + .special_param_index(ArgumentPurpose::VMContext) + .map(|vm_context_index| { + let entry_block = f.layout.entry_block().unwrap(); + let param = f.dfg.block_params(entry_block)[vm_context_index]; + value_regs[param] + }); + + // Assign a vreg to each return value. + let mut retval_regs = vec![]; + for ret in &f.signature.returns { + let v = next_vreg; + next_vreg += 1; + let regclass = I::rc_for_type(ret.value_type)?; + let vreg = Reg::new_virtual(regclass, v); + retval_regs.push(vreg); + vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); + } + + // Compute instruction colors, find constant instructions, and find instructions with + // side-effects, in one combined pass. 
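+        // Illustrative example of the coloring scheme implemented below
+        // (CLIF shown schematically; concrete color numbers are arbitrary
+        // per-function counters):
+        //
+        //     v1 = iadd v0, v0     ; color 1
+        //     v2 = load.i64 v1     ; color 1, side-effecting -> bump color
+        //     v3 = iadd v2, v2     ; color 2
+        //     store v3, v1         ; color 2, side-effecting -> bump color
+        //     v4 = iadd v3, v3     ; color 3
+        //
+        // Pure producers (the `iadd`s here) may be merged into any consumer;
+        // side-effecting producers may only be merged into a consumer of the
+        // same color (see `get_input_for_val` below).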
+ let mut cur_color = 0; + let mut inst_colors = SecondaryMap::with_default(InstColor::new(0)); + let mut inst_constants = FxHashMap::default(); + let mut inst_needed = SecondaryMap::with_default(false); + for bb in f.layout.blocks() { + cur_color += 1; + for inst in f.layout.block_insts(bb) { + let side_effect = has_lowering_side_effect(f, inst); + + // Assign colors. A new color is chosen *after* any side-effecting instruction. + inst_colors[inst] = InstColor::new(cur_color); + debug!("bb {} inst {} has color {}", bb, inst, cur_color); + if side_effect { + debug!(" -> side-effecting"); + inst_needed[inst] = true; + cur_color += 1; + } + + // Determine if this is a constant; if so, add to the table. + if let Some(c) = is_constant_64bit(f, inst) { + debug!(" -> constant: {}", c); + inst_constants.insert(inst, c); + } + } + } + + let vreg_needed = std::iter::repeat(false).take(next_vreg as usize).collect(); + + Ok(Lower { + f, + vcode, + value_regs, + retval_regs, + inst_colors, + inst_constants, + inst_needed, + vreg_needed, + next_vreg, + block_insts: vec![], + block_ranges: vec![], + bb_insts: vec![], + ir_insts: vec![], + pinned_reg: None, + vm_context, + }) + } + + fn gen_arg_setup(&mut self) { + if let Some(entry_bb) = self.f.layout.entry_block() { + debug!( + "gen_arg_setup: entry BB {} args are:\n{:?}", + entry_bb, + self.f.dfg.block_params(entry_bb) + ); + for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { + if !self.vcode.abi().arg_is_needed_in_body(i) { + continue; + } + let reg = Writable::from_reg(self.value_regs[*param]); + let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); + self.emit(insn); + } + if let Some(insn) = self.vcode.abi().gen_retval_area_setup() { + self.emit(insn); + } + } + } + + fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { + let retval_regs = self.retval_regs.clone(); + for (i, reg) in retval_regs.into_iter().enumerate() { + let reg = Writable::from_reg(reg); + let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg); + for insn in insns { + self.emit(insn); + } + } + let inst = match gen_ret_inst { + GenerateReturn::Yes => self.vcode.abi().gen_ret(), + GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), + }; + self.emit(inst); + } + + fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> { + debug!("lower_edge: pred {} succ {}", pred, succ); + + let num_args = self.f.dfg.block_params(succ).len(); + debug_assert!(num_args == self.f.dfg.inst_variable_args(inst).len()); + + // Most blocks have no params, so skip all the hoop-jumping below and make an early exit. + if num_args == 0 { + return Ok(()); + } + + // Make up two vectors of info: + // + // * one for dsts which are to be assigned constants. We'll deal with those second, so + // as to minimise live ranges. + // + // * one for dsts whose sources are non-constants. 
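+        // Worked example (SSA names are illustrative): for an edge such as
+        //
+        //     brnz v9, block5(v1, v2, v3)    ; where v3 is defined by an iconst
+        //     ...
+        //   block5(v10: i64, v11: i64, v12: i64):
+        //
+        // the moves (v10 <- v1) and (v11 <- v2) go into the non-constant
+        // bundle and (v12 <- constant) goes into the constant bundle. If no
+        // destination vreg also appears as a source, the non-constant moves
+        // are emitted directly; otherwise they are staged through fresh
+        // temporaries so that no source is overwritten before it is read.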
+ + let mut const_bundles = SmallVec::<[(Type, Writable<Reg>, u64); 16]>::new(); + let mut var_bundles = SmallVec::<[(Type, Writable<Reg>, Reg); 16]>::new(); + + let mut i = 0; + for (dst_val, src_val) in self + .f + .dfg + .block_params(succ) + .iter() + .zip(self.f.dfg.inst_variable_args(inst).iter()) + { + let src_val = self.f.dfg.resolve_aliases(*src_val); + let ty = self.f.dfg.value_type(src_val); + + debug_assert!(ty == self.f.dfg.value_type(*dst_val)); + let dst_reg = self.value_regs[*dst_val]; + + let input = self.get_input_for_val(inst, src_val); + debug!("jump arg {} is {}, reg {:?}", i, src_val, input.reg); + i += 1; + + if let Some(c) = input.constant { + const_bundles.push((ty, Writable::from_reg(dst_reg), c)); + } else { + self.use_input_reg(input); + let src_reg = input.reg; + // Skip self-assignments. Not only are they pointless, they falsely trigger the + // overlap-check below and hence can cause a lot of unnecessary copying through + // temporaries. + if dst_reg != src_reg { + var_bundles.push((ty, Writable::from_reg(dst_reg), src_reg)); + } + } + } + + // Deal first with the moves whose sources are variables. + + // FIXME: use regalloc.rs' SparseSetU here. This would avoid all heap allocation + // for cases of up to circa 16 args. Currently not possible because regalloc.rs + // does not export it. + let mut src_reg_set = FxHashSet::<Reg>::default(); + for (_, _, src_reg) in &var_bundles { + src_reg_set.insert(*src_reg); + } + let mut overlaps = false; + for (_, dst_reg, _) in &var_bundles { + if src_reg_set.contains(&dst_reg.to_reg()) { + overlaps = true; + break; + } + } + + // If, as is mostly the case, the source and destination register sets are non + // overlapping, then we can copy directly, so as to save the register allocator work. + if !overlaps { + for (ty, dst_reg, src_reg) in &var_bundles { + self.emit(I::gen_move(*dst_reg, *src_reg, *ty)); + } + } else { + // There's some overlap, so play safe and copy via temps. + let mut tmp_regs = SmallVec::<[Writable<Reg>; 16]>::new(); + for (ty, _, _) in &var_bundles { + tmp_regs.push(self.alloc_tmp(I::rc_for_type(*ty)?, *ty)); + } + for ((ty, _, src_reg), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { + self.emit(I::gen_move(*tmp_reg, *src_reg, *ty)); + } + for ((ty, dst_reg, _), tmp_reg) in var_bundles.iter().zip(tmp_regs.iter()) { + self.emit(I::gen_move(*dst_reg, (*tmp_reg).to_reg(), *ty)); + } + } + + // Now, finally, deal with the moves whose sources are constants. + for (ty, dst_reg, const_u64) in &const_bundles { + for inst in I::gen_constant(*dst_reg, *const_u64, *ty, |reg_class, ty| { + self.alloc_tmp(reg_class, ty) + }) + .into_iter() + { + self.emit(inst); + } + } + + Ok(()) + } + + fn lower_clif_block<B: LowerBackend<MInst = I>>( + &mut self, + backend: &B, + block: Block, + ) -> CodegenResult<()> { + // Lowering loop: + // - For each non-branch instruction, in reverse order: + // - If side-effecting (load, store, branch/call/return, possible trap), or if + // used outside of this block, or if demanded by another inst, then lower. + // + // That's it! Lowering of side-effecting ops will force all *needed* + // (live) non-side-effecting ops to be lowered at the right places, via + // the `use_input_reg()` callback on the `LowerCtx` (that's us). That's + // because `use_input_reg()` sets the eager/demand bit for any insts + // whose result registers are used. + // + // We build up the BB in reverse instruction order in `bb_insts`. 
+ // Because the machine backend calls `ctx.emit()` in forward order, we + // collect per-IR-inst lowered instructions in `ir_insts`, then reverse + // these and append to `bb_insts` as we go backward through the block. + // `bb_insts` are then reversed again and appended to the VCode at the + // end of the BB (in the toplevel driver `lower()`). + for inst in self.f.layout.block_insts(block).rev() { + let data = &self.f.dfg[inst]; + let value_needed = self + .f + .dfg + .inst_results(inst) + .iter() + .any(|&result| self.vreg_needed[self.value_regs[result].get_index()]); + debug!( + "lower_clif_block: block {} inst {} ({:?}) is_branch {} inst_needed {} value_needed {}", + block, + inst, + data, + data.opcode().is_branch(), + self.inst_needed[inst], + value_needed, + ); + if self.f.dfg[inst].opcode().is_branch() { + continue; + } + // Normal instruction: codegen if eager bit is set. (Other instructions may also be + // codegened if not eager when they are used by another instruction.) + if self.inst_needed[inst] || value_needed { + debug!("lowering: inst {}: {:?}", inst, self.f.dfg[inst]); + backend.lower(self, inst)?; + } + if data.opcode().is_return() { + // Return: handle specially, using ABI-appropriate sequence. + let gen_ret = if data.opcode() == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(data.opcode() == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + } + + let loc = self.srcloc(inst); + self.finish_ir_inst(loc); + } + Ok(()) + } + + fn finish_ir_inst(&mut self, loc: SourceLoc) { + // `bb_insts` is kept in reverse order, so emit the instructions in + // reverse order. + for mut tuple in self.ir_insts.drain(..).rev() { + tuple.loc = loc; + self.bb_insts.push(tuple); + } + } + + fn finish_bb(&mut self) { + let start = self.block_insts.len(); + for tuple in self.bb_insts.drain(..).rev() { + self.block_insts.push(tuple); + } + let end = self.block_insts.len(); + self.block_ranges.push((start, end)); + } + + fn copy_bbs_to_vcode(&mut self) { + for &(start, end) in self.block_ranges.iter().rev() { + for &InstTuple { + loc, + is_safepoint, + ref inst, + } in &self.block_insts[start..end] + { + self.vcode.set_srcloc(loc); + self.vcode.push(inst.clone(), is_safepoint); + } + self.vcode.end_bb(); + } + } + + fn lower_clif_branches<B: LowerBackend<MInst = I>>( + &mut self, + backend: &B, + block: Block, + branches: &SmallVec<[Inst; 2]>, + targets: &SmallVec<[MachLabel; 2]>, + maybe_fallthrough: Option<MachLabel>, + ) -> CodegenResult<()> { + debug!( + "lower_clif_branches: block {} branches {:?} targets {:?} maybe_fallthrough {:?}", + block, branches, targets, maybe_fallthrough + ); + backend.lower_branch_group(self, branches, targets, maybe_fallthrough)?; + let loc = self.srcloc(branches[0]); + self.finish_ir_inst(loc); + Ok(()) + } + + fn collect_branches_and_targets( + &self, + bindex: BlockIndex, + _bb: Block, + branches: &mut SmallVec<[Inst; 2]>, + targets: &mut SmallVec<[MachLabel; 2]>, + ) { + branches.clear(); + targets.clear(); + let mut last_inst = None; + for &(inst, succ) in self.vcode.block_order().succ_indices(bindex) { + // Avoid duplicates: this ensures a br_table is only inserted once. + if last_inst != Some(inst) { + branches.push(inst); + } else { + debug_assert!(self.f.dfg[inst].opcode() == Opcode::BrTable); + debug_assert!(branches.len() == 1); + } + last_inst = Some(inst); + targets.push(MachLabel::from_block(succ)); + } + } + + /// Lower the function. 
+ pub fn lower<B: LowerBackend<MInst = I>>( + mut self, + backend: &B, + ) -> CodegenResult<(VCode<I>, StackmapRequestInfo)> { + debug!("about to lower function: {:?}", self.f); + + // Initialize the ABI object, giving it a temp if requested. + let maybe_tmp = if self.vcode.abi().temp_needed() { + Some(self.alloc_tmp(RegClass::I64, I64)) + } else { + None + }; + self.vcode.abi().init(maybe_tmp); + + // Get the pinned reg here (we only parameterize this function on `B`, + // not the whole `Lower` impl). + self.pinned_reg = backend.maybe_pinned_reg(); + + self.vcode.set_entry(0); + + // Reused vectors for branch lowering. + let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); + let mut targets: SmallVec<[MachLabel; 2]> = SmallVec::new(); + + // get a copy of the lowered order; we hold this separately because we + // need a mut ref to the vcode to mutate it below. + let lowered_order: SmallVec<[LoweredBlock; 64]> = self + .vcode + .block_order() + .lowered_order() + .iter() + .cloned() + .collect(); + + // Main lowering loop over lowered blocks. + for (bindex, lb) in lowered_order.iter().enumerate().rev() { + let bindex = bindex as BlockIndex; + + // Lower the block body in reverse order (see comment in + // `lower_clif_block()` for rationale). + + // End branches. + if let Some(bb) = lb.orig_block() { + self.collect_branches_and_targets(bindex, bb, &mut branches, &mut targets); + if branches.len() > 0 { + let maybe_fallthrough = if (bindex + 1) < (lowered_order.len() as BlockIndex) { + Some(MachLabel::from_block(bindex + 1)) + } else { + None + }; + self.lower_clif_branches(backend, bb, &branches, &targets, maybe_fallthrough)?; + self.finish_ir_inst(self.srcloc(branches[0])); + } + } else { + // If no orig block, this must be a pure edge block; get the successor and + // emit a jump. + let (_, succ) = self.vcode.block_order().succ_indices(bindex)[0]; + self.emit(I::gen_jump(MachLabel::from_block(succ))); + self.finish_ir_inst(SourceLoc::default()); + } + + // Out-edge phi moves. + if let Some((pred, inst, succ)) = lb.out_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + // Original block body. + if let Some(bb) = lb.orig_block() { + self.lower_clif_block(backend, bb)?; + } + // In-edge phi moves. + if let Some((pred, inst, succ)) = lb.in_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + + if bindex == 0 { + // Set up the function with arg vreg inits. + self.gen_arg_setup(); + self.finish_ir_inst(SourceLoc::default()); + } + + self.finish_bb(); + } + + self.copy_bbs_to_vcode(); + + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. + let (vcode, stack_map_info) = self.vcode.build(); + debug!("built vcode: {:?}", vcode); + + Ok((vcode, stack_map_info)) + } + + /// Get the actual inputs for a value. This is the implementation for + /// `get_input()` but starting from the SSA value, which is not exposed to + /// the backend. + fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { + debug!("get_input_for_val: val {} at inst {}", val, at_inst); + let mut reg = self.value_regs[val]; + debug!(" -> reg {:?}", reg); + assert!(reg.is_valid()); + let mut inst = match self.f.dfg.value_def(val) { + // OK to merge source instruction if (i) we have a source + // instruction, and either (ii-a) it has no side effects, or (ii-b) + // it has the same color as this instruction. 
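+            // Backend-side view of this decision (sketch; `ctx` is a
+            // `LowerCtx` and `materialize_constant` is a hypothetical
+            // helper): a lowering rule that does not merge the producer must
+            // mark the register as used so the producer is still generated:
+            //
+            //     let input = ctx.get_input(ir_inst, 0);
+            //     let reg = if let Some(c) = input.constant {
+            //         materialize_constant(ctx, c)   // hypothetical helper
+            //     } else {
+            //         ctx.use_input_reg(input);      // keep the producer alive
+            //         input.reg
+            //     };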
+ ValueDef::Result(src_inst, result_idx) => { + debug!(" -> src inst {}", src_inst); + debug!( + " -> has lowering side effect: {}", + has_lowering_side_effect(self.f, src_inst) + ); + debug!( + " -> our color is {:?}, src inst is {:?}", + self.inst_color(at_inst), + self.inst_color(src_inst) + ); + if !has_lowering_side_effect(self.f, src_inst) + || self.inst_color(at_inst) == self.inst_color(src_inst) + { + Some((src_inst, result_idx)) + } else { + None + } + } + _ => None, + }; + let constant = inst.and_then(|(inst, _)| self.get_constant(inst)); + + // Pinned-reg hack: if backend specifies a fixed pinned register, use it + // directly when we encounter a GetPinnedReg op, rather than lowering + // the actual op, and do not return the source inst to the caller; the + // value comes "out of the ether" and we will not force generation of + // the superfluous move. + if let Some((i, _)) = inst { + if self.f.dfg[i].opcode() == Opcode::GetPinnedReg { + if let Some(pr) = self.pinned_reg { + reg = pr; + } + inst = None; + } + } + + LowerInput { + reg, + inst, + constant, + } + } +} + +impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { + type I = I; + + fn abi(&mut self) -> &mut dyn ABICallee<I = I> { + self.vcode.abi() + } + + fn retval(&self, idx: usize) -> Writable<Reg> { + Writable::from_reg(self.retval_regs[idx]) + } + + fn get_vm_context(&self) -> Option<Reg> { + self.vm_context + } + + fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } + | &InstructionData::FuncAddr { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + let dist = funcdata.reloc_distance(); + Some((&funcdata.name, dist)) + } + _ => None, + } + } + + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&self.f.dfg.signatures[funcdata.signature]) + } + &InstructionData::CallIndirect { sig_ref, .. } => Some(&self.f.dfg.signatures[sig_ref]), + _ => None, + } + } + + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)> { + match &self.f.dfg[ir_inst] { + &InstructionData::UnaryGlobalValue { global_value, .. } => { + let gvdata = &self.f.global_values[global_value]; + match gvdata { + &GlobalValueData::Symbol { + ref name, + ref offset, + .. + } => { + let offset = offset.bits(); + let dist = gvdata.maybe_reloc_distance().unwrap(); + Some((name, dist, offset)) + } + _ => None, + } + } + _ => None, + } + } + + fn memflags(&self, ir_inst: Inst) -> Option<MemFlags> { + match &self.f.dfg[ir_inst] { + &InstructionData::AtomicCas { flags, .. } => Some(flags), + &InstructionData::AtomicRmw { flags, .. } => Some(flags), + &InstructionData::Load { flags, .. } + | &InstructionData::LoadComplex { flags, .. } + | &InstructionData::LoadNoOffset { flags, .. } + | &InstructionData::Store { flags, .. } + | &InstructionData::StoreComplex { flags, .. } => Some(flags), + &InstructionData::StoreNoOffset { flags, .. 
} => Some(flags), + _ => None, + } + } + + fn srcloc(&self, ir_inst: Inst) -> SourceLoc { + self.f.srclocs[ir_inst] + } + + fn inst_color(&self, ir_inst: Inst) -> InstColor { + self.inst_colors[ir_inst] + } + + fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + fn get_constant(&self, ir_inst: Inst) -> Option<u64> { + self.inst_constants.get(&ir_inst).cloned() + } + + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.get_input_for_val(ir_inst, val) + } + + fn get_output(&self, ir_inst: Inst, idx: usize) -> Writable<Reg> { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + Writable::from_reg(self.value_regs[val]) + } + + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable<Reg> { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + fn emit(&mut self, mach_inst: I) { + self.ir_insts.push(InstTuple { + loc: SourceLoc::default(), + is_safepoint: false, + inst: mach_inst, + }); + } + + fn emit_safepoint(&mut self, mach_inst: I) { + self.ir_insts.push(InstTuple { + loc: SourceLoc::default(), + is_safepoint: true, + inst: mach_inst, + }); + } + + fn use_input_reg(&mut self, input: LowerInput) { + debug!("use_input_reg: vreg {:?} is needed", input.reg); + // We may directly return a real (machine) register when we know that register holds the + // result of an opcode (e.g. GetPinnedReg). + if input.reg.is_virtual() { + self.vreg_needed[input.reg.get_index()] = true; + } + } + + fn is_reg_needed(&self, ir_inst: Inst, reg: Reg) -> bool { + self.inst_needed[ir_inst] || self.vreg_needed[reg.get_index()] + } + + fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData { + self.f.dfg.constants.get(constant_handle) + } + + fn use_constant(&mut self, constant: VCodeConstantData) -> VCodeConstant { + self.vcode.constants().insert(constant) + } + + fn get_immediate(&self, ir_inst: Inst) -> Option<DataValue> { + let inst_data = self.data(ir_inst); + match inst_data { + InstructionData::Shuffle { mask, .. } => { + let buffer = self.f.dfg.immediates.get(mask.clone()).unwrap().as_slice(); + let value = DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")); + Some(value) + } + InstructionData::UnaryConst { + constant_handle, .. + } => { + let buffer = self.f.dfg.constants.get(constant_handle.clone()).as_slice(); + let value = DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")); + Some(value) + } + _ => inst_data.imm_value(), + } + } + + fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg { + if reg.is_virtual() { + reg + } else { + let rc = reg.get_class(); + let new_reg = self.alloc_tmp(rc, ty); + self.emit(I::gen_move(new_reg, reg, ty)); + new_reg.to_reg() + } + } +} + +/// Visit all successors of a block with a given visitor closure. 
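+/// A small, hedged usage sketch (`f` is assumed to be an `ir::Function` and
+/// `block` one of its blocks; marked `ignore` because this is a crate-private
+/// helper):
+///
+/// ```ignore
+/// let mut succs = vec![];
+/// visit_block_succs(&f, block, |_branch_inst, succ| succs.push(succ));
+/// // `succs` now holds every successor reached by `block`'s branches,
+/// // including all jump-table targets.
+/// ```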
+pub(crate) fn visit_block_succs<F: FnMut(Inst, Block)>(f: &Function, block: Block, mut visit: F) { + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode().is_branch() { + visit_branch_targets(f, block, inst, &mut visit); + } + } +} + +fn visit_branch_targets<F: FnMut(Inst, Block)>( + f: &Function, + block: Block, + inst: Inst, + visit: &mut F, +) { + if f.dfg[inst].opcode() == Opcode::Fallthrough { + visit(inst, f.layout.next_block(block).unwrap()); + } else { + match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { + BranchInfo::NotABranch => {} + BranchInfo::SingleDest(dest, _) => { + visit(inst, dest); + } + BranchInfo::Table(table, maybe_dest) => { + if let Some(dest) = maybe_dest { + visit(inst, dest); + } + for &dest in f.jump_tables[table].as_slice() { + visit(inst, dest); + } + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/mod.rs b/third_party/rust/cranelift-codegen/src/machinst/mod.rs new file mode 100644 index 0000000000..4b12f2fd1d --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/mod.rs @@ -0,0 +1,421 @@ +//! This module exposes the machine-specific backend definition pieces. +//! +//! The MachInst infrastructure is the compiler backend, from CLIF +//! (ir::Function) to machine code. The purpose of this infrastructure is, at a +//! high level, to do instruction selection/lowering (to machine instructions), +//! register allocation, and then perform all the fixups to branches, constant +//! data references, etc., needed to actually generate machine code. +//! +//! The container for machine instructions, at various stages of construction, +//! is the `VCode` struct. We refer to a sequence of machine instructions organized +//! into basic blocks as "vcode". This is short for "virtual-register code", though +//! it's a bit of a misnomer because near the end of the pipeline, vcode has all +//! real registers. Nevertheless, the name is catchy and we like it. +//! +//! The compilation pipeline, from an `ir::Function` (already optimized as much as +//! you like by machine-independent optimization passes) onward, is as follows. +//! (N.B.: though we show the VCode separately at each stage, the passes +//! mutate the VCode in place; these are not separate copies of the code.) +//! +//! ```plain +//! +//! ir::Function (SSA IR, machine-independent opcodes) +//! | +//! | [lower] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - mostly virtual registers. +//! | - cond branches in two-target form. +//! | - branch targets are block indices. +//! | - in-memory constants held by insns, +//! | with unknown offsets. +//! | - critical edges (actually all edges) +//! | are split.) +//! | [regalloc] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all real registers. +//! | - new instruction sequence returned +//! | out-of-band in RegAllocResult. +//! | - instruction sequence has spills, +//! | reloads, and moves inserted. +//! | - other invariants same as above.) +//! | +//! | [preamble/postamble] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - stack-frame size known. +//! | - out-of-band instruction sequence +//! | has preamble prepended to entry +//! | block, and postamble injected before +//! | every return instruction. +//! | - all symbolic stack references to +//! | stackslots and spillslots are resolved +//! | to concrete FP-offset mem addresses.) +//! | [block/insn ordering] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! 
| - vcode.final_block_order is filled in. +//! | - new insn sequence from regalloc is +//! | placed back into vcode and block +//! | boundaries are updated.) +//! | [redundant branch/block +//! | removal] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all blocks that were just an +//! | unconditional branch are removed.) +//! | +//! | [branch finalization +//! | (fallthroughs)] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all branches are in lowered one- +//! | target form, but targets are still +//! | block indices.) +//! | +//! | [branch finalization +//! | (offsets)] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all branch offsets from start of +//! | function are known, and all branches +//! | have resolved-offset targets.) +//! | +//! | [MemArg finalization] +//! | +//! VCode<arch_backend::Inst> (machine instructions: +//! | - all MemArg references to the constant +//! | pool are replaced with offsets. +//! | - all constant-pool data is collected +//! | in the VCode.) +//! | +//! | [binary emission] +//! | +//! Vec<u8> (machine code!) +//! +//! ``` + +use crate::binemit::{CodeInfo, CodeOffset, StackMap}; +use crate::ir::condcodes::IntCC; +use crate::ir::{Function, SourceLoc, Type}; +use crate::isa::unwind::input as unwind_input; +use crate::result::CodegenResult; +use crate::settings::Flags; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::fmt::Debug; +use core::ops::Range; +use regalloc::RegUsageCollector; +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable, +}; +use smallvec::SmallVec; +use std::string::String; +use target_lexicon::Triple; + +pub mod lower; +pub use lower::*; +pub mod vcode; +pub use vcode::*; +pub mod compile; +pub use compile::*; +pub mod blockorder; +pub use blockorder::*; +pub mod abi; +pub use abi::*; +pub mod abi_impl; +pub use abi_impl::*; +pub mod buffer; +pub use buffer::*; +pub mod adapter; +pub use adapter::*; +pub mod helpers; +pub use helpers::*; +pub mod inst_common; +pub use inst_common::*; + +/// A machine instruction. +pub trait MachInst: Clone + Debug { + /// Return the registers referenced by this machine instruction along with + /// the modes of reference (use, def, modify). + fn get_regs(&self, collector: &mut RegUsageCollector); + + /// Map virtual registers to physical registers using the given virt->phys + /// maps corresponding to the program points prior to, and after, this instruction. + fn map_regs<RUM: RegUsageMapper>(&mut self, maps: &RUM); + + /// If this is a simple move, return the (source, destination) tuple of registers. + fn is_move(&self) -> Option<(Writable<Reg>, Reg)>; + + /// Is this a terminator (branch or ret)? If so, return its type + /// (ret/uncond/cond) and target if applicable. + fn is_term<'a>(&'a self) -> MachTerminator<'a>; + + /// Returns true if the instruction is an epilogue placeholder. + fn is_epilogue_placeholder(&self) -> bool; + + /// Should this instruction be included in the clobber-set? + fn is_included_in_clobbers(&self) -> bool { + true + } + + /// Generate a move. + fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self; + + /// Generate a constant into a reg. + fn gen_constant<F: FnMut(RegClass, Type) -> Writable<Reg>>( + to_reg: Writable<Reg>, + value: u64, + ty: Type, + alloc_tmp: F, + ) -> SmallVec<[Self; 4]>; + + /// Generate a zero-length no-op. 
+ fn gen_zero_len_nop() -> Self; + + /// Possibly operate on a value directly in a spill-slot rather than a + /// register. Useful if the machine has register-memory instruction forms + /// (e.g., add directly from or directly to memory), like x86. + fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option<Self>; + + /// Determine a register class to store the given Cranelift type. + /// May return an error if the type isn't supported by this backend. + fn rc_for_type(ty: Type) -> CodegenResult<RegClass>; + + /// Generate a jump to another target. Used during lowering of + /// control flow. + fn gen_jump(target: MachLabel) -> Self; + + /// Generate a NOP. The `preferred_size` parameter allows the caller to + /// request a NOP of that size, or as close to it as possible. The machine + /// backend may return a NOP whose binary encoding is smaller than the + /// preferred size, but must not return a NOP that is larger. However, + /// the instruction must have a nonzero size. + fn gen_nop(preferred_size: usize) -> Self; + + /// Get the register universe for this backend. + fn reg_universe(flags: &Flags) -> RealRegUniverse; + + /// Align a basic block offset (from start of function). By default, no + /// alignment occurs. + fn align_basic_block(offset: CodeOffset) -> CodeOffset { + offset + } + + /// What is the worst-case instruction size emitted by this instruction type? + fn worst_case_size() -> CodeOffset; + + /// What is the register class used for reference types (GC-observable pointers)? Can + /// be dependent on compilation flags. + fn ref_type_regclass(_flags: &Flags) -> RegClass; + + /// A label-use kind: a type that describes the types of label references that + /// can occur in an instruction. + type LabelUse: MachInstLabelUse; +} + +/// A descriptor of a label reference (use) in an instruction set. +pub trait MachInstLabelUse: Clone + Copy + Debug + Eq { + /// Required alignment for any veneer. Usually the required instruction + /// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86). + const ALIGN: CodeOffset; + + /// What is the maximum PC-relative range (positive)? E.g., if `1024`, a + /// label-reference fixup at offset `x` is valid if the label resolves to `x + /// + 1024`. + fn max_pos_range(self) -> CodeOffset; + /// What is the maximum PC-relative range (negative)? This is the absolute + /// value; i.e., if `1024`, then a label-reference fixup at offset `x` is + /// valid if the label resolves to `x - 1024`. + fn max_neg_range(self) -> CodeOffset; + /// What is the size of code-buffer slice this label-use needs to patch in + /// the label's value? + fn patch_size(self) -> CodeOffset; + /// Perform a code-patch, given the offset into the buffer of this label use + /// and the offset into the buffer of the label's definition. + /// It is guaranteed that, given `delta = offset - label_offset`, we will + /// have `offset >= -self.max_neg_range()` and `offset <= + /// self.max_pos_range()`. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset); + /// Can the label-use be patched to a veneer that supports a longer range? + /// Usually valid for jumps (a short-range jump can jump to a longer-range + /// jump), but not for e.g. constant pool references, because the constant + /// load would require different code (one more level of indirection). + fn supports_veneer(self) -> bool; + /// How many bytes are needed for a veneer? + fn veneer_size(self) -> CodeOffset; + /// Generate a veneer. 
The given code-buffer slice is `self.veneer_size()` + /// bytes long at offset `veneer_offset` in the buffer. The original + /// label-use will be patched to refer to this veneer's offset. A new + /// (offset, LabelUse) is returned that allows the veneer to use the actual + /// label. For veneers to work properly, it is expected that the new veneer + /// has a larger range; on most platforms this probably means either a + /// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that + /// stage, a jump that supports a full 32-bit range, for example. + fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self); +} + +/// Describes a block terminator (not call) in the vcode, when its branches +/// have not yet been finalized (so a branch may have two targets). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MachTerminator<'a> { + /// Not a terminator. + None, + /// A return instruction. + Ret, + /// An unconditional branch to another block. + Uncond(MachLabel), + /// A conditional branch to one of two other blocks. + Cond(MachLabel, MachLabel), + /// An indirect branch with known possible targets. + Indirect(&'a [MachLabel]), +} + +/// A trait describing the ability to encode a MachInst into binary machine code. +pub trait MachInstEmit: MachInst { + /// Persistent state carried across `emit` invocations. + type State: MachInstEmitState<Self>; + /// Constant information used in `emit` invocations. + type Info: MachInstEmitInfo; + /// Unwind info generator. + type UnwindInfo: UnwindInfoGenerator<Self>; + /// Emit the instruction. + fn emit(&self, code: &mut MachBuffer<Self>, info: &Self::Info, state: &mut Self::State); + /// Pretty-print the instruction. + fn pretty_print(&self, mb_rru: Option<&RealRegUniverse>, state: &mut Self::State) -> String; +} + +/// Constant information used to emit an instruction. +pub trait MachInstEmitInfo { + /// Return the target-independent settings used for the compilation of this + /// particular function. + fn flags(&self) -> &Flags; +} + +/// A trait describing the emission state carried between MachInsts when +/// emitting a function body. +pub trait MachInstEmitState<I: MachInst>: Default + Clone + Debug { + /// Create a new emission state given the ABI object. + fn new(abi: &dyn ABICallee<I = I>) -> Self; + /// Update the emission state before emitting an instruction that is a + /// safepoint. + fn pre_safepoint(&mut self, _stack_map: StackMap) {} + /// Update the emission state to indicate instructions are associated with a + /// particular SourceLoc. + fn pre_sourceloc(&mut self, _srcloc: SourceLoc) {} +} + +/// The result of a `MachBackend::compile_function()` call. Contains machine +/// code (as bytes) and a disassembly, if requested. +pub struct MachCompileResult { + /// Machine code. + pub buffer: MachBufferFinalized, + /// Size of stack frame, in bytes. + pub frame_size: u32, + /// Disassembly, if requested. + pub disasm: Option<String>, + /// Unwind info. + pub unwind_info: Option<unwind_input::UnwindInfo<Reg>>, +} + +impl MachCompileResult { + /// Get a `CodeInfo` describing section sizes from this compilation result. + pub fn code_info(&self) -> CodeInfo { + let code_size = self.buffer.total_size(); + CodeInfo { + code_size, + jumptables_size: 0, + rodata_size: 0, + total_size: code_size, + } + } +} + +/// Top-level machine backend trait, which wraps all monomorphized code and +/// allows a virtual call from the machine-independent `Function::compile()`. 
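+///
+/// A hedged sketch of how a caller might drive a backend through this trait
+/// object (`backend` is assumed to be a `&dyn MachBackend` obtained from the
+/// target ISA; not a doctest):
+///
+/// ```ignore
+/// let result = backend.compile_function(&func, /* want_disasm = */ false)?;
+/// let code_info = result.code_info();
+/// println!("code size: {} bytes, frame size: {} bytes",
+///          code_info.total_size, result.frame_size);
+/// ```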
+pub trait MachBackend { + /// Compile the given function. + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult<MachCompileResult>; + + /// Return flags for this backend. + fn flags(&self) -> &Flags; + + /// Return triple for this backend. + fn triple(&self) -> Triple; + + /// Return name for this backend. + fn name(&self) -> &'static str; + + /// Return the register universe for this backend. + fn reg_universe(&self) -> &RealRegUniverse; + + /// Machine-specific condcode info needed by TargetIsa. + /// Condition that will be true when an IaddIfcout overflows. + fn unsigned_add_overflow_condition(&self) -> IntCC; + + /// Machine-specific condcode info needed by TargetIsa. + /// Condition that will be true when an IsubIfcout overflows. + fn unsigned_sub_overflow_condition(&self) -> IntCC; + + /// Produces unwind info based on backend results. + #[cfg(feature = "unwind")] + fn emit_unwind_info( + &self, + _result: &MachCompileResult, + _kind: UnwindInfoKind, + ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> { + // By default, an backend cannot produce unwind info. + Ok(None) + } + + /// Machine-specific condcode info needed by TargetIsa. + /// Creates a new System V Common Information Entry for the ISA. + #[cfg(feature = "unwind")] + fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> { + // By default, an ISA cannot create a System V CIE + None + } +} + +/// Expected unwind info type. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum UnwindInfoKind { + /// No unwind info. + None, + /// SystemV CIE/FDE unwind info. + #[cfg(feature = "unwind")] + SystemV, + /// Windows X64 Unwind info + #[cfg(feature = "unwind")] + Windows, +} + +/// Input data for UnwindInfoGenerator. +pub struct UnwindInfoContext<'a, Inst: MachInstEmit> { + /// Function instructions. + pub insts: &'a [Inst], + /// Instruction layout: end offsets + pub insts_layout: &'a [CodeOffset], + /// Length of the function. + pub len: CodeOffset, + /// Prologue range. + pub prologue: Range<u32>, + /// Epilogue ranges. + pub epilogues: &'a [Range<u32>], +} + +/// UnwindInfo generator/helper. +pub trait UnwindInfoGenerator<I: MachInstEmit> { + /// Creates unwind info based on function signature and + /// emitted instructions. + fn create_unwind_info( + context: UnwindInfoContext<I>, + ) -> CodegenResult<Option<unwind_input::UnwindInfo<Reg>>>; +} diff --git a/third_party/rust/cranelift-codegen/src/machinst/vcode.rs b/third_party/rust/cranelift-codegen/src/machinst/vcode.rs new file mode 100644 index 0000000000..c57f018e35 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/machinst/vcode.rs @@ -0,0 +1,895 @@ +//! This implements the VCode container: a CFG of Insts that have been lowered. +//! +//! VCode is virtual-register code. An instruction in VCode is almost a machine +//! instruction; however, its register slots can refer to virtual registers in +//! addition to real machine registers. +//! +//! VCode is structured with traditional basic blocks, and +//! each block must be terminated by an unconditional branch (one target), a +//! conditional branch (two targets), or a return (no targets). Note that this +//! slightly differs from the machine code of most ISAs: in most ISAs, a +//! conditional branch has one target (and the not-taken case falls through). +//! However, we expect that machine backends will elide branches to the following +//! block (i.e., zero-offset jumps), and will be able to codegen a branch-cond / +//! 
branch-uncond pair if *both* targets are not fallthrough. This allows us to +//! play with layout prior to final binary emission, as well, if we want. +//! +//! See the main module comment in `mod.rs` for more details on the VCode-based +//! backend pipeline. + +use crate::ir::{self, types, Constant, ConstantData, SourceLoc}; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use regalloc::Function as RegallocFunction; +use regalloc::Set as RegallocSet; +use regalloc::{ + BlockIx, InstIx, PrettyPrint, Range, RegAllocResult, RegClass, RegUsageCollector, + RegUsageMapper, SpillSlot, StackmapRequestInfo, +}; + +use alloc::boxed::Box; +use alloc::{borrow::Cow, vec::Vec}; +use cranelift_entity::{entity_impl, Keys, PrimaryMap}; +use std::cell::RefCell; +use std::collections::HashMap; +use std::fmt; +use std::iter; +use std::string::String; + +/// Index referring to an instruction in VCode. +pub type InsnIndex = u32; +/// Index referring to a basic block in VCode. +pub type BlockIndex = u32; +/// Range of an instructions in VCode. +pub type InsnRange = core::ops::Range<InsnIndex>; + +/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be +/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`. +pub trait VCodeInst: MachInst + MachInstEmit {} +impl<I: MachInst + MachInstEmit> VCodeInst for I {} + +/// A function in "VCode" (virtualized-register code) form, after lowering. +/// This is essentially a standard CFG of basic blocks, where each basic block +/// consists of lowered instructions produced by the machine-specific backend. +pub struct VCode<I: VCodeInst> { + /// Function liveins. + liveins: RegallocSet<RealReg>, + + /// Function liveouts. + liveouts: RegallocSet<RealReg>, + + /// VReg IR-level types. + vreg_types: Vec<Type>, + + /// Do we have any ref values among our vregs? + have_ref_values: bool, + + /// Lowered machine instructions in order corresponding to the original IR. + insts: Vec<I>, + + /// Source locations for each instruction. (`SourceLoc` is a `u32`, so it is + /// reasonable to keep one of these per instruction.) + srclocs: Vec<SourceLoc>, + + /// Entry block. + entry: BlockIndex, + + /// Block instruction indices. + block_ranges: Vec<(InsnIndex, InsnIndex)>, + + /// Block successors: index range in the successor-list below. + block_succ_range: Vec<(usize, usize)>, + + /// Block successor lists, concatenated into one Vec. The `block_succ_range` + /// list of tuples above gives (start, end) ranges within this list that + /// correspond to each basic block's successors. + block_succs: Vec<BlockIx>, + + /// Block-order information. + block_order: BlockLoweringOrder, + + /// ABI object. + abi: Box<dyn ABICallee<I = I>>, + + /// Constant information used during code emission. This should be + /// immutable across function compilations within the same module. + emit_info: I::Info, + + /// Safepoint instruction indices. Filled in post-regalloc. (Prior to + /// regalloc, the safepoint instructions are listed in the separate + /// `StackmapRequestInfo` held separate from the `VCode`.) + safepoint_insns: Vec<InsnIndex>, + + /// For each safepoint entry in `safepoint_insns`, a list of `SpillSlot`s. + /// These are used to generate actual stack maps at emission. Filled in + /// post-regalloc. + safepoint_slots: Vec<Vec<SpillSlot>>, + + /// Ranges for prologue and epilogue instructions. 
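+    /// Filled in by `replace_insns_from_regalloc`: one range covering the
+    /// prologue at the start of the entry block, plus one range per inserted
+    /// epilogue (one for each return site). Consumed by `unwind_info()`.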
+ prologue_epilogue_ranges: Option<(InsnRange, Box<[InsnRange]>)>, + + /// Instruction end offsets + insts_layout: RefCell<(Vec<u32>, u32)>, + + /// Constants. + constants: VCodeConstants, +} + +/// A builder for a VCode function body. This builder is designed for the +/// lowering approach that we take: we traverse basic blocks in forward +/// (original IR) order, but within each basic block, we generate code from +/// bottom to top; and within each IR instruction that we visit in this reverse +/// order, we emit machine instructions in *forward* order again. +/// +/// Hence, to produce the final instructions in proper order, we perform two +/// swaps. First, the machine instructions (`I` instances) are produced in +/// forward order for an individual IR instruction. Then these are *reversed* +/// and concatenated to `bb_insns` at the end of the IR instruction lowering. +/// The `bb_insns` vec will thus contain all machine instructions for a basic +/// block, in reverse order. Finally, when we're done with a basic block, we +/// reverse the whole block's vec of instructions again, and concatenate onto +/// the VCode's insts. +pub struct VCodeBuilder<I: VCodeInst> { + /// In-progress VCode. + vcode: VCode<I>, + + /// In-progress stack map-request info. + stack_map_info: StackmapRequestInfo, + + /// Index of the last block-start in the vcode. + block_start: InsnIndex, + + /// Start of succs for the current block in the concatenated succs list. + succ_start: usize, + + /// Current source location. + cur_srcloc: SourceLoc, +} + +impl<I: VCodeInst> VCodeBuilder<I> { + /// Create a new VCodeBuilder. + pub fn new( + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + constants: VCodeConstants, + ) -> VCodeBuilder<I> { + let reftype_class = I::ref_type_regclass(abi.flags()); + let vcode = VCode::new(abi, emit_info, block_order, constants); + let stack_map_info = StackmapRequestInfo { + reftype_class, + reftyped_vregs: vec![], + safepoint_insns: vec![], + }; + + VCodeBuilder { + vcode, + stack_map_info, + block_start: 0, + succ_start: 0, + cur_srcloc: SourceLoc::default(), + } + } + + /// Access the ABI object. + pub fn abi(&mut self) -> &mut dyn ABICallee<I = I> { + &mut *self.vcode.abi + } + + /// Access to the BlockLoweringOrder object. + pub fn block_order(&self) -> &BlockLoweringOrder { + &self.vcode.block_order + } + + /// Set the type of a VReg. + pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) { + if self.vcode.vreg_types.len() <= vreg.get_index() { + self.vcode + .vreg_types + .resize(vreg.get_index() + 1, ir::types::I8); + } + self.vcode.vreg_types[vreg.get_index()] = ty; + if is_reftype(ty) { + self.stack_map_info.reftyped_vregs.push(vreg); + self.vcode.have_ref_values = true; + } + } + + /// Are there any reference-typed values at all among the vregs? + pub fn have_ref_values(&self) -> bool { + self.vcode.have_ref_values() + } + + /// Set the current block as the entry block. + pub fn set_entry(&mut self, block: BlockIndex) { + self.vcode.entry = block; + } + + /// End the current basic block. Must be called after emitting vcode insts + /// for IR insts and prior to ending the function (building the VCode). + pub fn end_bb(&mut self) { + let start_idx = self.block_start; + let end_idx = self.vcode.insts.len() as InsnIndex; + self.block_start = end_idx; + // Add the instruction index range to the list of blocks. + self.vcode.block_ranges.push((start_idx, end_idx)); + // End the successors list. 
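+        // (The successor edges themselves were appended to `block_succs` by
+        // `push()` as terminator instructions were emitted; here we only
+        // record which slice of that list belongs to this block.)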
+ let succ_end = self.vcode.block_succs.len(); + self.vcode + .block_succ_range + .push((self.succ_start, succ_end)); + self.succ_start = succ_end; + } + + /// Push an instruction for the current BB and current IR inst within the BB. + pub fn push(&mut self, insn: I, is_safepoint: bool) { + match insn.is_term() { + MachTerminator::None | MachTerminator::Ret => {} + MachTerminator::Uncond(target) => { + self.vcode.block_succs.push(BlockIx::new(target.get())); + } + MachTerminator::Cond(true_branch, false_branch) => { + self.vcode.block_succs.push(BlockIx::new(true_branch.get())); + self.vcode + .block_succs + .push(BlockIx::new(false_branch.get())); + } + MachTerminator::Indirect(targets) => { + for target in targets { + self.vcode.block_succs.push(BlockIx::new(target.get())); + } + } + } + self.vcode.insts.push(insn); + self.vcode.srclocs.push(self.cur_srcloc); + if is_safepoint { + self.stack_map_info + .safepoint_insns + .push(InstIx::new((self.vcode.insts.len() - 1) as u32)); + } + } + + /// Get the current source location. + pub fn get_srcloc(&self) -> SourceLoc { + self.cur_srcloc + } + + /// Set the current source location. + pub fn set_srcloc(&mut self, srcloc: SourceLoc) { + self.cur_srcloc = srcloc; + } + + /// Access the constants. + pub fn constants(&mut self) -> &mut VCodeConstants { + &mut self.vcode.constants + } + + /// Build the final VCode, returning the vcode itself as well as auxiliary + /// information, such as the stack map request information. + pub fn build(self) -> (VCode<I>, StackmapRequestInfo) { + // TODO: come up with an abstraction for "vcode and auxiliary data". The + // auxiliary data needs to be separate from the vcode so that it can be + // referenced as the vcode is mutated (e.g. by the register allocator). + (self.vcode, self.stack_map_info) + } +} + +fn is_redundant_move<I: VCodeInst>(insn: &I) -> bool { + if let Some((to, from)) = insn.is_move() { + to.to_reg() == from + } else { + false + } +} + +/// Is this type a reference type? +fn is_reftype(ty: Type) -> bool { + ty == types::R64 || ty == types::R32 +} + +impl<I: VCodeInst> VCode<I> { + /// New empty VCode. + fn new( + abi: Box<dyn ABICallee<I = I>>, + emit_info: I::Info, + block_order: BlockLoweringOrder, + constants: VCodeConstants, + ) -> VCode<I> { + VCode { + liveins: abi.liveins(), + liveouts: abi.liveouts(), + vreg_types: vec![], + have_ref_values: false, + insts: vec![], + srclocs: vec![], + entry: 0, + block_ranges: vec![], + block_succ_range: vec![], + block_succs: vec![], + block_order, + abi, + emit_info, + safepoint_insns: vec![], + safepoint_slots: vec![], + prologue_epilogue_ranges: None, + insts_layout: RefCell::new((vec![], 0)), + constants, + } + } + + /// Returns the flags controlling this function's compilation. + pub fn flags(&self) -> &settings::Flags { + self.abi.flags() + } + + /// Get the IR-level type of a VReg. + pub fn vreg_type(&self, vreg: VirtualReg) -> Type { + self.vreg_types[vreg.get_index()] + } + + /// Are there any reference-typed values at all among the vregs? + pub fn have_ref_values(&self) -> bool { + self.have_ref_values + } + + /// Get the entry block. + pub fn entry(&self) -> BlockIndex { + self.entry + } + + /// Get the number of blocks. Block indices will be in the range `0 .. + /// (self.num_blocks() - 1)`. + pub fn num_blocks(&self) -> usize { + self.block_ranges.len() + } + + /// Stack frame size for the full function's body. + pub fn frame_size(&self) -> u32 { + self.abi.frame_size() + } + + /// Inbound stack-args size. 
+ pub fn stack_args_size(&self) -> u32 { + self.abi.stack_args_size() + } + + /// Get the successors for a block. + pub fn succs(&self, block: BlockIndex) -> &[BlockIx] { + let (start, end) = self.block_succ_range[block as usize]; + &self.block_succs[start..end] + } + + /// Take the results of register allocation, with a sequence of + /// instructions including spliced fill/reload/move instructions, and replace + /// the VCode with them. + pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult<Self>) { + // Record the spillslot count and clobbered registers for the ABI/stack + // setup code. + self.abi.set_num_spillslots(result.num_spill_slots as usize); + self.abi + .set_clobbered(result.clobbered_registers.map(|r| Writable::from_reg(*r))); + + let mut final_insns = vec![]; + let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; + let mut final_srclocs = vec![]; + let mut final_safepoint_insns = vec![]; + let mut safept_idx = 0; + + let mut prologue_start = None; + let mut prologue_end = None; + let mut epilogue_islands = vec![]; + + assert!(result.target_map.elems().len() == self.num_blocks()); + for block in 0..self.num_blocks() { + let start = result.target_map.elems()[block].get() as usize; + let end = if block == self.num_blocks() - 1 { + result.insns.len() + } else { + result.target_map.elems()[block + 1].get() as usize + }; + let block = block as BlockIndex; + let final_start = final_insns.len() as InsnIndex; + + if block == self.entry { + prologue_start = Some(final_insns.len() as InsnIndex); + // Start with the prologue. + let prologue = self.abi.gen_prologue(); + let len = prologue.len(); + final_insns.extend(prologue.into_iter()); + final_srclocs.extend(iter::repeat(SourceLoc::default()).take(len)); + prologue_end = Some(final_insns.len() as InsnIndex); + } + + for i in start..end { + let insn = &result.insns[i]; + + // Elide redundant moves at this point (we only know what is + // redundant once registers are allocated). + if is_redundant_move(insn) { + continue; + } + + // Is there a srcloc associated with this insn? Look it up based on original + // instruction index (if new insn corresponds to some original insn, i.e., is not + // an inserted load/spill/move). + let orig_iix = result.orig_insn_map[InstIx::new(i as u32)]; + let srcloc = if orig_iix.is_invalid() { + SourceLoc::default() + } else { + self.srclocs[orig_iix.get() as usize] + }; + + // Whenever encountering a return instruction, replace it + // with the epilogue. + let is_ret = insn.is_term() == MachTerminator::Ret; + if is_ret { + let epilogue_start = final_insns.len() as InsnIndex; + let epilogue = self.abi.gen_epilogue(); + let len = epilogue.len(); + final_insns.extend(epilogue.into_iter()); + final_srclocs.extend(iter::repeat(srcloc).take(len)); + epilogue_islands.push(epilogue_start..final_insns.len() as InsnIndex); + } else { + final_insns.push(insn.clone()); + final_srclocs.push(srcloc); + } + + // Was this instruction a safepoint instruction? Add its final + // index to the safepoint insn-index list if so. 
+ if safept_idx < result.new_safepoint_insns.len() + && (result.new_safepoint_insns[safept_idx].get() as usize) == i + { + let idx = final_insns.len() - 1; + final_safepoint_insns.push(idx as InsnIndex); + safept_idx += 1; + } + } + + let final_end = final_insns.len() as InsnIndex; + final_block_ranges[block as usize] = (final_start, final_end); + } + + debug_assert!(final_insns.len() == final_srclocs.len()); + + self.insts = final_insns; + self.srclocs = final_srclocs; + self.block_ranges = final_block_ranges; + self.safepoint_insns = final_safepoint_insns; + + // Save safepoint slot-lists. These will be passed to the `EmitState` + // for the machine backend during emission so that it can do + // target-specific translations of slot numbers to stack offsets. + self.safepoint_slots = result.stackmaps; + + self.prologue_epilogue_ranges = Some(( + prologue_start.unwrap()..prologue_end.unwrap(), + epilogue_islands.into_boxed_slice(), + )); + } + + /// Emit the instructions to a `MachBuffer`, containing fixed-up code and external + /// reloc/trap/etc. records ready for use. + pub fn emit(&self) -> MachBuffer<I> + where + I: MachInstEmit, + { + let _tt = timing::vcode_emit(); + let mut buffer = MachBuffer::new(); + let mut state = I::State::new(&*self.abi); + + // The first M MachLabels are reserved for block indices, the next N MachLabels for + // constants. + buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); + buffer.reserve_labels_for_constants(&self.constants); + + let mut insts_layout = vec![0; self.insts.len()]; + + let mut safepoint_idx = 0; + let mut cur_srcloc = None; + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let new_offset = I::align_basic_block(buffer.cur_offset()); + while new_offset > buffer.cur_offset() { + // Pad with NOPs up to the aligned block offset. + let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize); + nop.emit(&mut buffer, &self.emit_info, &mut Default::default()); + } + assert_eq!(buffer.cur_offset(), new_offset); + + let (start, end) = self.block_ranges[block as usize]; + buffer.bind_label(MachLabel::from_block(block)); + for iix in start..end { + let srcloc = self.srclocs[iix as usize]; + if cur_srcloc != Some(srcloc) { + if cur_srcloc.is_some() { + buffer.end_srcloc(); + } + buffer.start_srcloc(srcloc); + cur_srcloc = Some(srcloc); + } + state.pre_sourceloc(cur_srcloc.unwrap_or(SourceLoc::default())); + + if safepoint_idx < self.safepoint_insns.len() + && self.safepoint_insns[safepoint_idx] == iix + { + if self.safepoint_slots[safepoint_idx].len() > 0 { + let stack_map = self.abi.spillslots_to_stack_map( + &self.safepoint_slots[safepoint_idx][..], + &state, + ); + state.pre_safepoint(stack_map); + } + safepoint_idx += 1; + } + + self.insts[iix as usize].emit(&mut buffer, &self.emit_info, &mut state); + + insts_layout[iix as usize] = buffer.cur_offset(); + } + + if cur_srcloc.is_some() { + buffer.end_srcloc(); + cur_srcloc = None; + } + + // Do we need an island? Get the worst-case size of the next BB and see if, having + // emitted that many bytes, we will be beyond the deadline. + if block < (self.num_blocks() - 1) as BlockIndex { + let next_block = block + 1; + let next_block_range = self.block_ranges[next_block as usize]; + let next_block_size = next_block_range.1 - next_block_range.0; + let worst_case_next_bb = I::worst_case_size() * next_block_size; + if buffer.island_needed(worst_case_next_bb) { + buffer.emit_island(); + } + } + } + + // Emit the constants used by the function. 
+ for (constant, data) in self.constants.iter() { + let label = buffer.get_label_for_constant(constant); + buffer.defer_constant(label, data.alignment(), data.as_slice(), u32::max_value()); + } + + *self.insts_layout.borrow_mut() = (insts_layout, buffer.cur_offset()); + + buffer + } + + /// Generates unwind info. + pub fn unwind_info( + &self, + ) -> crate::result::CodegenResult<Option<crate::isa::unwind::input::UnwindInfo<Reg>>> { + let layout = &self.insts_layout.borrow(); + let (prologue, epilogues) = self.prologue_epilogue_ranges.as_ref().unwrap(); + let context = UnwindInfoContext { + insts: &self.insts, + insts_layout: &layout.0, + len: layout.1, + prologue: prologue.clone(), + epilogues, + }; + I::UnwindInfo::create_unwind_info(context) + } + + /// Get the IR block for a BlockIndex, if one exists. + pub fn bindex_to_bb(&self, block: BlockIndex) -> Option<ir::Block> { + self.block_order.lowered_order()[block as usize].orig_block() + } +} + +impl<I: VCodeInst> RegallocFunction for VCode<I> { + type Inst = I; + + fn insns(&self) -> &[I] { + &self.insts[..] + } + + fn insns_mut(&mut self) -> &mut [I] { + &mut self.insts[..] + } + + fn get_insn(&self, insn: InstIx) -> &I { + &self.insts[insn.get() as usize] + } + + fn get_insn_mut(&mut self, insn: InstIx) -> &mut I { + &mut self.insts[insn.get() as usize] + } + + fn blocks(&self) -> Range<BlockIx> { + Range::new(BlockIx::new(0), self.block_ranges.len()) + } + + fn entry_block(&self) -> BlockIx { + BlockIx::new(self.entry) + } + + fn block_insns(&self, block: BlockIx) -> Range<InstIx> { + let (start, end) = self.block_ranges[block.get() as usize]; + Range::new(InstIx::new(start), (end - start) as usize) + } + + fn block_succs(&self, block: BlockIx) -> Cow<[BlockIx]> { + let (start, end) = self.block_succ_range[block.get() as usize]; + Cow::Borrowed(&self.block_succs[start..end]) + } + + fn is_ret(&self, insn: InstIx) -> bool { + match self.insts[insn.get() as usize].is_term() { + MachTerminator::Ret => true, + _ => false, + } + } + + fn is_included_in_clobbers(&self, insn: &I) -> bool { + insn.is_included_in_clobbers() + } + + fn get_regs(insn: &I, collector: &mut RegUsageCollector) { + insn.get_regs(collector) + } + + fn map_regs<RUM: RegUsageMapper>(insn: &mut I, mapper: &RUM) { + insn.map_regs(mapper); + } + + fn is_move(&self, insn: &I) -> Option<(Writable<Reg>, Reg)> { + insn.is_move() + } + + fn get_num_vregs(&self) -> usize { + self.vreg_types.len() + } + + fn get_spillslot_size(&self, regclass: RegClass, vreg: VirtualReg) -> u32 { + let ty = self.vreg_type(vreg); + self.abi.get_spillslot_size(regclass, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, vreg: Option<VirtualReg>) -> I { + let ty = vreg.map(|v| self.vreg_type(v)); + self.abi.gen_spill(to_slot, from_reg, ty) + } + + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + vreg: Option<VirtualReg>, + ) -> I { + let ty = vreg.map(|v| self.vreg_type(v)); + self.abi.gen_reload(to_reg, from_slot, ty) + } + + fn gen_move(&self, to_reg: Writable<RealReg>, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + I::gen_move(to_reg.map(|r| r.to_reg()), from_reg.to_reg(), ty) + } + + fn gen_zero_len_nop(&self) -> I { + I::gen_zero_len_nop() + } + + fn maybe_direct_reload(&self, insn: &I, reg: VirtualReg, slot: SpillSlot) -> Option<I> { + insn.maybe_direct_reload(reg, slot) + } + + fn func_liveins(&self) -> RegallocSet<RealReg> { + self.liveins.clone() + } + + fn func_liveouts(&self) -> RegallocSet<RealReg> { + 
self.liveouts.clone() + } +} + +impl<I: VCodeInst> fmt::Debug for VCode<I> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "VCode_Debug {{")?; + writeln!(f, " Entry block: {}", self.entry)?; + + for block in 0..self.num_blocks() { + writeln!(f, "Block {}:", block,)?; + for succ in self.succs(block as BlockIndex) { + writeln!(f, " (successor: Block {})", succ.get())?; + } + let (start, end) = self.block_ranges[block]; + writeln!(f, " (instruction range: {} .. {})", start, end)?; + for inst in start..end { + writeln!(f, " Inst {}: {:?}", inst, self.insts[inst as usize])?; + } + } + + writeln!(f, "}}")?; + Ok(()) + } +} + +/// Pretty-printing with `RealRegUniverse` context. +impl<I: VCodeInst> PrettyPrint for VCode<I> { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + use std::fmt::Write; + + let mut s = String::new(); + write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); + write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); + + let mut state = Default::default(); + let mut safepoint_idx = 0; + for i in 0..self.num_blocks() { + let block = i as BlockIndex; + + write!(&mut s, "Block {}:\n", block).unwrap(); + if let Some(bb) = self.bindex_to_bb(block) { + write!(&mut s, " (original IR block: {})\n", bb).unwrap(); + } + for succ in self.succs(block) { + write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap(); + } + let (start, end) = self.block_ranges[block as usize]; + write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); + for inst in start..end { + if safepoint_idx < self.safepoint_insns.len() + && self.safepoint_insns[safepoint_idx] == inst + { + write!( + &mut s, + " (safepoint: slots {:?} with EmitState {:?})\n", + self.safepoint_slots[safepoint_idx], state, + ) + .unwrap(); + safepoint_idx += 1; + } + write!( + &mut s, + " Inst {}: {}\n", + inst, + self.insts[inst as usize].pretty_print(mb_rru, &mut state) + ) + .unwrap(); + } + } + + write!(&mut s, "}}}}\n").unwrap(); + + s + } +} + +/// This structure tracks the large constants used in VCode that will be emitted separately by the +/// [MachBuffer]. +/// +/// First, during the lowering phase, constants are inserted using +/// [VCodeConstants.insert]; an intermediate handle, [VCodeConstant], tracks what constants are +/// used in this phase. Some deduplication is performed, when possible, as constant +/// values are inserted. +/// +/// Secondly, during the emission phase, the [MachBuffer] assigns [MachLabel]s for each of the +/// constants so that instructions can refer to the value's memory location. The [MachBuffer] +/// then writes the constant values to the buffer. +#[derive(Default)] +pub struct VCodeConstants { + constants: PrimaryMap<VCodeConstant, VCodeConstantData>, + pool_uses: HashMap<Constant, VCodeConstant>, + well_known_uses: HashMap<*const [u8], VCodeConstant>, +} +impl VCodeConstants { + /// Initialize the structure with the expected number of constants. + pub fn with_capacity(expected_num_constants: usize) -> Self { + Self { + constants: PrimaryMap::with_capacity(expected_num_constants), + pool_uses: HashMap::with_capacity(expected_num_constants), + well_known_uses: HashMap::new(), + } + } + + /// Insert a constant; using this method indicates that a constant value will be used and thus + /// will be emitted to the `MachBuffer`. The current implementation can deduplicate constants + /// that are [VCodeConstantData::Pool] or [VCodeConstantData::WellKnown] but not + /// [VCodeConstantData::Generated]. 
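+    ///
+    /// A hedged illustration of that deduplication (`BYTES` is a hypothetical
+    /// well-known constant and `constants` an existing `VCodeConstants`;
+    /// marked `ignore` since this is not a standalone doctest):
+    ///
+    /// ```ignore
+    /// static BYTES: &[u8] = &[0; 16];
+    /// let a = constants.insert(VCodeConstantData::WellKnown(BYTES));
+    /// let b = constants.insert(VCodeConstantData::WellKnown(BYTES));
+    /// assert_eq!(a, b); // identical well-known data maps to one handle
+    /// ```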
+ pub fn insert(&mut self, data: VCodeConstantData) -> VCodeConstant { + match data { + VCodeConstantData::Generated(_) => self.constants.push(data), + VCodeConstantData::Pool(constant, _) => match self.pool_uses.get(&constant) { + None => { + let vcode_constant = self.constants.push(data); + self.pool_uses.insert(constant, vcode_constant); + vcode_constant + } + Some(&vcode_constant) => vcode_constant, + }, + VCodeConstantData::WellKnown(data_ref) => { + match self.well_known_uses.get(&(data_ref as *const [u8])) { + None => { + let vcode_constant = self.constants.push(data); + self.well_known_uses + .insert(data_ref as *const [u8], vcode_constant); + vcode_constant + } + Some(&vcode_constant) => vcode_constant, + } + } + } + } + + /// Retrieve a byte slice for the given [VCodeConstant], if available. + pub fn get(&self, constant: VCodeConstant) -> Option<&[u8]> { + self.constants.get(constant).map(|d| d.as_slice()) + } + + /// Return the number of constants inserted. + pub fn len(&self) -> usize { + self.constants.len() + } + + /// Iterate over the [VCodeConstant] keys inserted in this structure. + pub fn keys(&self) -> Keys<VCodeConstant> { + self.constants.keys() + } + + /// Iterate over the [VCodeConstant] keys and the data (as a byte slice) inserted in this + /// structure. + pub fn iter(&self) -> impl Iterator<Item = (VCodeConstant, &VCodeConstantData)> { + self.constants.iter() + } +} + +/// A use of a constant by one or more VCode instructions; see [VCodeConstants]. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct VCodeConstant(u32); +entity_impl!(VCodeConstant); + +/// Identify the different types of constant that can be inserted into [VCodeConstants]. Tracking +/// these separately instead of as raw byte buffers allows us to avoid some duplication. +pub enum VCodeConstantData { + /// A constant already present in the Cranelift IR + /// [ConstantPool](crate::ir::constant::ConstantPool). + Pool(Constant, ConstantData), + /// A reference to a well-known constant value that is statically encoded within the compiler. + WellKnown(&'static [u8]), + /// A constant value generated during lowering; the value may depend on the instruction context + /// which makes it difficult to de-duplicate--if possible, use other variants. + Generated(ConstantData), +} +impl VCodeConstantData { + /// Retrieve the constant data as a byte slice. + pub fn as_slice(&self) -> &[u8] { + match self { + VCodeConstantData::Pool(_, d) | VCodeConstantData::Generated(d) => d.as_slice(), + VCodeConstantData::WellKnown(d) => d, + } + } + + /// Calculate the alignment of the constant data. + pub fn alignment(&self) -> u32 { + if self.as_slice().len() <= 8 { + 8 + } else { + 16 + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::mem::size_of; + + #[test] + fn size_of_constant_structs() { + assert_eq!(size_of::<Constant>(), 4); + assert_eq!(size_of::<VCodeConstant>(), 4); + assert_eq!(size_of::<ConstantData>(), 24); + assert_eq!(size_of::<VCodeConstantData>(), 32); + assert_eq!( + size_of::<PrimaryMap<VCodeConstant, VCodeConstantData>>(), + 24 + ); + // TODO The VCodeConstants structure's memory size could be further optimized. + // With certain versions of Rust, each `HashMap` in `VCodeConstants` occupied at + // least 48 bytes, making an empty `VCodeConstants` cost 120 bytes. + } +} |