author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen/src/binemit
parent | Initial commit. (diff)
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen/src/binemit')
5 files changed, 1129 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/binemit/memorysink.rs b/third_party/rust/cranelift-codegen/src/binemit/memorysink.rs new file mode 100644 index 0000000000..d50d1c10eb --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/binemit/memorysink.rs @@ -0,0 +1,232 @@ +//! Code sink that writes binary machine code into contiguous memory. +//! +//! The `CodeSink` trait is the most general way of extracting binary machine code from Cranelift, +//! and it is implemented by things like the `test binemit` file test driver to generate +//! hexadecimal machine code. The `CodeSink` has some undesirable performance properties because of +//! the dual abstraction: `TargetIsa` is a trait object implemented by each supported ISA, so it +//! can't have any generic functions that could be specialized for each `CodeSink` implementation. +//! This results in many virtual function callbacks (one per `put*` call) when +//! `TargetIsa::emit_inst()` is used. +//! +//! The `MemoryCodeSink` type fixes the performance problem because it is a type known to +//! `TargetIsa` so it can specialize its machine code generation for the type. The trade-off is +//! that a `MemoryCodeSink` will always write binary machine code to raw memory. It forwards any +//! relocations to a `RelocSink` trait object. Relocations are less frequent than the +//! `CodeSink::put*` methods, so the performance impact of the virtual callbacks is less severe. +use super::{Addend, CodeInfo, CodeOffset, CodeSink, Reloc}; +use crate::binemit::stack_map::StackMap; +use crate::ir::entities::Value; +use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode}; +use crate::isa::TargetIsa; +use core::ptr::write_unaligned; + +/// A `CodeSink` that writes binary machine code directly into memory. +/// +/// A `MemoryCodeSink` object should be used when emitting a Cranelift IR function into executable +/// memory. It writes machine code directly to a raw pointer without any bounds checking, so make +/// sure to allocate enough memory for the whole function. The number of bytes required is returned +/// by the `Context::compile()` function. +/// +/// Any relocations in the function are forwarded to the `RelocSink` trait object. +/// +/// Note that `MemoryCodeSink` writes multi-byte values in the native byte order of the host. This +/// is not the right thing to do for cross compilation. +pub struct MemoryCodeSink<'a> { + /// Pointer to start of sink's preallocated memory. + data: *mut u8, + /// Offset is isize because its major consumer needs it in that form. + offset: isize, + relocs: &'a mut dyn RelocSink, + traps: &'a mut dyn TrapSink, + stack_maps: &'a mut dyn StackMapSink, + /// Information about the generated code and read-only data. + pub info: CodeInfo, +} + +impl<'a> MemoryCodeSink<'a> { + /// Create a new memory code sink that writes a function to the memory pointed to by `data`. + /// + /// # Safety + /// + /// This function is unsafe since `MemoryCodeSink` does not perform bounds checking on the + /// memory buffer, and it can't guarantee that the `data` pointer is valid. + pub unsafe fn new( + data: *mut u8, + relocs: &'a mut dyn RelocSink, + traps: &'a mut dyn TrapSink, + stack_maps: &'a mut dyn StackMapSink, + ) -> Self { + Self { + data, + offset: 0, + info: CodeInfo { + code_size: 0, + jumptables_size: 0, + rodata_size: 0, + total_size: 0, + }, + relocs, + traps, + stack_maps, + } + } +} + +/// A trait for receiving relocations for code that is emitted directly into memory. 
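Before the `RelocSink` trait that follows, here is a minimal sketch (not part of the diff) of wiring a `MemoryCodeSink` to a caller-provided buffer using the null sinks this module re-exports; the helper name and the assumption that the buffer was sized by an earlier compile step are mine.

```rust
use cranelift_codegen::binemit::{
    MemoryCodeSink, NullRelocSink, NullStackMapSink, NullTrapSink,
};

/// Hypothetical helper: wrap a preallocated buffer in a `MemoryCodeSink`.
/// The caller must make `buf` at least as large as the function being
/// emitted (e.g. the size reported by an earlier compile step).
fn sink_for<'a>(
    buf: &'a mut [u8],
    relocs: &'a mut NullRelocSink,
    traps: &'a mut NullTrapSink,
    stack_maps: &'a mut NullStackMapSink,
) -> MemoryCodeSink<'a> {
    // Safety: `MemoryCodeSink` performs no bounds checking, so `buf` must
    // cover the whole function, as the doc comment above requires.
    unsafe { MemoryCodeSink::new(buf.as_mut_ptr(), relocs, traps, stack_maps) }
}
```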
+pub trait RelocSink { + /// Add a relocation referencing an external symbol at the current offset. + fn reloc_external( + &mut self, + _: CodeOffset, + _: SourceLoc, + _: Reloc, + _: &ExternalName, + _: Addend, + ); + + /// Add a relocation referencing a constant. + fn reloc_constant(&mut self, _: CodeOffset, _: Reloc, _: ConstantOffset); + + /// Add a relocation referencing a jump table. + fn reloc_jt(&mut self, _: CodeOffset, _: Reloc, _: JumpTable); + + /// Track a call site whose return address is the given CodeOffset, for the given opcode. Does + /// nothing in general, only useful for certain embedders (SpiderMonkey). + fn add_call_site(&mut self, _: Opcode, _: CodeOffset, _: SourceLoc) {} +} + +/// A trait for receiving trap codes and offsets. +/// +/// If you don't need information about possible traps, you can use the +/// [`NullTrapSink`](NullTrapSink) implementation. +pub trait TrapSink { + /// Add trap information for a specific offset. + fn trap(&mut self, _: CodeOffset, _: SourceLoc, _: TrapCode); +} + +impl<'a> MemoryCodeSink<'a> { + fn write<T>(&mut self, x: T) { + unsafe { + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] + write_unaligned(self.data.offset(self.offset) as *mut T, x); + self.offset += core::mem::size_of::<T>() as isize; + } + } +} + +impl<'a> CodeSink for MemoryCodeSink<'a> { + fn offset(&self) -> CodeOffset { + self.offset as CodeOffset + } + + fn put1(&mut self, x: u8) { + self.write(x); + } + + fn put2(&mut self, x: u16) { + self.write(x); + } + + fn put4(&mut self, x: u32) { + self.write(x); + } + + fn put8(&mut self, x: u64) { + self.write(x); + } + + fn reloc_external( + &mut self, + srcloc: SourceLoc, + rel: Reloc, + name: &ExternalName, + addend: Addend, + ) { + let ofs = self.offset(); + self.relocs.reloc_external(ofs, srcloc, rel, name, addend); + } + + fn reloc_constant(&mut self, rel: Reloc, constant_offset: ConstantOffset) { + let ofs = self.offset(); + self.relocs.reloc_constant(ofs, rel, constant_offset); + } + + fn reloc_jt(&mut self, rel: Reloc, jt: JumpTable) { + let ofs = self.offset(); + self.relocs.reloc_jt(ofs, rel, jt); + } + + fn trap(&mut self, code: TrapCode, srcloc: SourceLoc) { + let ofs = self.offset(); + self.traps.trap(ofs, srcloc, code); + } + + fn begin_jumptables(&mut self) { + self.info.code_size = self.offset(); + } + + fn begin_rodata(&mut self) { + self.info.jumptables_size = self.offset() - self.info.code_size; + } + + fn end_codegen(&mut self) { + self.info.rodata_size = self.offset() - (self.info.jumptables_size + self.info.code_size); + self.info.total_size = self.offset(); + } + + fn add_stack_map(&mut self, val_list: &[Value], func: &Function, isa: &dyn TargetIsa) { + let ofs = self.offset(); + let stack_map = StackMap::from_values(&val_list, func, isa); + self.stack_maps.add_stack_map(ofs, stack_map); + } + + fn add_call_site(&mut self, opcode: Opcode, loc: SourceLoc) { + debug_assert!( + opcode.is_call(), + "adding call site info for a non-call instruction." + ); + let ret_addr = self.offset(); + self.relocs.add_call_site(opcode, ret_addr, loc); + } +} + +/// A `RelocSink` implementation that does nothing, which is convenient when +/// compiling code that does not relocate anything. 
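The `NullRelocSink` defined below is the simplest possible implementation. As a contrast, here is a hypothetical sketch (not from the diff) of sinks that record relocations and traps for a later link step, using only the trait methods declared above; the struct names are illustrative.

```rust
use cranelift_codegen::binemit::{Addend, CodeOffset, Reloc, RelocSink, TrapSink};
use cranelift_codegen::ir::{ConstantOffset, ExternalName, JumpTable, SourceLoc, TrapCode};

/// Hypothetical sink that remembers external relocations for later linking.
#[derive(Default)]
struct RecordingRelocSink {
    externals: Vec<(CodeOffset, Reloc, ExternalName, Addend)>,
}

impl RelocSink for RecordingRelocSink {
    fn reloc_external(
        &mut self,
        offset: CodeOffset,
        _srcloc: SourceLoc,
        reloc: Reloc,
        name: &ExternalName,
        addend: Addend,
    ) {
        self.externals.push((offset, reloc, name.clone(), addend));
    }
    fn reloc_constant(&mut self, _: CodeOffset, _: Reloc, _: ConstantOffset) {}
    fn reloc_jt(&mut self, _: CodeOffset, _: Reloc, _: JumpTable) {}
}

/// Hypothetical sink that remembers trap locations and their codes.
#[derive(Default)]
struct RecordingTrapSink {
    traps: Vec<(CodeOffset, TrapCode)>,
}

impl TrapSink for RecordingTrapSink {
    fn trap(&mut self, offset: CodeOffset, _srcloc: SourceLoc, code: TrapCode) {
        self.traps.push((offset, code));
    }
}
```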
+#[derive(Default)] +pub struct NullRelocSink {} + +impl RelocSink for NullRelocSink { + fn reloc_external( + &mut self, + _: CodeOffset, + _: SourceLoc, + _: Reloc, + _: &ExternalName, + _: Addend, + ) { + } + fn reloc_constant(&mut self, _: CodeOffset, _: Reloc, _: ConstantOffset) {} + fn reloc_jt(&mut self, _: CodeOffset, _: Reloc, _: JumpTable) {} +} + +/// A `TrapSink` implementation that does nothing, which is convenient when +/// compiling code that does not rely on trapping semantics. +#[derive(Default)] +pub struct NullTrapSink {} + +impl TrapSink for NullTrapSink { + fn trap(&mut self, _offset: CodeOffset, _srcloc: SourceLoc, _code: TrapCode) {} +} + +/// A trait for emitting stack maps. +pub trait StackMapSink { + /// Output a bitmap of the stack representing the live reference variables at this code offset. + fn add_stack_map(&mut self, _: CodeOffset, _: StackMap); +} + +/// Placeholder StackMapSink that does nothing. +pub struct NullStackMapSink {} + +impl StackMapSink for NullStackMapSink { + fn add_stack_map(&mut self, _: CodeOffset, _: StackMap) {} +} diff --git a/third_party/rust/cranelift-codegen/src/binemit/mod.rs b/third_party/rust/cranelift-codegen/src/binemit/mod.rs new file mode 100644 index 0000000000..b534ec9765 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/binemit/mod.rs @@ -0,0 +1,222 @@ +//! Binary machine code emission. +//! +//! The `binemit` module contains code for translating Cranelift's intermediate representation into +//! binary machine code. + +mod memorysink; +mod relaxation; +mod shrink; +mod stack_map; + +pub use self::memorysink::{ + MemoryCodeSink, NullRelocSink, NullStackMapSink, NullTrapSink, RelocSink, StackMapSink, + TrapSink, +}; +pub use self::relaxation::relax_branches; +pub use self::shrink::shrink_instructions; +pub use self::stack_map::StackMap; +use crate::ir::entities::Value; +use crate::ir::{ + ConstantOffset, ExternalName, Function, Inst, JumpTable, Opcode, SourceLoc, TrapCode, +}; +use crate::isa::TargetIsa; +pub use crate::regalloc::RegDiversions; +use core::fmt; +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +/// Offset in bytes from the beginning of the function. +/// +/// Cranelift can be used as a cross compiler, so we don't want to use a type like `usize` which +/// depends on the *host* platform, not the *target* platform. +pub type CodeOffset = u32; + +/// Addend to add to the symbol value. +pub type Addend = i64; + +/// Relocation kinds for every ISA +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub enum Reloc { + /// absolute 4-byte + Abs4, + /// absolute 8-byte + Abs8, + /// x86 PC-relative 4-byte + X86PCRel4, + /// x86 PC-relative 4-byte offset to trailing rodata + X86PCRelRodata4, + /// x86 call to PC-relative 4-byte + X86CallPCRel4, + /// x86 call to PLT-relative 4-byte + X86CallPLTRel4, + /// x86 GOT PC-relative 4-byte + X86GOTPCRel4, + /// Arm32 call target + Arm32Call, + /// Arm64 call target. Encoded as bottom 26 bits of instruction. This + /// value is sign-extended, multiplied by 4, and added to the PC of + /// the call instruction to form the destination address. + Arm64Call, + /// RISC-V call target + RiscvCall, + + /// Elf x86_64 32 bit signed PC relative offset to two GOT entries for GD symbol. + ElfX86_64TlsGd, + + /// Mach-O x86_64 32 bit signed PC relative offset to a `__thread_vars` entry. 
+ MachOX86_64Tlv, +} + +impl fmt::Display for Reloc { + /// Display trait implementation drops the arch, since its used in contexts where the arch is + /// already unambiguous, e.g. clif syntax with isa specified. In other contexts, use Debug. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Self::Abs4 => write!(f, "Abs4"), + Self::Abs8 => write!(f, "Abs8"), + Self::X86PCRel4 => write!(f, "PCRel4"), + Self::X86PCRelRodata4 => write!(f, "PCRelRodata4"), + Self::X86CallPCRel4 => write!(f, "CallPCRel4"), + Self::X86CallPLTRel4 => write!(f, "CallPLTRel4"), + Self::X86GOTPCRel4 => write!(f, "GOTPCRel4"), + Self::Arm32Call | Self::Arm64Call | Self::RiscvCall => write!(f, "Call"), + + Self::ElfX86_64TlsGd => write!(f, "ElfX86_64TlsGd"), + Self::MachOX86_64Tlv => write!(f, "MachOX86_64Tlv"), + } + } +} + +/// Container for information about a vector of compiled code and its supporting read-only data. +/// +/// The code starts at offset 0 and is followed optionally by relocatable jump tables and copyable +/// (raw binary) read-only data. Any padding between sections is always part of the section that +/// precedes the boundary between the sections. +#[derive(PartialEq)] +pub struct CodeInfo { + /// Number of bytes of machine code (the code starts at offset 0). + pub code_size: CodeOffset, + + /// Number of bytes of jumptables. + pub jumptables_size: CodeOffset, + + /// Number of bytes of rodata. + pub rodata_size: CodeOffset, + + /// Number of bytes in total. + pub total_size: CodeOffset, +} + +impl CodeInfo { + /// Offset of any relocatable jump tables, or equal to rodata if there are no jump tables. + pub fn jumptables(&self) -> CodeOffset { + self.code_size + } + + /// Offset of any copyable read-only data, or equal to total_size if there are no rodata. + pub fn rodata(&self) -> CodeOffset { + self.code_size + self.jumptables_size + } +} + +/// Abstract interface for adding bytes to the code segment. +/// +/// A `CodeSink` will receive all of the machine code for a function. It also accepts relocations +/// which are locations in the code section that need to be fixed up when linking. +pub trait CodeSink { + /// Get the current position. + fn offset(&self) -> CodeOffset; + + /// Add 1 byte to the code section. + fn put1(&mut self, _: u8); + + /// Add 2 bytes to the code section. + fn put2(&mut self, _: u16); + + /// Add 4 bytes to the code section. + fn put4(&mut self, _: u32); + + /// Add 8 bytes to the code section. + fn put8(&mut self, _: u64); + + /// Add a relocation referencing an external symbol plus the addend at the current offset. + fn reloc_external(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend); + + /// Add a relocation referencing a constant. + fn reloc_constant(&mut self, _: Reloc, _: ConstantOffset); + + /// Add a relocation referencing a jump table. + fn reloc_jt(&mut self, _: Reloc, _: JumpTable); + + /// Add trap information for the current offset. + fn trap(&mut self, _: TrapCode, _: SourceLoc); + + /// Machine code output is complete, jump table data may follow. + fn begin_jumptables(&mut self); + + /// Jump table output is complete, raw read-only data may follow. + fn begin_rodata(&mut self); + + /// Read-only data output is complete, we're done. + fn end_codegen(&mut self); + + /// Add a stack map at the current code offset. + fn add_stack_map(&mut self, _: &[Value], _: &Function, _: &dyn TargetIsa); + + /// Add a call site for a call with the given opcode, returning at the current offset. 
+ fn add_call_site(&mut self, _: Opcode, _: SourceLoc) { + // Default implementation doesn't need to do anything. + } +} + +/// Report a bad encoding error. +#[cold] +pub fn bad_encoding(func: &Function, inst: Inst) -> ! { + panic!( + "Bad encoding {} for {}", + func.encodings[inst], + func.dfg.display_inst(inst, None) + ); +} + +/// Emit a function to `sink`, given an instruction emitter function. +/// +/// This function is called from the `TargetIsa::emit_function()` implementations with the +/// appropriate instruction emitter. +pub fn emit_function<CS, EI>(func: &Function, emit_inst: EI, sink: &mut CS, isa: &dyn TargetIsa) +where + CS: CodeSink, + EI: Fn(&Function, Inst, &mut RegDiversions, &mut CS, &dyn TargetIsa), +{ + let mut divert = RegDiversions::new(); + for block in func.layout.blocks() { + divert.at_block(&func.entry_diversions, block); + debug_assert_eq!(func.offsets[block], sink.offset()); + for inst in func.layout.block_insts(block) { + emit_inst(func, inst, &mut divert, sink, isa); + } + } + + sink.begin_jumptables(); + + // Output jump tables. + for (jt, jt_data) in func.jump_tables.iter() { + let jt_offset = func.jt_offsets[jt]; + for block in jt_data.iter() { + let rel_offset: i32 = func.offsets[*block] as i32 - jt_offset as i32; + sink.put4(rel_offset as u32) + } + } + + sink.begin_rodata(); + + // Output constants. + for (_, constant_data) in func.dfg.constants.iter() { + for byte in constant_data.iter() { + sink.put1(*byte) + } + } + + sink.end_codegen(); +} diff --git a/third_party/rust/cranelift-codegen/src/binemit/relaxation.rs b/third_party/rust/cranelift-codegen/src/binemit/relaxation.rs new file mode 100644 index 0000000000..142e400985 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/binemit/relaxation.rs @@ -0,0 +1,397 @@ +//! Branch relaxation and offset computation. +//! +//! # block header offsets +//! +//! Before we can generate binary machine code for branch instructions, we need to know the final +//! offsets of all the block headers in the function. This information is encoded in the +//! `func.offsets` table. +//! +//! # Branch relaxation +//! +//! Branch relaxation is the process of ensuring that all branches in the function have enough +//! range to encode their destination. It is common to have multiple branch encodings in an ISA. +//! For example, x86 branches can have either an 8-bit or a 32-bit displacement. +//! +//! On RISC architectures, it can happen that conditional branches have a shorter range than +//! unconditional branches: +//! +//! ```clif +//! brz v1, block17 +//! ``` +//! +//! can be transformed into: +//! +//! ```clif +//! brnz v1, block23 +//! jump block17 +//! block23: +//! ``` + +use crate::binemit::{CodeInfo, CodeOffset}; +use crate::cursor::{Cursor, FuncCursor}; +use crate::dominator_tree::DominatorTree; +use crate::flowgraph::ControlFlowGraph; +use crate::ir::{Block, Function, Inst, InstructionData, Opcode, Value, ValueList}; +use crate::isa::{EncInfo, TargetIsa}; +use crate::iterators::IteratorExtras; +use crate::regalloc::RegDiversions; +use crate::timing; +use crate::CodegenResult; +use core::convert::TryFrom; +use log::debug; + +/// Relax branches and compute the final layout of block headers in `func`. +/// +/// Fill in the `func.offsets` table so the function is ready for binary emission. 
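`relax_branches` below returns a `CodeInfo` describing how the emitted bytes are partitioned. A small sketch (my helper, not from the diff) of how an embedder might use its accessors to slice an output buffer into code, jump tables, and read-only data:

```rust
use cranelift_codegen::binemit::CodeInfo;

/// Split an emitted buffer into its three sections using the offsets that
/// `CodeInfo` exposes: code starts at 0, jump tables at `jumptables()`,
/// and read-only data at `rodata()`.
fn split_sections<'a>(buf: &'a [u8], info: &CodeInfo) -> (&'a [u8], &'a [u8], &'a [u8]) {
    let code = &buf[..info.code_size as usize];
    let jumptables = &buf[info.jumptables() as usize..info.rodata() as usize];
    let rodata = &buf[info.rodata() as usize..info.total_size as usize];
    (code, jumptables, rodata)
}
```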
+pub fn relax_branches( + func: &mut Function, + _cfg: &mut ControlFlowGraph, + _domtree: &mut DominatorTree, + isa: &dyn TargetIsa, +) -> CodegenResult<CodeInfo> { + let _tt = timing::relax_branches(); + + let encinfo = isa.encoding_info(); + + // Clear all offsets so we can recognize blocks that haven't been visited yet. + func.offsets.clear(); + func.offsets.resize(func.dfg.num_blocks()); + + // Start by removing redundant jumps. + fold_redundant_jumps(func, _cfg, _domtree); + + // Convert jumps to fallthrough instructions where possible. + fallthroughs(func); + + let mut offset = 0; + let mut divert = RegDiversions::new(); + + // First, compute initial offsets for every block. + { + let mut cur = FuncCursor::new(func); + while let Some(block) = cur.next_block() { + divert.at_block(&cur.func.entry_diversions, block); + cur.func.offsets[block] = offset; + while let Some(inst) = cur.next_inst() { + divert.apply(&cur.func.dfg[inst]); + let enc = cur.func.encodings[inst]; + offset += encinfo.byte_size(enc, inst, &divert, &cur.func); + } + } + } + + // Then, run the relaxation algorithm until it converges. + let mut go_again = true; + while go_again { + go_again = false; + offset = 0; + + // Visit all instructions in layout order. + let mut cur = FuncCursor::new(func); + while let Some(block) = cur.next_block() { + divert.at_block(&cur.func.entry_diversions, block); + + // Record the offset for `block` and make sure we iterate until offsets are stable. + if cur.func.offsets[block] != offset { + cur.func.offsets[block] = offset; + go_again = true; + } + + while let Some(inst) = cur.next_inst() { + divert.apply(&cur.func.dfg[inst]); + + let enc = cur.func.encodings[inst]; + + // See if this is a branch has a range and a destination, and if the target is in + // range. + if let Some(range) = encinfo.branch_range(enc) { + if let Some(dest) = cur.func.dfg[inst].branch_destination() { + let dest_offset = cur.func.offsets[dest]; + if !range.contains(offset, dest_offset) { + offset += + relax_branch(&mut cur, &divert, offset, dest_offset, &encinfo, isa); + continue; + } + } + } + + offset += encinfo.byte_size(enc, inst, &divert, &cur.func); + } + } + } + + let code_size = offset; + let jumptables = offset; + + for (jt, jt_data) in func.jump_tables.iter() { + func.jt_offsets[jt] = offset; + // TODO: this should be computed based on the min size needed to hold the furthest branch. + offset += jt_data.len() as u32 * 4; + } + + let jumptables_size = offset - jumptables; + let rodata = offset; + + for constant in func.dfg.constants.entries_mut() { + constant.set_offset(offset); + offset += + u32::try_from(constant.len()).expect("Constants must have a length that fits in a u32") + } + + let rodata_size = offset - rodata; + + Ok(CodeInfo { + code_size, + jumptables_size, + rodata_size, + total_size: offset, + }) +} + +/// Folds an instruction if it is a redundant jump. +/// Returns whether folding was performed (which invalidates the CFG). +fn try_fold_redundant_jump( + func: &mut Function, + cfg: &mut ControlFlowGraph, + block: Block, + first_inst: Inst, +) -> bool { + let first_dest = match func.dfg[first_inst].branch_destination() { + Some(block) => block, // The instruction was a single-target branch. + None => { + return false; // The instruction was either multi-target or not a branch. + } + }; + + // For the moment, only attempt to fold a branch to a block that is parameterless. + // These blocks are mainly produced by critical edge splitting. 
+ // + // TODO: Allow folding blocks that define SSA values and function as phi nodes. + if func.dfg.num_block_params(first_dest) != 0 { + return false; + } + + // Look at the first instruction of the first branch's destination. + // If it is an unconditional branch, maybe the second jump can be bypassed. + let second_inst = func.layout.first_inst(first_dest).expect("Instructions"); + if func.dfg[second_inst].opcode() != Opcode::Jump { + return false; + } + + // Now we need to fix up first_inst's block parameters to match second_inst's, + // without changing the branch-specific arguments. + // + // The intermediary block is allowed to reference any SSA value that dominates it, + // but that SSA value may not necessarily also dominate the instruction that's + // being patched. + + // Get the arguments and parameters passed by the first branch. + let num_fixed = func.dfg[first_inst] + .opcode() + .constraints() + .num_fixed_value_arguments(); + let (first_args, first_params) = func.dfg[first_inst] + .arguments(&func.dfg.value_lists) + .split_at(num_fixed); + + // Get the parameters passed by the second jump. + let num_fixed = func.dfg[second_inst] + .opcode() + .constraints() + .num_fixed_value_arguments(); + let (_, second_params) = func.dfg[second_inst] + .arguments(&func.dfg.value_lists) + .split_at(num_fixed); + let mut second_params = second_params.to_vec(); // Clone for rewriting below. + + // For each parameter passed by the second jump, if any of those parameters + // was a block parameter, rewrite it to refer to the value that the first jump + // passed in its parameters. Otherwise, make sure it dominates first_inst. + // + // For example: if we `block0: jump block1(v1)` to `block1(v2): jump block2(v2)`, + // we want to rewrite the original jump to `jump block2(v1)`. + let block_params: &[Value] = func.dfg.block_params(first_dest); + debug_assert!(block_params.len() == first_params.len()); + + for value in second_params.iter_mut() { + if let Some((n, _)) = block_params.iter().enumerate().find(|(_, &p)| p == *value) { + // This value was the Nth parameter passed to the second_inst's block. + // Rewrite it as the Nth parameter passed by first_inst. + *value = first_params[n]; + } + } + + // Build a value list of first_args (unchanged) followed by second_params (rewritten). + let arguments_vec: alloc::vec::Vec<_> = first_args + .iter() + .chain(second_params.iter()) + .copied() + .collect(); + let value_list = ValueList::from_slice(&arguments_vec, &mut func.dfg.value_lists); + + func.dfg[first_inst].take_value_list(); // Drop the current list. + func.dfg[first_inst].put_value_list(value_list); // Put the new list. + + // Bypass the second jump. + // This can disconnect the Block containing `second_inst`, to be cleaned up later. + let second_dest = func.dfg[second_inst].branch_destination().expect("Dest"); + func.change_branch_destination(first_inst, second_dest); + cfg.recompute_block(func, block); + + // The previously-intermediary Block may now be unreachable. Update CFG. + if cfg.pred_iter(first_dest).count() == 0 { + // Remove all instructions from that block. + while let Some(inst) = func.layout.first_inst(first_dest) { + func.layout.remove_inst(inst); + } + + // Remove the block... + cfg.recompute_block(func, first_dest); // ...from predecessor lists. + func.layout.remove_block(first_dest); // ...from the layout. + } + + true +} + +/// Redirects `jump` instructions that point to other `jump` instructions to the final destination. +/// This transformation may orphan some blocks. 
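A standalone sketch of the parameter-rewriting rule that `try_fold_redundant_jump` above applies, with plain integers standing in for SSA values; the helper and the concrete numbers are illustrative only, not the cranelift API.

```rust
/// `block_params`: parameters of the intermediate block targeted by the first jump.
/// `first_params`: values the first jump passes for those parameters.
/// `second_params`: values the second jump passes on to the final destination.
/// Any entry of `second_params` that is one of the block parameters is replaced
/// by the value the first jump supplied for it; everything else is untouched.
fn rewrite_params(block_params: &[u32], first_params: &[u32], second_params: &mut [u32]) {
    assert_eq!(block_params.len(), first_params.len());
    for value in second_params.iter_mut() {
        if let Some(n) = block_params.iter().position(|&p| p == *value) {
            *value = first_params[n];
        }
    }
}

// Mirroring the comment above: `block0: jump block1(v1)` into
// `block1(v2): jump block2(v2)` folds to `block0: jump block2(v1)`.
// With v1 = 1 and v2 = 2: rewrite_params(&[2], &[1], &mut [2]) yields [1].
```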
+fn fold_redundant_jumps( + func: &mut Function, + cfg: &mut ControlFlowGraph, + domtree: &mut DominatorTree, +) { + let mut folded = false; + + // Postorder iteration guarantees that a chain of jumps is visited from + // the end of the chain to the start of the chain. + for &block in domtree.cfg_postorder() { + // Only proceed if the first terminator instruction is a single-target branch. + let first_inst = func + .layout + .last_inst(block) + .expect("Block has no terminator"); + folded |= try_fold_redundant_jump(func, cfg, block, first_inst); + + // Also try the previous instruction. + if let Some(prev_inst) = func.layout.prev_inst(first_inst) { + folded |= try_fold_redundant_jump(func, cfg, block, prev_inst); + } + } + + // Folding jumps invalidates the dominator tree. + if folded { + domtree.compute(func, cfg); + } +} + +/// Convert `jump` instructions to `fallthrough` instructions where possible and verify that any +/// existing `fallthrough` instructions are correct. +fn fallthroughs(func: &mut Function) { + for (block, succ) in func.layout.blocks().adjacent_pairs() { + let term = func + .layout + .last_inst(block) + .expect("block has no terminator."); + if let InstructionData::Jump { + ref mut opcode, + destination, + .. + } = func.dfg[term] + { + match *opcode { + Opcode::Fallthrough => { + // Somebody used a fall-through instruction before the branch relaxation pass. + // Make sure it is correct, i.e. the destination is the layout successor. + debug_assert_eq!( + destination, succ, + "Illegal fallthrough from {} to {}, but {}'s successor is {}", + block, destination, block, succ + ) + } + Opcode::Jump => { + // If this is a jump to the successor block, change it to a fall-through. + if destination == succ { + *opcode = Opcode::Fallthrough; + func.encodings[term] = Default::default(); + } + } + _ => {} + } + } + } +} + +/// Relax the branch instruction at `cur` so it can cover the range `offset - dest_offset`. +/// +/// Return the size of the replacement instructions up to and including the location where `cur` is +/// left. +fn relax_branch( + cur: &mut FuncCursor, + divert: &RegDiversions, + offset: CodeOffset, + dest_offset: CodeOffset, + encinfo: &EncInfo, + isa: &dyn TargetIsa, +) -> CodeOffset { + let inst = cur.current_inst().unwrap(); + debug!( + "Relaxing [{}] {} for {:#x}-{:#x} range", + encinfo.display(cur.func.encodings[inst]), + cur.func.dfg.display_inst(inst, isa), + offset, + dest_offset + ); + + // Pick the smallest encoding that can handle the branch range. + let dfg = &cur.func.dfg; + let ctrl_type = dfg.ctrl_typevar(inst); + if let Some(enc) = isa + .legal_encodings(cur.func, &dfg[inst], ctrl_type) + .filter(|&enc| { + let range = encinfo.branch_range(enc).expect("Branch with no range"); + if !range.contains(offset, dest_offset) { + debug!(" trying [{}]: out of range", encinfo.display(enc)); + false + } else if encinfo.operand_constraints(enc) + != encinfo.operand_constraints(cur.func.encodings[inst]) + { + // Conservatively give up if the encoding has different constraints + // than the original, so that we don't risk picking a new encoding + // which the existing operands don't satisfy. We can't check for + // validity directly because we don't have a RegDiversions active so + // we don't know which registers are actually in use. 
+ debug!(" trying [{}]: constraints differ", encinfo.display(enc)); + false + } else { + debug!(" trying [{}]: OK", encinfo.display(enc)); + true + } + }) + .min_by_key(|&enc| encinfo.byte_size(enc, inst, &divert, &cur.func)) + { + debug_assert!(enc != cur.func.encodings[inst]); + cur.func.encodings[inst] = enc; + return encinfo.byte_size(enc, inst, &divert, &cur.func); + } + + // Note: On some RISC ISAs, conditional branches have shorter range than unconditional + // branches, so one way of extending the range of a conditional branch is to invert its + // condition and make it branch over an unconditional jump which has the larger range. + // + // Splitting the block is problematic this late because there may be register diversions in + // effect across the conditional branch, and they can't survive the control flow edge to a new + // block. We have two options for handling that: + // + // 1. Set a flag on the new block that indicates it wants the preserve the register diversions of + // its layout predecessor, or + // 2. Use an encoding macro for the branch-over-jump pattern so we don't need to split the block. + // + // It seems that 1. would allow us to share code among RISC ISAs that need this. + // + // We can't allow register diversions to survive from the layout predecessor because the layout + // predecessor could contain kill points for some values that are live in this block, and + // diversions are not automatically cancelled when the live range of a value ends. + + // This assumes solution 2. above: + panic!("No branch in range for {:#x}-{:#x}", offset, dest_offset); +} diff --git a/third_party/rust/cranelift-codegen/src/binemit/shrink.rs b/third_party/rust/cranelift-codegen/src/binemit/shrink.rs new file mode 100644 index 0000000000..f6fa43e062 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/binemit/shrink.rs @@ -0,0 +1,73 @@ +//! Instruction shrinking. +//! +//! Sometimes there are multiple valid encodings for a given instruction. Cranelift often initially +//! chooses the largest one, because this typically provides the register allocator the most +//! flexibility. However, once register allocation is done, this is no longer important, and we +//! can switch to smaller encodings when possible. + +use crate::ir::instructions::InstructionData; +use crate::ir::Function; +use crate::isa::TargetIsa; +use crate::regalloc::RegDiversions; +use crate::timing; +use log::debug; + +/// Pick the smallest valid encodings for instructions. +pub fn shrink_instructions(func: &mut Function, isa: &dyn TargetIsa) { + let _tt = timing::shrink_instructions(); + + let encinfo = isa.encoding_info(); + let mut divert = RegDiversions::new(); + + for block in func.layout.blocks() { + // Load diversions from predecessors. + divert.at_block(&func.entry_diversions, block); + + for inst in func.layout.block_insts(block) { + let enc = func.encodings[inst]; + if enc.is_legal() { + // regmove/regfill/regspill are special instructions with register immediates + // that represented as normal operands, so the normal predicates below don't + // handle them correctly. + // + // Also, they need to be presented to the `RegDiversions` to update the + // location tracking. + // + // TODO: Eventually, we want the register allocator to avoid leaving these special + // instructions behind, but for now, just temporarily avoid trying to shrink them. + let inst_data = &func.dfg[inst]; + match inst_data { + InstructionData::RegMove { .. } + | InstructionData::RegFill { .. } + | InstructionData::RegSpill { .. 
} => { + divert.apply(inst_data); + continue; + } + _ => (), + } + + let ctrl_type = func.dfg.ctrl_typevar(inst); + + // Pick the last encoding with constraints that are satisfied. + let best_enc = isa + .legal_encodings(func, &func.dfg[inst], ctrl_type) + .filter(|e| encinfo.constraints[e.recipe()].satisfied(inst, &divert, &func)) + .min_by_key(|e| encinfo.byte_size(*e, inst, &divert, &func)) + .unwrap(); + + if best_enc != enc { + func.encodings[inst] = best_enc; + + debug!( + "Shrunk [{}] to [{}] in {}, reducing the size from {} to {}", + encinfo.display(enc), + encinfo.display(best_enc), + func.dfg.display_inst(inst, isa), + encinfo.byte_size(enc, inst, &divert, &func), + encinfo.byte_size(best_enc, inst, &divert, &func) + ); + } + } + } + } +} diff --git a/third_party/rust/cranelift-codegen/src/binemit/stack_map.rs b/third_party/rust/cranelift-codegen/src/binemit/stack_map.rs new file mode 100644 index 0000000000..c3055a0154 --- /dev/null +++ b/third_party/rust/cranelift-codegen/src/binemit/stack_map.rs @@ -0,0 +1,205 @@ +use crate::bitset::BitSet; +use crate::ir; +use crate::isa::TargetIsa; +use alloc::vec::Vec; + +type Num = u32; +const NUM_BITS: usize = core::mem::size_of::<Num>() * 8; + +/// Stack maps record which words in a stack frame contain live GC references at +/// a given instruction pointer. +/// +/// Logically, a set of stack maps for a function record a table of the form: +/// +/// ```text +/// +---------------------+-------------------------------------------+ +/// | Instruction Pointer | SP-Relative Offsets of Live GC References | +/// +---------------------+-------------------------------------------+ +/// | 0x12345678 | 2, 6, 12 | +/// | 0x1234abcd | 2, 6 | +/// | ... | ... | +/// +---------------------+-------------------------------------------+ +/// ``` +/// +/// Where "instruction pointer" is an instruction pointer within the function, +/// and "offsets of live GC references" contains the offsets (in units of words) +/// from the frame's stack pointer where live GC references are stored on the +/// stack. Instruction pointers within the function that do not have an entry in +/// this table are not GC safepoints. +/// +/// Because +/// +/// * offsets of live GC references are relative from the stack pointer, and +/// * stack frames grow down from higher addresses to lower addresses, +/// +/// to get a pointer to a live reference at offset `x` within a stack frame, you +/// add `x` from the frame's stack pointer. +/// +/// For example, to calculate the pointer to the live GC reference inside "frame +/// 1" below, you would do `frame_1_sp + x`: +/// +/// ```text +/// Stack +/// +-------------------+ +/// | Frame 0 | +/// | | +/// | | | +/// | +-------------------+ <--- Frame 0's SP +/// | | Frame 1 | +/// Grows | | +/// down | | +/// | | Live GC reference | --+-- +/// | | | | +/// | | | | +/// V | | x = offset of live GC reference +/// | | | +/// | | | +/// +-------------------+ --+-- <--- Frame 1's SP +/// | Frame 2 | +/// | ... | +/// ``` +/// +/// An individual `StackMap` is associated with just one instruction pointer +/// within the function, contains the size of the stack frame, and represents +/// the stack frame as a bitmap. There is one bit per word in the stack frame, +/// and if the bit is set, then the word contains a live GC reference. +/// +/// Note that a caller's `OutgoingArg` stack slots and callee's `IncomingArg` +/// stack slots overlap, so we must choose which function's stack maps record +/// live GC references in these slots. 
We record the `IncomingArg`s in the +/// callee's stack map. +#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "enable-serde", derive(serde::Deserialize, serde::Serialize))] +pub struct StackMap { + bitmap: Vec<BitSet<Num>>, + mapped_words: u32, +} + +impl StackMap { + /// Create a `StackMap` based on where references are located on a + /// function's stack. + pub fn from_values( + args: &[ir::entities::Value], + func: &ir::Function, + isa: &dyn TargetIsa, + ) -> Self { + let loc = &func.locations; + let mut live_ref_in_stack_slot = crate::HashSet::new(); + // References can be in registers, and live registers values are pushed onto the stack before calls and traps. + // TODO: Implement register maps. If a register containing a reference is spilled and reused after a safepoint, + // it could contain a stale reference value if the garbage collector relocated the value. + for val in args { + if let Some(value_loc) = loc.get(*val) { + match *value_loc { + ir::ValueLoc::Stack(stack_slot) => { + live_ref_in_stack_slot.insert(stack_slot); + } + _ => {} + } + } + } + + let stack = &func.stack_slots; + let info = func.stack_slots.layout_info.unwrap(); + + // Refer to the doc comment for `StackMap` above to understand the + // bitmap representation used here. + let map_size = (info.frame_size + info.inbound_args_size) as usize; + let word_size = isa.pointer_bytes() as usize; + let num_words = map_size / word_size; + + let mut vec = alloc::vec::Vec::with_capacity(num_words); + vec.resize(num_words, false); + + for (ss, ssd) in stack.iter() { + if !live_ref_in_stack_slot.contains(&ss) + || ssd.kind == ir::stackslot::StackSlotKind::OutgoingArg + { + continue; + } + + debug_assert!(ssd.size as usize == word_size); + let bytes_from_bottom = info.frame_size as i32 + ssd.offset.unwrap(); + let words_from_bottom = (bytes_from_bottom as usize) / word_size; + vec[words_from_bottom] = true; + } + + Self::from_slice(&vec) + } + + /// Create a vec of Bitsets from a slice of bools. + pub fn from_slice(vec: &[bool]) -> Self { + let len = vec.len(); + let num_word = len / NUM_BITS + (len % NUM_BITS != 0) as usize; + let mut bitmap = Vec::with_capacity(num_word); + + for segment in vec.chunks(NUM_BITS) { + let mut curr_word = 0; + for (i, set) in segment.iter().enumerate() { + if *set { + curr_word |= 1 << i; + } + } + bitmap.push(BitSet(curr_word)); + } + Self { + mapped_words: len as u32, + bitmap, + } + } + + /// Returns a specified bit. + pub fn get_bit(&self, bit_index: usize) -> bool { + assert!(bit_index < NUM_BITS * self.bitmap.len()); + let word_index = bit_index / NUM_BITS; + let word_offset = (bit_index % NUM_BITS) as u8; + self.bitmap[word_index].contains(word_offset) + } + + /// Returns the raw bitmap that represents this stack map. + pub fn as_slice(&self) -> &[BitSet<u32>] { + &self.bitmap + } + + /// Returns the number of words represented by this stack map. 
+ pub fn mapped_words(&self) -> u32 { + self.mapped_words + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn stack_maps() { + let vec: Vec<bool> = Vec::new(); + assert!(StackMap::from_slice(&vec).bitmap.is_empty()); + + let mut vec: [bool; NUM_BITS] = Default::default(); + let set_true_idx = [5, 7, 24, 31]; + + for &idx in &set_true_idx { + vec[idx] = true; + } + + let mut vec = vec.to_vec(); + assert_eq!( + vec![BitSet::<Num>(2164261024)], + StackMap::from_slice(&vec).bitmap + ); + + vec.push(false); + vec.push(true); + let res = StackMap::from_slice(&vec); + assert_eq!( + vec![BitSet::<Num>(2164261024), BitSet::<Num>(2)], + res.bitmap + ); + + assert!(res.get_bit(5)); + assert!(res.get_bit(31)); + assert!(res.get_bit(33)); + assert!(!res.get_bit(1)); + } +} |
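Finally, a hypothetical GC-side helper (not part of the diff) showing how the `StackMap` accessors above might be consumed to recover the SP-relative word offsets of live references at a safepoint; the function name is mine.

```rust
use cranelift_codegen::binemit::StackMap;

/// List the word offsets (from the frame's stack pointer) that hold live GC
/// references, by walking the bitmap through `mapped_words` and `get_bit`.
fn live_word_offsets(map: &StackMap) -> Vec<u32> {
    (0..map.mapped_words())
        .filter(|&i| map.get_bit(i as usize))
        .collect()
}
```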