path: root/third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit    2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs
parent    Initial commit. (diff)
Adding upstream version 86.0.1. (tags: upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs')
-rw-r--r--    third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs    904
1 file changed, 904 insertions, 0 deletions
diff --git a/third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs b/third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs
new file mode 100644
index 0000000000..501c67ab6b
--- /dev/null
+++ b/third_party/rust/cranelift-codegen/src/redundant_reload_remover.rs
@@ -0,0 +1,904 @@
+//! This module implements a late-stage redundant-reload remover, which runs after registers have
+//! been allocated and stack slots have been given specific offsets.
+
+use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
+use crate::entity::EntitySet;
+use crate::flowgraph::ControlFlowGraph;
+use crate::ir::dfg::DataFlowGraph;
+use crate::ir::instructions::BranchInfo;
+use crate::ir::stackslot::{StackSlotKind, StackSlots};
+use crate::ir::{
+ Block, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value,
+ ValueLoc,
+};
+use crate::isa::{RegInfo, RegUnit, TargetIsa};
+use crate::regalloc::RegDiversions;
+use alloc::vec::Vec;
+use core::convert::TryInto;
+use cranelift_entity::{PrimaryMap, SecondaryMap};
+
+// =============================================================================================
+// A description of the redundant-fill-removal algorithm
+//
+//
+// The algorithm works forwards through each Block. It carries along and updates a table,
+// AvailEnv, with which it tracks registers that are known to have the same value as some stack
+// slot. The action taken at each instruction depends on the kind of instruction, as follows:
+//
+// ss1 = spill r0: update the AvailEnv so as to note that slot `ss1` and register `r0`
+// have the same value.
+//
+// r1 = fill ss0: look in the AvailEnv. If it tells us that register `r1` and slot `ss0`
+// have the same value, then delete the instruction by converting it to a
+// `fill_nop`.
+//
+// If it tells us that some other register `r2` has the same value as
+// slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
+//
+// any other insn: remove from the AvailEnv any bindings associated with registers
+// written by this instruction, since those bindings are invalidated by it.
+//
+// Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
+// destination are registers does not cause any more fills to be removed or converted to copies.
+// It's not clear why.
+//
+// There are various other instruction-handling cases in `visit_inst`, which are documented
+// in-line, and do not change the core algorithm, so are not described here.
+//
+// The registers tracked by AvailEnv are the post-diversion registers that are really used by the
+// code; they are not the pre-diversion names associated with each SSA `Value`. The second
+// `fill` case above opportunistically copies values from registers that may have been diversion
+// targets in some predecessor block, and so are no longer associated with any specific SSA-level
+// name at the point the copy is made. Hence those copies (from `r2` to `r1`) cannot be done
+// with an ordinary `copy` instruction. Instead they have to be done using a new `copy_to_ssa`
+// instruction, which copies from an arbitrary register to a register-resident `Value` (that is,
+// "back to" SSA-world).
+//
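+// As a worked example of the core algorithm (hypothetical register and slot names, in the same
+// notation as above):
+//
+//    ss0 = spill r5     AvailEnv afterwards: { r5 -> ss0 }
+//    ..                 (intervening instructions that do not invalidate the binding)
+//    r5 = fill ss0      exact binding { r5 -> ss0 } exists, so this is rewritten to `fill_nop`
+//    r9 = fill ss0      inexact binding { r5 -> ss0 } exists, so this is rewritten to a
+//                       `copy_to_ssa` from `r5`, and AvailEnv becomes { r5 -> ss0, r9 -> ss0 }
+//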
+// That completes the description of the core algorithm.
+//
+// In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
+// AvailEnv at the end of `A` will still be valid at the entry to `B`. In such a case, we can
+// profitably transform `B` using the AvailEnv "inherited" from `A`. In order to take full
+// advantage of this, this module partitions the function's CFG into tree-shaped groups of
+// blocks, and processes each tree as described above. So the AvailEnv is only initialised to
+// empty at the start of blocks that form the root of each tree; that is, at blocks which have
+// two or more predecessors, and at the function's entry block.
+
+// =============================================================================================
+// Top level algorithm structure
+//
+// The overall algorithm, for a function, starts like this:
+//
+// * (once per function): finds Blocks that have two or more predecessors, since they will be the
+// roots of Block trees. Also, the entry node for the function is considered to be a root.
+//
+// It then continues with a loop that first finds a tree of Blocks ("discovery") and then removes
+// redundant fills as described above ("processing"):
+//
+// * (discovery; once per tree): for each root, performs a depth first search to find all the Blocks
+// in the tree, guided by RedundantReloadRemover::discovery_stack.
+//
+// * (processing; once per tree): the just-discovered tree is then processed as described above,
+// guided by RedundantReloadRemover::processing_stack.
+//
+// In this way, all Blocks reachable from the function's entry point are eventually processed. Note
+// that each tree is processed as soon as it has been discovered, so the algorithm never creates a
+// list of trees for the function.
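+//
+// For example, given this hypothetical CFG (not taken from any real function):
+//
+//        entry
+//        /   \
+//      B1     B2
+//        \   /
+//         B3 ---> B4
+//
+// `entry` is a root because it is the entry node, and `B3` is a root because it has two
+// predecessors. The trees are therefore {entry, B1, B2} and {B3, B4}, and each one is discovered
+// and then processed before the next root is considered.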
+//
+// The running state is stored in `RedundantReloadRemover`. This is allocated once and can be
+// reused for multiple functions so as to minimise heap turnover. The fields are, roughly:
+//
+// num_regunits -- constant for the whole function; used by the tree processing phase
+// num_preds_per_block -- constant for the whole function; used by the tree discovery process
+//
+// discovery_stack -- used to guide the tree discovery process
+// nodes_in_tree -- the discovered nodes are recorded here
+//
+// processing_stack -- used to guide the tree processing process
+// nodes_already_visited -- used to ensure the tree processing logic terminates in the case
+// where a tree has a branch back to its root node.
+//
+// There is further documentation in line below, as appropriate.
+
+// =============================================================================================
+// A side note on register choice heuristics
+
+// The core algorithm opportunistically replaces fill instructions when it knows of a register
+// that already holds the required value. How effective this is largely depends on how long
+// reloaded values happen to stay alive before the relevant register is overwritten. And that
+// depends on the register allocator's register choice heuristics. The worst case is when the
+// register allocator reuses registers as soon as possible after they become free. Unfortunately,
+// that was indeed the selection scheme prior to the development of this pass.
+//
+// As part of this work, the register selection scheme has been changed as follows: for registers
+// written by any instruction other than a fill, use the lowest numbered available register. But
+// for registers written by a fill instruction, use the highest numbered available register. The
+// aim is to try to keep reload and non-reload registers disjoint to the extent possible.
+// Several other schemes were tried, but this one is simple and can be worth an extra 2% of
+// performance in some cases.
+//
+// The relevant change is more or less a one-line change in the solver.
+
+// =============================================================================================
+// Data structures used for discovery of trees
+
+// `ZeroOneOrMany` is used to record the number of predecessors a Block has. The `Zero` case
+// is included so as to cleanly handle the case where the incoming graph has unreachable Blocks.
+
+#[derive(Clone, PartialEq)]
+enum ZeroOneOrMany {
+ Zero,
+ One,
+ Many,
+}
+
+// =============================================================================================
+// Data structures used for processing of trees
+
+// `SlotInfo` describes a spill slot in the obvious way. Note that it doesn't indicate which
+// register(s) are currently associated with the slot. That job is done by `AvailEnv` instead.
+//
+// In the CL framework, stack slots are partitioned into disjoint sets, one for each
+// `StackSlotKind`. The offset and size only give a unique identity within any particular
+// `StackSlotKind`. So, to uniquely identify a stack slot, all three fields are necessary.
+
+#[derive(Clone, Copy)]
+struct SlotInfo {
+ kind: StackSlotKind,
+ offset: i32,
+ size: u32,
+}
+
+// `AvailEnv` maps each possible register to a stack slot that holds the same value. The index
+// space of `AvailEnv::map` is exactly the set of registers available on the current target. If
+// (as is mostly the case) a register is not known to have the same value as a stack slot, then
+// its entry is `None` rather than `Some(..)`.
+//
+// Invariants for AvailEnv:
+//
+// AvailEnv may have multiple different registers bound to the same stack slot -- that is, `(kind,
+// offset, size)` triple. That's OK, and reflects the reality that those two registers contain
+// the same value. This could happen, for example, in the case
+//
+// ss1 = spill r0
+// ..
+// r2 = fill ss1
+//
+// Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write to
+// `r0`.
+//
+// To say that two different registers may be bound to the same stack slot is the same as saying
+// that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
+// size)` triple. What is *not* allowed is to have partial overlaps. That is, if two SlotInfos
+// have the same `kind` field and have `offset` and `size` fields that overlap, then their
+// `offset` and `size` fields must be identical. This is so as to make the algorithm safe against
+// situations where, for example, a 64-bit register is spilled, but then only the bottom 32 bits
+// are reloaded from the slot.
+//
+// In such a case, though, the Cranelift IR would most likely be ill-typed, so this situation
+// probably cannot occur in practice.
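+//
+// As a concrete (hypothetical) illustration, with all entries of kind `SpillSlot`:
+//
+//    map[3] = Some(SlotInfo { offset: 0, size: 8 })
+//    map[7] = Some(SlotInfo { offset: 0, size: 8 })    allowed: exact duplicate of map[3]
+//    map[9] = Some(SlotInfo { offset: 0, size: 4 })    forbidden: partial overlap with map[3]
+//
+// The debug-only `check_invariants` below would reject an AvailEnv containing both the first and
+// the third entry; in practice `bind` prevents this by first invalidating any overlapping
+// entries.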
+
+#[derive(Clone)]
+struct AvailEnv {
+ map: Vec<Option<SlotInfo>>,
+}
+
+// `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate" within
+// a Block.
+//
+// A ProcessingStackElem conceptually has the lifetime of exactly one Block: once the current Block is
+// completed, the ProcessingStackElem will be abandoned. In practice the top level state,
+// RedundantReloadRemover, caches them, so as to avoid heap turnover.
+//
+// Note that ProcessingStackElem must contain a CursorPosition. The CursorPosition, which
+// indicates where we are in the current Block, cannot be implicitly maintained by looping over all
+// the instructions in a Block in turn, because we may choose to suspend processing the current Block
+// at a side exit, continue by processing the subtree reached via the side exit, and only later
+// resume the current Block.
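+//
+// For example (hypothetical block names): while working through B7 we may encounter a branch to
+// B8, where B8 is in the same tree and has not yet been visited. Processing of B7 is then
+// suspended (its CursorPosition is parked in its ProcessingStackElem), a new ProcessingStackElem
+// is pushed for B8, and only once B8 and anything pushed while processing it have been completed
+// is that element popped, leaving B7 on top again to be resumed from the parked position.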
+
+struct ProcessingStackElem {
+ /// Indicates the AvailEnv at the current point in the Block.
+ avail_env: AvailEnv,
+
+ /// Shows where we currently are inside the Block.
+ cursor: CursorPosition,
+
+ /// Indicates the register diversions that are active at the current point in the Block.
+ diversions: RegDiversions,
+}
+
+// =============================================================================================
+// The top level data structure
+
+// `RedundantReloadRemover` contains data structures for the two passes: discovery of tree shaped
+// regions, and processing of them. These are allocated once and stay alive for the entire
+// function, even though they are cleared out for each new tree shaped region. It also caches
+// `num_regunits` and `num_preds_per_block`, which are computed at the start of each function and
+// then remain constant.
+
+/// The redundant reload remover's state.
+pub struct RedundantReloadRemover {
+ /// The total number of RegUnits available on this architecture. This is unknown when the
+ /// RedundantReloadRemover is created. It becomes known at the beginning of processing of a
+ /// function.
+ num_regunits: Option<u16>,
+
+ /// This stores, for each Block, a characterisation of the number of predecessors it has.
+ num_preds_per_block: PrimaryMap<Block, ZeroOneOrMany>,
+
+ /// The stack used for the first phase (discovery). There is one element on the discovery
+ /// stack for each currently unexplored Block in the tree being searched.
+ discovery_stack: Vec<Block>,
+
+ /// The nodes in the discovered tree are inserted here.
+ nodes_in_tree: EntitySet<Block>,
+
+ /// The stack used during the second phase (transformation). There is one element on the
+ /// processing stack for each currently-open node in the tree being transformed.
+ processing_stack: Vec<ProcessingStackElem>,
+
+ /// Used in the second phase to avoid visiting nodes more than once.
+ nodes_already_visited: EntitySet<Block>,
+}
+
+// =============================================================================================
+// Miscellaneous small helper functions
+
+// Is this a kind of stack slot that is safe to track in AvailEnv? This is probably overly
+// conservative, but tracking only the SpillSlot and IncomingArg kinds catches almost all
+// available redundancy in practice.
+fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
+ match kind {
+ StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
+ _ => false,
+ }
+}
+
+// Find out if the range `[offset, offset + size)` overlaps with the range described by `si`.
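+// For example (hypothetical values): with `si = SlotInfo { kind, offset: 0, size: 8 }`, the query
+// `(offset: 8, size: 4)` does not overlap, since 0 + 8 <= 8, whereas `(offset: 4, size: 8)` does.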
+fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
+ let a_offset = si.offset as i64;
+ let a_size = si.size as i64;
+ let b_offset = offset as i64;
+ let b_size = size as i64;
+ let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
+ !no_overlap
+}
+
+// Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and size
+// of the bank. This is so the caller can conveniently iterate over all RegUnits in the bank that
+// `reg` lives in.
+fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
+ if let Some(bank) = reginfo.bank_containing_regunit(reg) {
+ return (bank.first_unit, bank.units);
+ }
+ // We should never get here, since `reg` must come from *some* RegBank.
+ panic!("find_regclass_limits: reg not found");
+}
+
+// Returns the register that `v` is allocated to. Assumes that `v` actually resides in a
+// register.
+fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
+ match locations[v] {
+ ValueLoc::Reg(ru) => ru,
+ _ => panic!("reg_of_value: value isn't in a reg"),
+ }
+}
+
+// Returns the stack slot that `v` is allocated to. Assumes that `v` actually resides in a stack
+// slot.
+fn slot_of_value<'s>(
+ locations: &SecondaryMap<Value, ValueLoc>,
+ stack_slots: &'s StackSlots,
+ v: Value,
+) -> &'s StackSlotData {
+ match locations[v] {
+ ValueLoc::Stack(slot) => &stack_slots[slot],
+ _ => panic!("slot_of_value: value isn't in a stack slot"),
+ }
+}
+
+// =============================================================================================
+// Top level: discovery of tree shaped regions
+
+impl RedundantReloadRemover {
+ // A helper for `add_nodes_to_tree` below.
+ fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Block) {
+ for successor in cfg.succ_iter(node) {
+ self.discovery_stack.push(successor);
+ }
+ }
+
+ // Visit the tree of Blocks rooted at `starting_point` and add them to `self.nodes_in_tree`.
+ // `self.num_preds_per_block` guides the process, ensuring we don't leave the tree-ish region
+ // and indirectly ensuring that the process will terminate in the presence of cycles in the
+ // graph. `self.discovery_stack` holds the search state in this function.
+ fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Block) {
+ // One might well ask why this doesn't loop forever when it encounters cycles in the
+ // control flow graph. The reason is that any cycle in the graph that is reachable from
+ // anywhere outside the cycle -- in particular, that is reachable from the function's
+ // entry node -- must have at least one node that has two or more predecessors. So the
+ // logic below won't follow into it, because it regards any such node as the root of some
+ // other tree.
+ debug_assert!(self.discovery_stack.is_empty());
+ debug_assert!(self.nodes_in_tree.is_empty());
+
+ self.nodes_in_tree.insert(starting_point);
+ self.discovery_stack_push_successors_of(cfg, starting_point);
+
+ while let Some(node) = self.discovery_stack.pop() {
+ match self.num_preds_per_block[node] {
+ // We arrived at a node with multiple predecessors, so it's a new root. Ignore it.
+ ZeroOneOrMany::Many => {}
+ // This node has just one predecessor, so we should incorporate it in the tree and
+ // immediately transition into searching from it instead.
+ ZeroOneOrMany::One => {
+ self.nodes_in_tree.insert(node);
+ self.discovery_stack_push_successors_of(cfg, node);
+ }
+ // This is impossible: we arrived at this node via an edge, yet the node supposedly
+ // has no predecessors at all.
+ ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
+ }
+ }
+ }
+}
+
+// =============================================================================================
+// Operations relating to `AvailEnv`
+
+impl AvailEnv {
+ // Create a new one.
+ fn new(size: usize) -> Self {
+ let mut env = Self {
+ map: Vec::<Option<SlotInfo>>::new(),
+ };
+ env.map.resize(size, None);
+ env
+ }
+
+ // Debug only: checks (some of) the required AvailEnv invariants.
+ #[cfg(debug_assertions)]
+ fn check_invariants(&self) -> bool {
+ // Check that any overlapping entries overlap exactly. This is super lame (quadratic),
+ // but it's only used in debug builds.
+ for i in 0..self.map.len() {
+ if let Some(si) = self.map[i] {
+ for j in i + 1..self.map.len() {
+ if let Some(sj) = self.map[j] {
+ // "si and sj overlap, but not exactly"
+ if si.kind == sj.kind
+ && overlaps(&si, sj.offset, sj.size)
+ && !(si.offset == sj.offset && si.size == sj.size)
+ {
+ return false;
+ }
+ }
+ }
+ }
+ }
+ true
+ }
+
+ // Invalidates the binding associated with `reg`. Note that by construction of AvailEnv,
+ // `reg` can only be associated with one binding at once.
+ fn invalidate_by_reg(&mut self, reg: RegUnit) {
+ self.map[reg as usize] = None;
+ }
+
+ // Invalidates any binding that has any overlap with `(kind, offset, size)`.
+ fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
+ debug_assert!(is_slot_kind_tracked(kind));
+ for i in 0..self.map.len() {
+ if let Some(si) = &self.map[i] {
+ if si.kind == kind && overlaps(&si, offset, size) {
+ self.map[i] = None;
+ }
+ }
+ }
+ }
+
+ // Invalidates all bindings.
+ fn invalidate_all(&mut self) {
+ for i in 0..self.map.len() {
+ self.map[i] = None;
+ }
+ }
+
+ // Updates AvailEnv to track the effect of a register-to-register copy, such as a `regmove`.
+ fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
+ self.map[dst as usize] = self.map[src as usize];
+ }
+
+ // Does `env` have the exact binding characterised by `(reg, kind, offset, size)` ?
+ fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
+ debug_assert!(is_slot_kind_tracked(kind));
+ if let Some(si) = &self.map[reg as usize] {
+ return si.kind == kind && si.offset == offset && si.size == size;
+ }
+ // No such binding.
+ false
+ }
+
+ // Does `env` have a binding characterised by `(kind, offset, size)` but to a register, let's
+ // call it `other_reg`, that isn't `reg`? If so, return `other_reg`. Note that `other_reg`
+ // will have the same bank as `reg`. It is a checked error to call this function with a
+ // binding matching all four of `(reg, kind, offset, size)`.
+ fn has_inexact_binding(
+ &self,
+ reginfo: &RegInfo,
+ reg: RegUnit,
+ kind: StackSlotKind,
+ offset: i32,
+ size: u32,
+ ) -> Option<RegUnit> {
+ debug_assert!(is_slot_kind_tracked(kind));
+ // Find the range of RegUnit numbers for the bank that contains `reg`, and use that as our
+ // search space. This is so as to guarantee that any match is restricted to the same bank
+ // as `reg`.
+ let (first_unit, num_units) = find_bank_limits(reginfo, reg);
+ for other_reg in first_unit..first_unit + num_units {
+ if let Some(si) = &self.map[other_reg as usize] {
+ if si.kind == kind && si.offset == offset && si.size == size {
+ if other_reg == reg {
+ panic!("has_inexact_binding: binding *is* exact!");
+ }
+ return Some(other_reg);
+ }
+ }
+ }
+ // No such binding.
+ None
+ }
+
+ // Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
+ // binding associated with either `reg` or the `(kind, offset, size)` triple.
+ fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
+ debug_assert!(is_slot_kind_tracked(kind));
+ self.invalidate_by_offset(kind, offset, size);
+ self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
+ }
+}
+
+// Invalidates, in `avail_env`, any binding associated with a regunit that is written by `inst`.
+fn invalidate_regs_written_by_inst(
+ locations: &SecondaryMap<Value, ValueLoc>,
+ diversions: &RegDiversions,
+ dfg: &DataFlowGraph,
+ avail_env: &mut AvailEnv,
+ inst: Inst,
+) {
+ for v in dfg.inst_results(inst).iter() {
+ if let ValueLoc::Reg(ru) = locations[*v] {
+ // This must be true. It would be meaningless for an SSA value to be diverted before
+ // the point where it is defined.
+ debug_assert!(diversions.reg(*v, locations) == ru);
+ avail_env.invalidate_by_reg(ru);
+ }
+ }
+}
+
+// =============================================================================================
+// Processing of individual instructions
+
+impl RedundantReloadRemover {
+ // Process `inst`, possibly changing it into a different instruction, and possibly changing
+ // the AvailEnv at the top of `self.processing_stack`, and `func.dfg`.
+ fn visit_inst(
+ &mut self,
+ func: &mut Function,
+ reginfo: &RegInfo,
+ isa: &dyn TargetIsa,
+ inst: Inst,
+ ) {
+ // Get hold of the top-of-stack work item. This is the state that we will mutate during
+ // processing of this instruction.
+ debug_assert!(!self.processing_stack.is_empty());
+ let ProcessingStackElem {
+ avail_env,
+ diversions,
+ ..
+ } = self.processing_stack.last_mut().unwrap();
+
+ #[cfg(debug_assertions)]
+ debug_assert!(
+ avail_env.check_invariants(),
+ "visit_inst: env invariants not ok"
+ );
+
+ let dfg = &mut func.dfg;
+ let locations = &func.locations;
+ let stack_slots = &func.stack_slots;
+
+ // To avoid difficulties with the borrow checker, do this in two stages. First, examine
+ // the instruction to see if it can be deleted or modified, and park the relevant
+ // information in `transform`, and update `avail_env` as we go. Later, use `transform` to
+ // actually do the transformation if necessary.
+ enum Transform {
+ NoChange,
+ ChangeToNopFill(Value), // delete this insn entirely
+ ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
+ }
+ let mut transform = Transform::NoChange;
+
+ // In this match { .. } statement, either we must treat the instruction specially, or we
+ // must call `invalidate_regs_written_by_inst` on it.
+ match &dfg[inst] {
+ InstructionData::Unary {
+ opcode: Opcode::Spill,
+ arg: src_value,
+ } => {
+ // Extract: (src_reg, kind, offset, size)
+ // Invalidate: (kind, offset, size)
+ // Add new binding: {src_reg -> (kind, offset, size)}
+ // Don't forget that src_value might be diverted, so we have to look up its current register.
+ let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
+ let src_reg = diversions.reg(*src_value, locations);
+ let kind = slot.kind;
+ if is_slot_kind_tracked(kind) {
+ let offset = slot.offset.expect("visit_inst: spill with no offset");
+ let size = slot.size;
+ avail_env.bind(src_reg, kind, offset, size);
+ } else {
+ // We don't expect this insn to write any regs. But to be consistent with the
+ // rule above, do this anyway.
+ invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
+ }
+ }
+ InstructionData::Unary {
+ opcode: Opcode::Fill,
+ arg: src_value,
+ } => {
+ // Extract: (dst_reg, kind, offset, size)
+ // Invalidate: (kind, offset, size)
+ // Add new binding: {dst_reg -> (kind, offset, size)}
+ let slot = slot_of_value(locations, stack_slots, *src_value);
+ let dst_value = dfg.inst_results(inst)[0];
+ let dst_reg = reg_of_value(locations, dst_value);
+ // This must be true. It would be meaningless for an SSA value to be diverted
+ // before it was defined.
+ debug_assert!(dst_reg == diversions.reg(dst_value, locations));
+ let kind = slot.kind;
+ if is_slot_kind_tracked(kind) {
+ let offset = slot.offset.expect("visit_inst: fill with no offset");
+ let size = slot.size;
+ if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
+ // `dst_reg` already holds the value that this fill would load, thanks to an
+ // earlier spill or fill involving the same slot and register. So we'll schedule
+ // this instruction for deletion (below). No need to make any changes to `avail_env`.
+ transform = Transform::ChangeToNopFill(*src_value);
+ } else if let Some(other_reg) =
+ avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
+ {
+ // Some other register, `other_reg`, already holds the value of the required
+ // slot. So replace the fill with a copy from `other_reg` to `dst_reg`,
+ // and update `dst_reg`'s binding to make it the same as `other_reg`'s, so
+ // as to maximise the chances of future matches after this instruction.
+ debug_assert!(other_reg != dst_reg);
+ transform =
+ Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
+ avail_env.copy_reg(other_reg, dst_reg);
+ } else {
+ // This fill creates some new binding we don't know about. Update
+ // `avail_env` to track it.
+ avail_env.bind(dst_reg, kind, offset, size);
+ }
+ } else {
+ // Else it's "just another instruction that writes a reg", so we'd better
+ // treat it as such, just as we do below for instructions that we don't handle
+ // specially.
+ invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
+ }
+ }
+ InstructionData::RegMove { src, dst, .. } => {
+ // These happen relatively rarely, but just frequently enough that it's worth
+ // tracking the copy (at the machine level, it's really a copy) in `avail_env`.
+ avail_env.copy_reg(*src, *dst);
+ }
+ InstructionData::RegSpill { .. }
+ | InstructionData::RegFill { .. }
+ | InstructionData::Call { .. }
+ | InstructionData::CallIndirect { .. }
+ | InstructionData::StackLoad { .. }
+ | InstructionData::StackStore { .. }
+ | InstructionData::Unary {
+ opcode: Opcode::AdjustSpDown,
+ ..
+ }
+ | InstructionData::UnaryImm {
+ opcode: Opcode::AdjustSpUpImm,
+ ..
+ }
+ | InstructionData::UnaryImm {
+ opcode: Opcode::AdjustSpDownImm,
+ ..
+ } => {
+ // All of these change, or might change, the memory-register bindings tracked in
+ // `avail_env` in ways we don't track, or at least in ways we could track but for
+ // which the effort-to-benefit ratio seems too low to bother. So play it safe:
+ // forget everything we know.
+ //
+ // For Call/CallIndirect, we could do better when compiling for calling
+ // conventions that have callee-saved registers, since bindings for them would
+ // remain valid across the call.
+ avail_env.invalidate_all();
+ }
+ _ => {
+ // Invalidate: any `avail_env` entry associated with a reg written by `inst`.
+ invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
+ }
+ }
+
+ // Actually do the transformation.
+ match transform {
+ Transform::NoChange => {}
+ Transform::ChangeToNopFill(arg) => {
+ // Load is completely redundant. Convert it to a no-op.
+ dfg.replace(inst).fill_nop(arg);
+ let ok = func.update_encoding(inst, isa).is_ok();
+ debug_assert!(
+ ok,
+ "fill_nop encoding missing for this type: `{}`",
+ func.dfg.display_inst(inst, isa)
+ );
+ }
+ Transform::ChangeToCopyToSSA(ty, reg) => {
+ // We already have the relevant value in some other register. Convert the
+ // load into a reg-reg copy.
+ dfg.replace(inst).copy_to_ssa(ty, reg);
+ let ok = func.update_encoding(inst, isa).is_ok();
+ debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
+ }
+ }
+ }
+}
+
+// =============================================================================================
+// Top level: processing of tree shaped regions
+
+impl RedundantReloadRemover {
+ // Push a new ProcessingStackElem whose AvailEnv is a clone of the one at the current top of
+ // stack (or an empty AvailEnv if the stack is empty). It will be used to process exactly one
+ // Block. The diversions are created new, rather than cloned, to reflect the fact that
+ // diversions are local to each Block.
+ fn processing_stack_push(&mut self, cursor: CursorPosition) {
+ let avail_env = if let Some(stack_top) = self.processing_stack.last() {
+ stack_top.avail_env.clone()
+ } else {
+ AvailEnv::new(
+ self.num_regunits
+ .expect("processing_stack_push: num_regunits unknown!")
+ as usize,
+ )
+ };
+ self.processing_stack.push(ProcessingStackElem {
+ avail_env,
+ cursor,
+ diversions: RegDiversions::new(),
+ });
+ }
+
+ // This pushes the node `dst` onto the processing stack, and sets up the new
+ // ProcessingStackElem accordingly. But it does all that only if `dst` is part of the current
+ // tree *and* we haven't yet visited it.
+ fn processing_stack_maybe_push(&mut self, dst: Block) {
+ if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
+ if !self.processing_stack.is_empty() {
+ // If this isn't the outermost node in the tree (that is, the root), then it must
+ // have exactly one predecessor. Nodes with no predecessors are dead and not
+ // incorporated in any tree. Nodes with two or more predecessors are the root of
+ // some other tree, and visiting them as if they were part of the current tree
+ // would be a serious error.
+ debug_assert!(self.num_preds_per_block[dst] == ZeroOneOrMany::One);
+ }
+ self.processing_stack_push(CursorPosition::Before(dst));
+ self.nodes_already_visited.insert(dst);
+ }
+ }
+
+ // Perform redundant-reload removal on the tree shaped region of graph defined by `root` and
+ // `self.nodes_in_tree`. The following state is modified: `self.processing_stack`,
+ // `self.nodes_already_visited`, and `func.dfg`.
+ fn process_tree(
+ &mut self,
+ func: &mut Function,
+ reginfo: &RegInfo,
+ isa: &dyn TargetIsa,
+ root: Block,
+ ) {
+ debug_assert!(self.nodes_in_tree.contains(root));
+ debug_assert!(self.processing_stack.is_empty());
+ debug_assert!(self.nodes_already_visited.is_empty());
+
+ // Create the initial work item
+ self.processing_stack_maybe_push(root);
+
+ while !self.processing_stack.is_empty() {
+ // It seems somewhat ridiculous to construct a whole new FuncCursor just so we can do
+ // next_inst() on it once, and then copy the resulting position back out. But use of
+ // a function-global FuncCursor, or of the EncCursor in struct Context, leads to
+ // borrow checker problems, as does including FuncCursor directly in
+ // ProcessingStackElem. In any case this is not as bad as it looks, since profiling
+ // shows that the build-insert-step-extract work is reduced to just 8 machine
+ // instructions in an optimised x86_64 build, presumably because rustc can inline and
+ // then optimise out almost all the work.
+ let tos = self.processing_stack.len() - 1;
+ let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
+ let maybe_inst = pos.next_inst();
+ self.processing_stack[tos].cursor = pos.position();
+
+ if let Some(inst) = maybe_inst {
+ // Deal with this insn, possibly changing it, possibly updating the top item of
+ // `self.processing_stack`.
+ self.visit_inst(func, reginfo, isa, inst);
+
+ // Update diversions after the insn.
+ self.processing_stack[tos].diversions.apply(&func.dfg[inst]);
+
+ // If the insn can branch outside this Block, push work items on the stack for all
+ // target Blocks that are part of the same tree and that we haven't yet visited.
+ // The next iteration of this instruction-processing loop will immediately start
+ // work on the most recently pushed Block, and will eventually continue in this Block
+ // when those new items have been removed from the stack.
+ match func.dfg.analyze_branch(inst) {
+ BranchInfo::NotABranch => (),
+ BranchInfo::SingleDest(dst, _) => {
+ self.processing_stack_maybe_push(dst);
+ }
+ BranchInfo::Table(jt, default) => {
+ func.jump_tables[jt]
+ .iter()
+ .for_each(|dst| self.processing_stack_maybe_push(*dst));
+ if let Some(dst) = default {
+ self.processing_stack_maybe_push(dst);
+ }
+ }
+ }
+ } else {
+ // We've come to the end of the current work-item (Block). We'll already have
+ // processed the fallthrough/continuation/whatever for it using the logic above.
+ // Pop it off the stack and resume work on its parent.
+ self.processing_stack.pop();
+ }
+ }
+ }
+}
+
+// =============================================================================================
+// Top level: perform redundant fill removal for a complete function
+
+impl RedundantReloadRemover {
+ /// Create a new remover state.
+ pub fn new() -> Self {
+ Self {
+ num_regunits: None,
+ num_preds_per_block: PrimaryMap::<Block, ZeroOneOrMany>::with_capacity(8),
+ discovery_stack: Vec::<Block>::with_capacity(16),
+ nodes_in_tree: EntitySet::<Block>::new(),
+ processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
+ nodes_already_visited: EntitySet::<Block>::new(),
+ }
+ }
+
+ /// Clear the state of the remover.
+ pub fn clear(&mut self) {
+ self.clear_for_new_function();
+ }
+
+ fn clear_for_new_function(&mut self) {
+ self.num_preds_per_block.clear();
+ self.clear_for_new_tree();
+ }
+
+ fn clear_for_new_tree(&mut self) {
+ self.discovery_stack.clear();
+ self.nodes_in_tree.clear();
+ self.processing_stack.clear();
+ self.nodes_already_visited.clear();
+ }
+
+ #[inline(never)]
+ fn do_redundant_fill_removal_on_function(
+ &mut self,
+ func: &mut Function,
+ reginfo: &RegInfo,
+ isa: &dyn TargetIsa,
+ cfg: &ControlFlowGraph,
+ ) {
+ // Fail in an obvious way if there are more than (2^32)-1 Blocks in this function.
+ let num_blocks: u32 = func.dfg.num_blocks().try_into().unwrap();
+
+ // Clear out per-function state.
+ self.clear_for_new_function();
+
+ // Create a PrimaryMap that summarises the number of predecessors for each block, as 0, 1
+ // or "many", and that also claims the entry block as having "many" predecessors.
+ self.num_preds_per_block.clear();
+ self.num_preds_per_block.reserve(num_blocks as usize);
+
+ for i in 0..num_blocks {
+ let mut pi = cfg.pred_iter(Block::from_u32(i));
+ let mut n_pi = ZeroOneOrMany::Zero;
+ if pi.next().is_some() {
+ n_pi = ZeroOneOrMany::One;
+ if pi.next().is_some() {
+ n_pi = ZeroOneOrMany::Many;
+ // We don't care if there are more than two preds, so stop counting now.
+ }
+ }
+ self.num_preds_per_block.push(n_pi);
+ }
+ debug_assert!(self.num_preds_per_block.len() == num_blocks as usize);
+
+ // The entry block must be the root of some tree, so set up the state to reflect that.
+ let entry_block = func
+ .layout
+ .entry_block()
+ .expect("do_redundant_fill_removal_on_function: entry block unknown");
+ debug_assert!(self.num_preds_per_block[entry_block] == ZeroOneOrMany::Zero);
+ self.num_preds_per_block[entry_block] = ZeroOneOrMany::Many;
+
+ // Now build and process trees.
+ for root_ix in 0..self.num_preds_per_block.len() {
+ let root = Block::from_u32(root_ix as u32);
+
+ // Build a tree for each node that has two or more preds, and ignore all other nodes.
+ if self.num_preds_per_block[root] != ZeroOneOrMany::Many {
+ continue;
+ }
+
+ // Clear out per-tree state.
+ self.clear_for_new_tree();
+
+ // Discovery phase: build the tree rooted at `root`, recording its nodes in `self.nodes_in_tree`.
+ self.add_nodes_to_tree(cfg, root);
+ debug_assert!(self.nodes_in_tree.cardinality() > 0);
+ debug_assert!(self.num_preds_per_block[root] == ZeroOneOrMany::Many);
+
+ // Processing phase: do redundant-reload-removal.
+ self.process_tree(func, reginfo, isa, root);
+ debug_assert!(
+ self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
+ );
+ }
+ }
+}
+
+// =============================================================================================
+// Top level: the external interface
+
+struct Context<'a> {
+ // Current instruction as well as reference to function and ISA.
+ cur: EncCursor<'a>,
+
+ // Cached ISA information. We save it here to avoid frequent virtual function calls on the
+ // `TargetIsa` trait object.
+ reginfo: RegInfo,
+
+ // References to contextual data structures we need.
+ cfg: &'a ControlFlowGraph,
+
+ // The running state.
+ state: &'a mut RedundantReloadRemover,
+}
+
+impl RedundantReloadRemover {
+ /// Run the remover.
+ pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
+ let ctx = Context {
+ cur: EncCursor::new(func, isa),
+ reginfo: isa.register_info(),
+ cfg,
+ state: self,
+ };
+ let mut total_regunits = 0;
+ for rb in isa.register_info().banks {
+ total_regunits += rb.units;
+ }
+ ctx.state.num_regunits = Some(total_regunits);
+ ctx.state.do_redundant_fill_removal_on_function(
+ ctx.cur.func,
+ &ctx.reginfo,
+ ctx.cur.isa,
+ &ctx.cfg,
+ );
+ }
+}