author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
commit    | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/regalloc/src
parent    | Initial commit. (diff)
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regalloc/src')
23 files changed, 18511 insertions, 0 deletions
diff --git a/third_party/rust/regalloc/src/analysis_control_flow.rs b/third_party/rust/regalloc/src/analysis_control_flow.rs new file mode 100644 index 0000000000..e28f630aa0 --- /dev/null +++ b/third_party/rust/regalloc/src/analysis_control_flow.rs @@ -0,0 +1,742 @@ +//! Performs control flow analysis. + +use log::{debug, info}; +use std::cmp::Ordering; + +use crate::analysis_main::AnalysisError; +use crate::data_structures::{BlockIx, InstIx, Range, Set, TypedIxVec}; +use crate::sparse_set::{SparseSetU, SparseSetUIter}; +use crate::Function; + +use smallvec::SmallVec; + +//============================================================================= +// Debugging config. Set all these to `false` for normal operation. + +// DEBUGGING: set to true to cross-check the dominator-tree computation. +const CROSSCHECK_DOMS: bool = false; + +//===========================================================================// +// // +// CONTROL FLOW ANALYSIS // +// // +//===========================================================================// + +//============================================================================= +// Control flow analysis: create the InstIx-to-BlockIx mapping + +// This is trivial, but it's sometimes useful to have. +// Note: confusingly, the `Range` here is data_structures::Range, not +// std::ops::Range. +pub struct InstIxToBlockIxMap { + vek: TypedIxVec<BlockIx, Range<InstIx>>, +} + +impl InstIxToBlockIxMap { + #[inline(never)] + pub fn new<F: Function>(func: &F) -> Self { + let mut vek = TypedIxVec::<BlockIx, Range<InstIx>>::new(); + for bix in func.blocks() { + let r: Range<InstIx> = func.block_insns(bix); + assert!(r.start() <= r.last_plus1()); + vek.push(r); + } + + fn cmp_ranges(r1: &Range<InstIx>, r2: &Range<InstIx>) -> Ordering { + if r1.last_plus1() <= r2.first() { + return Ordering::Less; + } + if r2.last_plus1() <= r1.first() { + return Ordering::Greater; + } + if r1.first() == r2.first() && r1.last_plus1() == r2.last_plus1() { + return Ordering::Equal; + } + // If this happens, F::block_insns is telling us something that isn't right. + panic!("InstIxToBlockIxMap::cmp_ranges: overlapping InstIx ranges!"); + } + + vek.sort_unstable_by(|r1, r2| cmp_ranges(r1, r2)); + // Sanity check: ascending, non-overlapping, no gaps. We need this in + // order to ensure that binary searching in `map` works properly. 
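// Illustrative note (editorial; not part of the vendored source): after the
// sort, `vek` holds one contiguous, ascending InstIx range per block. For
// example, with three blocks of sizes 3, 2 and 4 the table is
//
//     [i0 .. i3)   [i3 .. i5)   [i5 .. i9)
//
// and `map(i4)` binary-searches it and lands on the middle entry, because
// that entry's start() <= i4 < last_plus1(). The loop below verifies the
// "ascending, no gaps" property that makes this search well-defined.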
+ for i in 1..vek.len() { + let r_m1 = &vek[BlockIx::new(i - 1)]; + let r_m0 = &vek[BlockIx::new(i - 0)]; + assert!(r_m1.last_plus1() == r_m0.first()); + } + + Self { vek } + } + + #[inline(never)] + pub fn map(&self, iix: InstIx) -> BlockIx { + if self.vek.len() > 0 { + let mut lo = 0isize; + let mut hi = self.vek.len() as isize - 1; + loop { + if lo > hi { + break; + } + let mid = (lo + hi) / 2; + let midv = &self.vek[BlockIx::new(mid as u32)]; + if iix < midv.start() { + hi = mid - 1; + continue; + } + if iix >= midv.last_plus1() { + lo = mid + 1; + continue; + } + assert!(midv.start() <= iix && iix < midv.last_plus1()); + return BlockIx::new(mid as u32); + } + } + panic!("InstIxToBlockIxMap::map: can't map {:?}", iix); + } +} + +//============================================================================= +// Control flow analysis: calculation of block successor and predecessor maps + +// Returned TypedIxVecs contain one element per block +#[inline(never)] +fn calc_preds_and_succs<F: Function>( + func: &F, + num_blocks: u32, +) -> ( + TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, +) { + info!(" calc_preds_and_succs: begin"); + + assert!(func.blocks().len() == num_blocks as usize); + + // First calculate the succ map, since we can do that directly from the + // Func. + // + // Func::finish() ensures that all blocks are non-empty, and that only the + // last instruction is a control flow transfer. Hence the following won't + // miss any edges. + let mut succ_map = TypedIxVec::<BlockIx, SparseSetU<[BlockIx; 4]>>::new(); + for b in func.blocks() { + let mut bix_set = SparseSetU::<[BlockIx; 4]>::empty(); + for bix in func.block_succs(b).iter() { + bix_set.insert(*bix); + } + succ_map.push(bix_set); + } + + // Now invert the mapping + let mut pred_map = TypedIxVec::<BlockIx, SparseSetU<[BlockIx; 4]>>::new(); + pred_map.resize(num_blocks, SparseSetU::<[BlockIx; 4]>::empty()); + for (src, dst_set) in (0..).zip(succ_map.iter()) { + for dst in dst_set.iter() { + pred_map[*dst].insert(BlockIx::new(src)); + } + } + + // Stay sane .. + assert!(pred_map.len() == num_blocks); + assert!(succ_map.len() == num_blocks); + + let mut n = 0; + debug!(""); + for (preds, succs) in pred_map.iter().zip(succ_map.iter()) { + debug!( + "{:<3?} preds {:<16?} succs {:?}", + BlockIx::new(n), + preds, + succs + ); + n += 1; + } + + info!(" calc_preds_and_succs: end"); + (pred_map, succ_map) +} + +//============================================================================= +// Control flow analysis: calculation of block preorder and postorder sequences + +// Returned Vecs contain one element per block. `None` is returned if the +// sequences do not contain `num_blocks` elements, in which case the input +// contains blocks not reachable from the entry point, and is invalid. +#[inline(never)] +fn calc_preord_and_postord<F: Function>( + func: &F, + num_blocks: u32, + succ_map: &TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, +) -> Option<(Vec<BlockIx>, Vec<BlockIx>)> { + info!(" calc_preord_and_postord: begin"); + + let mut pre_ord = Vec::<BlockIx>::new(); + let mut post_ord = Vec::<BlockIx>::new(); + + let mut visited = TypedIxVec::<BlockIx, bool>::new(); + visited.resize(num_blocks, false); + + // Set up initial state: entry block on the stack, marked as visited, and placed at the + // start of the pre-ord sequence. 
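// Worked example (illustrative only, not from the upstream file): for a
// diamond-shaped CFG
//
//        b0
//       /  \
//     b1    b2
//       \  /
//        b3
//
// one possible outcome is pre_ord = [b0, b1, b3, b2] and
// post_ord = [b3, b1, b2, b0]. A block enters `pre_ord` the first time it is
// pushed onto the DFS stack, and enters `post_ord` only once all of its
// successors have been fully explored and it is popped.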
+ let mut stack = SmallVec::<[(BlockIx, SparseSetUIter<[BlockIx; 4]>); 64]>::new(); + let bix_entry = func.entry_block(); + visited[bix_entry] = true; + pre_ord.push(bix_entry); + stack.push((bix_entry, succ_map[bix_entry].iter())); + + 'outer: while let Some((bix, bix_succ_iter)) = stack.last_mut() { + // Consider the block on the top of the stack. Does it have any successors we + // haven't yet visited? + while let Some(bix_next_succ) = bix_succ_iter.next() { + if !visited[*bix_next_succ] { + // Yes. Push just one of them on the stack, along with a newly initialised + // iterator for it, and continue by considering the new stack top. Because + // blocks are only ever pushed onto the stack once, we must also add the + // block to the pre-ord sequence at this point. + visited[*bix_next_succ] = true; + pre_ord.push(*bix_next_succ); + stack.push((*bix_next_succ, succ_map[*bix_next_succ].iter())); + continue 'outer; + } + } + // No. This is the last time we'll ever hear of it. So add it to the post-ord + // sequence, remove the now-defunct stack-top item, and move on. + post_ord.push(*bix); + stack.pop(); + } + + assert!(pre_ord.len() == post_ord.len()); + assert!(pre_ord.len() <= num_blocks as usize); + if pre_ord.len() < num_blocks as usize { + info!( + " calc_preord_and_postord: invalid: {} blocks, {} reachable", + num_blocks, + pre_ord.len() + ); + return None; + } + + assert!(pre_ord.len() == num_blocks as usize); + assert!(post_ord.len() == num_blocks as usize); + #[cfg(debug_assertions)] + { + let mut pre_ord_sorted: Vec<BlockIx> = pre_ord.clone(); + let mut post_ord_sorted: Vec<BlockIx> = post_ord.clone(); + pre_ord_sorted.sort_by(|bix1, bix2| bix1.get().partial_cmp(&bix2.get()).unwrap()); + post_ord_sorted.sort_by(|bix1, bix2| bix1.get().partial_cmp(&bix2.get()).unwrap()); + let expected: Vec<BlockIx> = (0..num_blocks).map(|u| BlockIx::new(u)).collect(); + debug_assert!(pre_ord_sorted == expected); + debug_assert!(post_ord_sorted == expected); + } + + info!(" calc_preord_and_postord: end. {} blocks", num_blocks); + Some((pre_ord, post_ord)) +} + +//============================================================================= +// Computation of per-block dominator sets. Note, this is slow, and will be +// removed at some point. + +// Calculate the dominance relationship, given `pred_map` and a start node +// `start`. The resulting vector maps each block to the set of blocks that +// dominate it. This algorithm is from Fig 7.14 of Muchnick 1997. The +// algorithm is described as simple but not as performant as some others. +#[inline(never)] +fn calc_dom_sets_slow( + num_blocks: u32, + pred_map: &TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + post_ord: &Vec<BlockIx>, + start: BlockIx, +) -> TypedIxVec<BlockIx, Set<BlockIx>> { + info!(" calc_dom_sets_slow: begin"); + + let mut dom_map = TypedIxVec::<BlockIx, Set<BlockIx>>::new(); + + // FIXME find better names for n/d/t sets. 
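// Editorial aside (not part of the upstream code): the fixed point computed
// below is the textbook dominance dataflow system
//
//     dom(root) = { root }
//     dom(n)    = { n } union ( intersection of dom(p) over p in preds(n) )
//
// starting every non-root block at "all blocks" and shrinking until nothing
// changes. In the code, `n_set` is the universe of blocks, `t_set` holds the
// running intersection over the predecessors, and `d_set` is the candidate
// new value of dom(n).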
+ { + let root: BlockIx = start; + let n_set: Set<BlockIx> = + Set::from_vec((0..num_blocks).map(|bix| BlockIx::new(bix)).collect()); + let mut d_set: Set<BlockIx>; + let mut t_set: Set<BlockIx>; + + dom_map.resize(num_blocks, Set::<BlockIx>::empty()); + dom_map[root] = Set::unit(root); + for block_i in 0..num_blocks { + let block_ix = BlockIx::new(block_i); + if block_ix != root { + dom_map[block_ix] = n_set.clone(); + } + } + + let mut num_iter = 0; + loop { + num_iter += 1; + info!(" calc_dom_sets_slow: outer loop {}", num_iter); + let mut change = false; + for i in 0..num_blocks { + // block_ix travels in "reverse preorder" + let block_ix = post_ord[(num_blocks - 1 - i) as usize]; + if block_ix == root { + continue; + } + t_set = n_set.clone(); + for pred_ix in pred_map[block_ix].iter() { + t_set.intersect(&dom_map[*pred_ix]); + } + d_set = t_set.clone(); + d_set.insert(block_ix); + if !d_set.equals(&dom_map[block_ix]) { + change = true; + dom_map[block_ix] = d_set; + } + } + if !change { + break; + } + } + } + + debug!(""); + let mut block_ix = 0; + for dom_set in dom_map.iter() { + debug!("{:<3?} dom_set {:<16?}", BlockIx::new(block_ix), dom_set); + block_ix += 1; + } + info!(" calc_dom_sets_slow: end"); + dom_map +} + +//============================================================================= +// Computation of per-block dominator sets by first computing trees. +// +// This is an implementation of the algorithm described in +// +// A Simple, Fast Dominance Algorithm +// Keith D. Cooper, Timothy J. Harvey, and Ken Kennedy +// Department of Computer Science, Rice University, Houston, Texas, USA +// TR-06-33870 +// https://www.cs.rice.edu/~keith/EMBED/dom.pdf +// +// which appears to be the de-facto standard scheme for computing dominance +// quickly nowadays. + +// Unfortunately it seems like local consts are not allowed in Rust. +const DT_INVALID_POSTORD: u32 = 0xFFFF_FFFF; +const DT_INVALID_BLOCKIX: BlockIx = BlockIx::BlockIx(0xFFFF_FFFF); + +// Helper +fn dt_merge_sets( + idom: &TypedIxVec<BlockIx, BlockIx>, + bix2rpostord: &TypedIxVec<BlockIx, u32>, + mut node1: BlockIx, + mut node2: BlockIx, +) -> BlockIx { + while node1 != node2 { + if node1 == DT_INVALID_BLOCKIX || node2 == DT_INVALID_BLOCKIX { + return DT_INVALID_BLOCKIX; + } + let rpo1 = bix2rpostord[node1]; + let rpo2 = bix2rpostord[node2]; + if rpo1 > rpo2 { + node1 = idom[node1]; + } else if rpo2 > rpo1 { + node2 = idom[node2]; + } + } + assert!(node1 == node2); + node1 +} + +#[inline(never)] +fn calc_dom_tree( + num_blocks: u32, + pred_map: &TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + post_ord: &Vec<BlockIx>, + start: BlockIx, +) -> TypedIxVec<BlockIx, BlockIx> { + info!(" calc_dom_tree: begin"); + + // We use 2^32-1 as a marker for an invalid BlockIx or postorder number. + // Hence we need this: + assert!(num_blocks < DT_INVALID_POSTORD); + + // We have post_ord, which is the postorder sequence. + + // Compute bix2rpostord, which maps a BlockIx to its reverse postorder + // number. And rpostord2bix, which maps a reverse postorder number to its + // BlockIx. 
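// Illustrative note (editorial): continuing the diamond example above, if
// post_ord = [b3, b1, b2, b0] then the reverse-postorder numbering is
// b0 -> 0, b2 -> 1, b1 -> 2, b3 -> 3, giving bix2rpostord = [0, 2, 1, 3]
// (indexed by BlockIx) and rpostord2bix = [b0, b2, b1, b3]. The
// Cooper/Harvey/Kennedy iteration below visits blocks in this order; every
// reachable non-entry block has at least one predecessor with a smaller
// reverse-postorder number (its DFS tree parent), so `parent` can always be
// seeded on the first pass.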
+ let mut bix2rpostord = TypedIxVec::<BlockIx, u32>::new(); + let mut rpostord2bix = Vec::<BlockIx>::new(); + bix2rpostord.resize(num_blocks, DT_INVALID_POSTORD); + rpostord2bix.resize(num_blocks as usize, DT_INVALID_BLOCKIX); + for n in 0..num_blocks { + // bix visits the blocks in reverse postorder + let bix = post_ord[(num_blocks - 1 - n) as usize]; + // Hence: + bix2rpostord[bix] = n; + // and + rpostord2bix[n as usize] = bix; + } + for n in 0..num_blocks { + debug_assert!(bix2rpostord[BlockIx::new(n)] < num_blocks); + } + + let mut idom = TypedIxVec::<BlockIx, BlockIx>::new(); + idom.resize(num_blocks, DT_INVALID_BLOCKIX); + + // The start node must have itself as a parent. + idom[start] = start; + + for i in 0..num_blocks { + let block_ix = BlockIx::new(i); + let preds_of_i = &pred_map[block_ix]; + // All nodes must be reachable from the root. That means that all nodes + // that aren't `start` must have at least one predecessor. However, we + // can't assert the inverse case -- that the start node has no + // predecessors -- because the start node might be a self-loop, in which + // case it will have itself as a pred. See tests/domtree_fuzz1.rat. + if block_ix != start { + assert!(!preds_of_i.is_empty()); + } + } + + let mut changed = true; + while changed { + changed = false; + for n in 0..num_blocks { + // Consider blocks in reverse postorder. + let node = rpostord2bix[n as usize]; + assert!(node != DT_INVALID_BLOCKIX); + let node_preds = &pred_map[node]; + let rponum = bix2rpostord[node]; + + let mut parent = DT_INVALID_BLOCKIX; + if node_preds.is_empty() { + // No preds, `parent` remains invalid. + } else { + for pred in node_preds.iter() { + let pred_rpo = bix2rpostord[*pred]; + if pred_rpo < rponum { + parent = *pred; + break; + } + } + } + + if parent != DT_INVALID_BLOCKIX { + for pred in node_preds.iter() { + if *pred == parent { + continue; + } + if idom[*pred] == DT_INVALID_BLOCKIX { + continue; + } + parent = dt_merge_sets(&idom, &bix2rpostord, parent, *pred); + } + } + + if parent != DT_INVALID_BLOCKIX && parent != idom[node] { + idom[node] = parent; + changed = true; + } + } + } + + // Check what we can. The start node should be its own parent. All other + // nodes should not be their own parent, since we are assured that there are + // no dead blocks in the graph, and hence that there is only one dominator + // tree, that covers the whole graph. + assert!(idom[start] == start); + for i in 0..num_blocks { + let block_ix = BlockIx::new(i); + // All "parent pointers" are valid. + assert!(idom[block_ix] != DT_INVALID_BLOCKIX); + // The only node whose parent pointer points to itself is the start node. + assert!((idom[block_ix] == block_ix) == (block_ix == start)); + } + + if CROSSCHECK_DOMS { + // Crosscheck the dom tree, by computing dom sets using the simple + // iterative algorithm. Then, for each block, construct the dominator set + // by walking up the tree to the root, and check that it's the same as + // what the simple algorithm produced. 
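// Clarifying note (editorial): for any block b, the set of blocks dominating
// b is exactly b itself plus every block on its idom chain up to the entry,
// i.e. { b, idom[b], idom[idom[b]], ..., start }. The loop below rebuilds
// each dominator set from the tree in precisely that way and compares it
// against the sets produced by `calc_dom_sets_slow`.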
+ + info!(" calc_dom_tree crosscheck: begin"); + let slow_sets = calc_dom_sets_slow(num_blocks, pred_map, post_ord, start); + assert!(slow_sets.len() == idom.len()); + + for i in 0..num_blocks { + let mut block_ix = BlockIx::new(i); + let mut set = Set::<BlockIx>::empty(); + loop { + set.insert(block_ix); + let other_block_ix = idom[block_ix]; + if other_block_ix == block_ix { + break; + } + block_ix = other_block_ix; + } + assert!(set.to_vec() == slow_sets[BlockIx::new(i)].to_vec()); + } + info!(" calc_dom_tree crosscheck: end"); + } + + info!(" calc_dom_tree: end"); + idom +} + +//============================================================================= +// Computation of per-block loop-depths + +#[inline(never)] +fn calc_loop_depths( + num_blocks: u32, + pred_map: &TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + succ_map: &TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + post_ord: &Vec<BlockIx>, + start: BlockIx, +) -> TypedIxVec<BlockIx, u32> { + info!(" calc_loop_depths: begin"); + let idom = calc_dom_tree(num_blocks, pred_map, post_ord, start); + + // Find the loops. First, find the "loop header nodes", and from those, + // derive the loops. + // + // loop_set headers: + // A "back edge" m->n is some edge m->n where n dominates m. 'n' is + // the loop header node. + // + // `back_edges` is a set rather than a vector so as to avoid complications + // that might later arise if the same loop is enumerated more than once. + // + // Iterate over all edges (m->n) + let mut back_edges = Set::<(BlockIx, BlockIx)>::empty(); + for block_m_ix in BlockIx::new(0).dotdot(BlockIx::new(num_blocks)) { + for block_n_ix in succ_map[block_m_ix].iter() { + // Figure out if N dominates M. Do this by walking the dom tree from M + // back up to the root, and seeing if we encounter N on the way. + let mut n_dominates_m = false; + let mut block_ix = block_m_ix; + loop { + if block_ix == *block_n_ix { + n_dominates_m = true; + break; + } + let other_block_ix = idom[block_ix]; + if other_block_ix == block_ix { + break; + } + block_ix = other_block_ix; + } + if n_dominates_m { + //println!("QQQQ back edge {} -> {}", + // block_m_ix.show(), block_n_ix.show()); + back_edges.insert((block_m_ix, *block_n_ix)); + } + } + } + + // Now collect the sets of Blocks for each loop. For each back edge, + // collect up all the blocks in the natural loop defined by the back edge + // M->N. This algorithm is from Fig 7.21 of Muchnick 1997 (an excellent + // book). Order in `natural_loops` has no particular meaning. + let mut natural_loops = Vec::<Set<BlockIx>>::new(); + for (block_m_ix, block_n_ix) in back_edges.iter() { + let mut loop_set: Set<BlockIx>; + let mut stack: Vec<BlockIx>; + stack = Vec::<BlockIx>::new(); + loop_set = Set::<BlockIx>::two(*block_m_ix, *block_n_ix); + if block_m_ix != block_n_ix { + // The next line is missing in the Muchnick description. Without it the + // algorithm doesn't make any sense, though. + stack.push(*block_m_ix); + while let Some(block_p_ix) = stack.pop() { + for block_q_ix in pred_map[block_p_ix].iter() { + if !loop_set.contains(*block_q_ix) { + loop_set.insert(*block_q_ix); + stack.push(*block_q_ix); + } + } + } + } + natural_loops.push(loop_set); + } + + // Here is a kludgey way to compute the depth of each loop. First, order + // `natural_loops` by increasing size, so the largest loops are at the end. + // Then, repeatedly scan forwards through the vector, in "upper triangular + // matrix" style. For each scan, remember the "current loop". 
Initially + // the "current loop is the start point of each scan. If, during the scan, + // we encounter a loop which is a superset of the "current loop", change the + // "current loop" to this new loop, and increment a counter associated with + // the start point of the scan. The effect is that the counter records the + // nesting depth of the loop at the start of the scan. For this to be + // completely accurate, I _think_ this requires the property that loops are + // either disjoint or nested, but are in no case intersecting. + + natural_loops.sort_by(|left_block_set, right_block_set| { + left_block_set + .card() + .partial_cmp(&right_block_set.card()) + .unwrap() + }); + + let num_loops = natural_loops.len(); + let mut loop_depths = Vec::<u32>::new(); + loop_depths.resize(num_loops, 0); + + for i in 0..num_loops { + let mut curr = i; + let mut depth = 1; + for j in i + 1..num_loops { + debug_assert!(curr < j); + if natural_loops[curr].is_subset_of(&natural_loops[j]) { + depth += 1; + curr = j; + } + } + loop_depths[i] = depth; + } + + // Now that we have a depth for each loop, we can finally compute the depth + // for each block. + let mut depth_map = TypedIxVec::<BlockIx, u32>::new(); + depth_map.resize(num_blocks, 0); + for (loop_block_indexes, depth) in natural_loops.iter().zip(loop_depths) { + for loop_block_ix in loop_block_indexes.iter() { + if depth_map[*loop_block_ix] < depth { + depth_map[*loop_block_ix] = depth; + } + } + } + + debug_assert!(depth_map.len() == num_blocks); + + let mut n = 0; + debug!(""); + for (depth, idom_by) in depth_map.iter().zip(idom.iter()) { + debug!( + "{:<3?} depth {} idom {:?}", + BlockIx::new(n), + depth, + idom_by + ); + n += 1; + } + + info!(" calc_loop_depths: end"); + depth_map +} + +//============================================================================= +// Control-flow analysis top level: For a Func: predecessors, successors, +// preord and postord sequences, and loop depths. + +// CFGInfo contains CFG-related info computed from a Func. +pub struct CFGInfo { + // All these TypedIxVecs and plain Vecs contain one element per Block in the + // Func. + + // Predecessor and successor maps. + pub pred_map: TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + pub succ_map: TypedIxVec<BlockIx, SparseSetU<[BlockIx; 4]>>, + + // Pre- and post-order sequences. Iterating forwards through these + // vectors enumerates the blocks in preorder and postorder respectively. + pub pre_ord: Vec<BlockIx>, + pub _post_ord: Vec<BlockIx>, + + // This maps from a Block to the loop depth that it is at + pub depth_map: TypedIxVec<BlockIx, u32>, +} + +impl CFGInfo { + #[inline(never)] + pub fn create<F: Function>(func: &F) -> Result<Self, AnalysisError> { + info!(" CFGInfo::create: begin"); + + // Throw out insanely large inputs. They'll probably cause failure later + // on. + let num_blocks_usize = func.blocks().len(); + if num_blocks_usize >= 1 * 1024 * 1024 { + // 1 million blocks should be enough for anyone. That will soak up 20 + // index bits, leaving a "safety margin" of 12 bits for indices for + // induced structures (RangeFragIx, InstIx, VirtualRangeIx, RealRangeIx, + // etc). + return Err(AnalysisError::ImplementationLimitsExceeded); + } + + // Similarly, limit the number of instructions to 16 million. This allows + // 16 insns per block with the worst-case number of blocks. 
Because each + // insn typically generates somewhat less than one new value, this check + // also has the effect of limiting the number of virtual registers to + // roughly the same amount (16 million). + if func.insns().len() >= 16 * 1024 * 1024 { + return Err(AnalysisError::ImplementationLimitsExceeded); + } + + // Now we know we're safe to narrow it to u32. + let num_blocks = num_blocks_usize as u32; + + // === BEGIN compute successor and predecessor maps === + // + let (pred_map, succ_map) = calc_preds_and_succs(func, num_blocks); + assert!(pred_map.len() == num_blocks); + assert!(succ_map.len() == num_blocks); + // + // === END compute successor and predecessor maps === + + // === BEGIN check that critical edges have been split === + // + for (src, dst_set) in (0..).zip(succ_map.iter()) { + if dst_set.card() < 2 { + continue; + } + for dst in dst_set.iter() { + if pred_map[*dst].card() >= 2 { + return Err(AnalysisError::CriticalEdge { + from: BlockIx::new(src), + to: *dst, + }); + } + } + } + // + // === END check that critical edges have been split === + + // === BEGIN compute preord/postord sequences === + // + let mb_pre_ord_and_post_ord = calc_preord_and_postord(func, num_blocks, &succ_map); + if mb_pre_ord_and_post_ord.is_none() { + return Err(AnalysisError::UnreachableBlocks); + } + + let (pre_ord, post_ord) = mb_pre_ord_and_post_ord.unwrap(); + assert!(pre_ord.len() == num_blocks as usize); + assert!(post_ord.len() == num_blocks as usize); + // + // === END compute preord/postord sequences === + + // === BEGIN compute loop depth of all Blocks + // + let depth_map = calc_loop_depths( + num_blocks, + &pred_map, + &succ_map, + &post_ord, + func.entry_block(), + ); + debug_assert!(depth_map.len() == num_blocks); + // + // === END compute loop depth of all Blocks + + info!(" CFGInfo::create: end"); + Ok(CFGInfo { + pred_map, + succ_map, + pre_ord, + _post_ord: post_ord, + depth_map, + }) + } +} diff --git a/third_party/rust/regalloc/src/analysis_data_flow.rs b/third_party/rust/regalloc/src/analysis_data_flow.rs new file mode 100644 index 0000000000..9f3c544af7 --- /dev/null +++ b/third_party/rust/regalloc/src/analysis_data_flow.rs @@ -0,0 +1,1981 @@ +//! Performs dataflow and liveness analysis, including live range construction. + +use log::{debug, info, log_enabled, Level}; +use smallvec::{smallvec, SmallVec}; +use std::cmp::min; +use std::fmt; + +use crate::analysis_control_flow::CFGInfo; +use crate::data_structures::{ + BlockIx, InstIx, InstPoint, MoveInfo, MoveInfoElem, Point, Queue, RangeFrag, RangeFragIx, + RangeFragKind, RangeFragMetrics, RealRange, RealRangeIx, RealReg, RealRegUniverse, Reg, + RegClass, RegSets, RegToRangesMaps, RegUsageCollector, RegVecBounds, RegVecs, RegVecsAndBounds, + SortedRangeFragIxs, SortedRangeFrags, SpillCost, TypedIxVec, VirtualRange, VirtualRangeIx, + VirtualReg, +}; +use crate::sparse_set::SparseSet; +use crate::union_find::{ToFromU32, UnionFind}; +use crate::Function; + +//===========================================================================// +// // +// DATA FLOW AND LIVENESS ANALYSIS // +// // +//===========================================================================// + +//============================================================================= +// Data flow analysis: extraction and sanitization of reg-use information: low +// level interface + +// === The meaning of "sanitization" === +// +// The meaning of "sanitization" is as follows. 
Incoming virtual-registerised +// code may mention a mixture of virtual and real registers. Those real +// registers may include some which aren't available for the allocators to +// use. Rather than scatter ad-hoc logic all over the analysis phase and the +// allocators, we simply remove all non-available real registers from the +// per-instruction use/def/mod sets. The effect is that, after this point, we +// can operate on the assumption that any register we come across is either a +// virtual register or a real register available to the allocator. +// +// A real register is available to the allocator iff its index number is less +// than `RealRegUniverse.allocable`. +// +// Furthermore, it is not allowed that any incoming instruction mentions one +// of the per-class scratch registers listed in +// `RealRegUniverse.allocable_by_class[..].suggested_scratch` in either a use +// or mod role. Sanitisation will also detect this case and return an error. +// Mentions of a scratch register in a def role are tolerated; however, since +// no instruction may use or modify a scratch register, all such writes are +// dead.. +// +// In all of the above, "mentions" of a real register really means "uses, +// defines or modifications of said register". It doesn't matter whether the +// instruction explicitly mentions the register or whether it is an implicit +// mention (eg, %cl in x86 shift-by-a-variable-amount instructions). In other +// words, a "mention" is any use, def or mod as detected by the client's +// `get_regs` routine. + +// === Filtering of register groups in `RegVec`s === +// +// Filtering on a group is done by leaving the start point unchanged, sliding +// back retained registers to fill the holes from non-retained registers, and +// reducing the group length accordingly. The effect is to effectively "leak" +// some registers in the group, but that's not a problem. +// +// Extraction of register usages for the whole function is done by +// `get_sanitized_reg_uses_for_func`. For each instruction, their used, +// defined and modified register sets are acquired by calling the client's +// `get_regs` function. Then each of those three sets are cleaned up as +// follows: +// +// (1) duplicates are removed (after which they really are sets) +// +// (2) any registers in the modified set are removed from the used and defined +// sets. This enforces the invariant that +// `intersect(modified, union(used, defined))` is the empty set. Live range +// fragment computation (get_range_frags_for_block) depends on this property. +// +// (3) real registers unavailable to the allocator are removed, per the +// abovementioned sanitization rules. + +// ==== LOCAL FN ==== +// Given a register group in `regs[start, +len)`, remove duplicates from the +// group. The new group size is written to `*len`. +#[inline(never)] +fn remove_dups_from_group(regs: &mut Vec<Reg>, start: u32, len: &mut u8) { + // First sort the group, to facilitate de-duplication. + regs[start as usize..start as usize + *len as usize].sort_unstable(); + + // Now make a compacting pass over the group. 'rd' = read point in the + // group, 'wr' = write point in the group. + let mut wr = start as usize; + for rd in start as usize..start as usize + *len as usize { + let reg = regs[rd]; + if rd == start as usize || regs[rd - 1] != reg { + // It's not a duplicate. 
+ if wr != rd { + regs[wr] = reg; + } + wr += 1; + } + } + + let new_len_usize = wr - start as usize; + assert!(new_len_usize <= *len as usize); + // This narrowing is safe because the old `len` fitted in 8 bits. + *len = new_len_usize as u8; +} + +// ==== LOCAL FN ==== +// Remove from `group[group_start, +group_len)` any registers mentioned in +// `mods[mods_start, +mods_len)`, and update `*group_len` accordingly. +#[inline(never)] +fn remove_mods_from_group( + group: &mut Vec<Reg>, + group_start: u32, + group_len: &mut u8, + mods: &Vec<Reg>, + mods_start: u32, + mods_len: u8, +) { + let mut wr = group_start as usize; + for rd in group_start as usize..group_start as usize + *group_len as usize { + let reg = group[rd]; + // Only retain `reg` if it is not mentioned in `mods[mods_start, +mods_len)` + let mut retain = true; + for i in mods_start as usize..mods_start as usize + mods_len as usize { + if reg == mods[i] { + retain = false; + break; + } + } + if retain { + if wr != rd { + group[wr] = reg; + } + wr += 1; + } + } + let new_group_len_usize = wr - group_start as usize; + assert!(new_group_len_usize <= *group_len as usize); + // This narrowing is safe because the old `group_len` fitted in 8 bits. + *group_len = new_group_len_usize as u8; +} + +// ==== EXPORTED FN ==== +// For instruction `inst`, add the register uses to the ends of `reg_vecs`, +// and write bounds information into `bounds`. The register uses are raw +// (unsanitized) but they are guaranteed to be duplicate-free and also to have +// no `mod` mentions in the `use` or `def` groups. That is, cleanups (1) and +// (2) above have been done. +#[inline(never)] +pub fn add_raw_reg_vecs_for_insn<F: Function>( + inst: &F::Inst, + reg_vecs: &mut RegVecs, + bounds: &mut RegVecBounds, +) { + bounds.uses_start = reg_vecs.uses.len() as u32; + bounds.defs_start = reg_vecs.defs.len() as u32; + bounds.mods_start = reg_vecs.mods.len() as u32; + + let mut collector = RegUsageCollector::new(reg_vecs); + F::get_regs(inst, &mut collector); + + let uses_len = collector.reg_vecs.uses.len() as u32 - bounds.uses_start; + let defs_len = collector.reg_vecs.defs.len() as u32 - bounds.defs_start; + let mods_len = collector.reg_vecs.mods.len() as u32 - bounds.mods_start; + + // This assertion is important -- the cleanup logic also depends on it. + assert!((uses_len | defs_len | mods_len) < 256); + bounds.uses_len = uses_len as u8; + bounds.defs_len = defs_len as u8; + bounds.mods_len = mods_len as u8; + + // First, de-dup the three new groups. + if bounds.uses_len > 0 { + remove_dups_from_group( + &mut collector.reg_vecs.uses, + bounds.uses_start, + &mut bounds.uses_len, + ); + } + if bounds.defs_len > 0 { + remove_dups_from_group( + &mut collector.reg_vecs.defs, + bounds.defs_start, + &mut bounds.defs_len, + ); + } + if bounds.mods_len > 0 { + remove_dups_from_group( + &mut collector.reg_vecs.mods, + bounds.mods_start, + &mut bounds.mods_len, + ); + } + + // And finally, remove modified registers from the set of used and defined + // registers, so we don't have to make the client do so. 
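// Editorial note (not from the vendored source): after this step the three
// groups for the instruction satisfy intersect(mods, union(uses, defs)) ==
// empty, the invariant promised near the top of this file. For example, if
// the client reported uses = {v1, v2}, defs = {v2, v3}, mods = {v2}, the
// stored groups become uses = {v1}, defs = {v3}, mods = {v2}.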
+ if bounds.mods_len > 0 { + if bounds.uses_len > 0 { + remove_mods_from_group( + &mut collector.reg_vecs.uses, + bounds.uses_start, + &mut bounds.uses_len, + &collector.reg_vecs.mods, + bounds.mods_start, + bounds.mods_len, + ); + } + if bounds.defs_len > 0 { + remove_mods_from_group( + &mut collector.reg_vecs.defs, + bounds.defs_start, + &mut bounds.defs_len, + &collector.reg_vecs.mods, + bounds.mods_start, + bounds.mods_len, + ); + } + } +} + +// ==== LOCAL FN ==== +// This is the fundamental keep-or-don't-keep? predicate for sanitization. To +// do this exactly right we also need to know whether the register is +// mentioned in a def role (as opposed to a use or mod role). Note that this +// function can fail, and the error must be propagated. +#[inline(never)] +fn sanitize_should_retain_reg( + reg_universe: &RealRegUniverse, + reg: Reg, + reg_is_defd: bool, +) -> Result<bool, RealReg> { + // Retain all virtual regs. + if reg.is_virtual() { + return Ok(true); + } + + // So it's a RealReg. + let rreg_ix = reg.get_index(); + + // Check that this RealReg is mentioned in the universe. + if rreg_ix >= reg_universe.regs.len() { + // This is a serious error which should be investigated. It means the + // client gave us an instruction which mentions a RealReg which isn't + // listed in the RealRegUniverse it gave us. That's not allowed. + return Err(reg.as_real_reg().unwrap()); + } + + // Discard all real regs that aren't available to the allocator. + if rreg_ix >= reg_universe.allocable { + return Ok(false); + } + + // It isn't allowed for the client to give us an instruction which reads or + // modifies one of the scratch registers. It is however allowed to write a + // scratch register. + for reg_info in ®_universe.allocable_by_class { + if let Some(reg_info) = reg_info { + if let Some(scratch_idx) = ®_info.suggested_scratch { + let scratch_reg = reg_universe.regs[*scratch_idx].0; + if reg.to_real_reg() == scratch_reg { + if !reg_is_defd { + // This is an error (on the part of the client). + return Err(reg.as_real_reg().unwrap()); + } + } + } + } + } + + // `reg` is mentioned in the universe, is available to the allocator, and if + // it is one of the scratch regs, it is only written, not read or modified. + Ok(true) +} +// END helper fn + +// ==== LOCAL FN ==== +// Given a register group in `regs[start, +len)`, sanitize the group. To do +// this exactly right we also need to know whether the registers in the group +// are mentioned in def roles (as opposed to use or mod roles). Sanitisation +// can fail, in which case we must propagate the error. If it is successful, +// the new group size is written to `*len`. +#[inline(never)] +fn sanitize_group( + reg_universe: &RealRegUniverse, + regs: &mut Vec<Reg>, + start: u32, + len: &mut u8, + is_def_group: bool, +) -> Result<(), RealReg> { + // Make a single compacting pass over the group. 'rd' = read point in the + // group, 'wr' = write point in the group. + let mut wr = start as usize; + for rd in start as usize..start as usize + *len as usize { + let reg = regs[rd]; + // This call can fail: + if sanitize_should_retain_reg(reg_universe, reg, is_def_group)? { + if wr != rd { + regs[wr] = reg; + } + wr += 1; + } + } + + let new_len_usize = wr - start as usize; + assert!(new_len_usize <= *len as usize); + // This narrowing is safe because the old `len` fitted in 8 bits. 
+ *len = new_len_usize as u8; + Ok(()) +} + +// ==== LOCAL FN ==== +// For instruction `inst`, add the fully cleaned-up register uses to the ends +// of `reg_vecs`, and write bounds information into `bounds`. Cleanups (1) +// (2) and (3) mentioned above have been done. Note, this can fail, and the +// error must be propagated. +#[inline(never)] +fn add_san_reg_vecs_for_insn<F: Function>( + inst: &F::Inst, + reg_universe: &RealRegUniverse, + reg_vecs: &mut RegVecs, + bounds: &mut RegVecBounds, +) -> Result<(), RealReg> { + // Get the raw reg usages. These will be dup-free and mod-cleaned-up + // (meaning cleanups (1) and (3) have been done). + add_raw_reg_vecs_for_insn::<F>(inst, reg_vecs, bounds); + + // Finally and sanitize them. Any errors from sanitization are propagated. + if bounds.uses_len > 0 { + sanitize_group( + ®_universe, + &mut reg_vecs.uses, + bounds.uses_start, + &mut bounds.uses_len, + /*is_def_group=*/ false, + )?; + } + if bounds.defs_len > 0 { + sanitize_group( + ®_universe, + &mut reg_vecs.defs, + bounds.defs_start, + &mut bounds.defs_len, + /*is_def_group=*/ true, + )?; + } + if bounds.mods_len > 0 { + sanitize_group( + ®_universe, + &mut reg_vecs.mods, + bounds.mods_start, + &mut bounds.mods_len, + /*is_def_group=*/ false, + )?; + } + + Ok(()) +} + +// ==== MAIN FN ==== +#[inline(never)] +pub fn get_sanitized_reg_uses_for_func<F: Function>( + func: &F, + reg_universe: &RealRegUniverse, +) -> Result<RegVecsAndBounds, RealReg> { + // These are modified by the per-insn loop. + let mut reg_vecs = RegVecs::new(false); + let mut bounds_vec = TypedIxVec::<InstIx, RegVecBounds>::new(); + bounds_vec.reserve(func.insns().len()); + + // For each insn, add their register uses to the ends of the 3 vectors in + // `reg_vecs`, and create an admin entry to describe the 3 new groups. Any + // errors from sanitization are propagated. 
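// Illustrative note (editorial): the result uses a struct-of-arrays layout.
// All uses/defs/mods for the whole function live in three flat vectors, and
// each `RegVecBounds` entry records (start, len) windows into them, e.g.
//
//     insn 0: uses_start = 0, uses_len = 2  -> reg_vecs.uses[0..2]
//     insn 1: uses_start = 2, uses_len = 1  -> reg_vecs.uses[2..3]
//
// so later per-instruction queries (such as `does_inst_use_def_or_mod_reg`
// below) can slice into the shared vectors without any per-insn allocation.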
+ for insn in func.insns() { + let mut bounds = RegVecBounds::new(); + add_san_reg_vecs_for_insn::<F>(insn, ®_universe, &mut reg_vecs, &mut bounds)?; + + bounds_vec.push(bounds); + } + + assert!(!reg_vecs.is_sanitized()); + reg_vecs.set_sanitized(true); + + if log_enabled!(Level::Debug) { + let show_reg = |r: Reg| { + if r.is_real() { + reg_universe.regs[r.get_index()].1.clone() + } else { + format!("{:?}", r).to_string() + } + }; + let show_regs = |r_vec: &[Reg]| { + let mut s = "".to_string(); + for r in r_vec { + s = s + &show_reg(*r) + &" ".to_string(); + } + s + }; + + for i in 0..bounds_vec.len() { + let iix = InstIx::new(i); + let s_use = show_regs( + ®_vecs.uses[bounds_vec[iix].uses_start as usize + ..bounds_vec[iix].uses_start as usize + bounds_vec[iix].uses_len as usize], + ); + let s_mod = show_regs( + ®_vecs.mods[bounds_vec[iix].mods_start as usize + ..bounds_vec[iix].mods_start as usize + bounds_vec[iix].mods_len as usize], + ); + let s_def = show_regs( + ®_vecs.defs[bounds_vec[iix].defs_start as usize + ..bounds_vec[iix].defs_start as usize + bounds_vec[iix].defs_len as usize], + ); + debug!( + "{:?} SAN_RU: use {{ {}}} mod {{ {}}} def {{ {}}}", + iix, s_use, s_mod, s_def + ); + } + } + + Ok(RegVecsAndBounds::new(reg_vecs, bounds_vec)) +} +// END main function + +//============================================================================= +// Data flow analysis: extraction and sanitization of reg-use information: +// convenience interface + +// ==== EXPORTED ==== +#[inline(always)] +pub fn does_inst_use_def_or_mod_reg( + rvb: &RegVecsAndBounds, + iix: InstIx, + reg: Reg, +) -> (/*uses*/ bool, /*defs*/ bool, /*mods*/ bool) { + let bounds = &rvb.bounds[iix]; + let vecs = &rvb.vecs; + let mut uses = false; + let mut defs = false; + let mut mods = false; + // Since each group of registers is in order and duplicate-free (as a result + // of `remove_dups_from_group`), we could in theory binary-search here. But + // it'd almost certainly be a net loss; the group sizes are very small, + // often zero. + for i in bounds.uses_start as usize..bounds.uses_start as usize + bounds.uses_len as usize { + if vecs.uses[i] == reg { + uses = true; + break; + } + } + for i in bounds.defs_start as usize..bounds.defs_start as usize + bounds.defs_len as usize { + if vecs.defs[i] == reg { + defs = true; + break; + } + } + for i in bounds.mods_start as usize..bounds.mods_start as usize + bounds.mods_len as usize { + if vecs.mods[i] == reg { + mods = true; + break; + } + } + (uses, defs, mods) +} + +// ==== EXPORTED ==== +// This is slow, really slow. Don't use it on critical paths. This applies +// `get_regs` to `inst`, performs cleanups (1) and (2), but does not sanitize +// the results. The results are wrapped up as Sets for convenience. +// JRS 2020Apr09: remove this if no further use for it appears soon. +#[allow(dead_code)] +#[inline(never)] +pub fn get_raw_reg_sets_for_insn<F: Function>(inst: &F::Inst) -> RegSets { + let mut reg_vecs = RegVecs::new(false); + let mut bounds = RegVecBounds::new(); + + add_raw_reg_vecs_for_insn::<F>(inst, &mut reg_vecs, &mut bounds); + + // Make up a fake RegVecsAndBounds for just this insn, so we can hand it to + // RegVecsAndBounds::get_reg_sets_for_iix. 
+ let mut single_insn_bounds = TypedIxVec::<InstIx, RegVecBounds>::new(); + single_insn_bounds.push(bounds); + + assert!(!reg_vecs.is_sanitized()); + let single_insn_rvb = RegVecsAndBounds::new(reg_vecs, single_insn_bounds); + single_insn_rvb.get_reg_sets_for_iix(InstIx::new(0)) +} + +// ==== EXPORTED ==== +// This is even slower. This applies `get_regs` to `inst`, performs cleanups +// (1) (2) and (3). The results are wrapped up as Sets for convenience. Note +// this function can fail. +#[inline(never)] +pub fn get_san_reg_sets_for_insn<F: Function>( + inst: &F::Inst, + reg_universe: &RealRegUniverse, +) -> Result<RegSets, RealReg> { + let mut reg_vecs = RegVecs::new(false); + let mut bounds = RegVecBounds::new(); + + add_san_reg_vecs_for_insn::<F>(inst, ®_universe, &mut reg_vecs, &mut bounds)?; + + // Make up a fake RegVecsAndBounds for just this insn, so we can hand it to + // RegVecsAndBounds::get_reg_sets_for_iix. + let mut single_insn_bounds = TypedIxVec::<InstIx, RegVecBounds>::new(); + single_insn_bounds.push(bounds); + + assert!(!reg_vecs.is_sanitized()); + reg_vecs.set_sanitized(true); + let single_insn_rvb = RegVecsAndBounds::new(reg_vecs, single_insn_bounds); + Ok(single_insn_rvb.get_reg_sets_for_iix(InstIx::new(0))) +} + +//============================================================================= +// Data flow analysis: calculation of per-block register def and use sets + +// Returned TypedIxVecs contain one element per block +#[inline(never)] +pub fn calc_def_and_use<F: Function>( + func: &F, + rvb: &RegVecsAndBounds, + univ: &RealRegUniverse, +) -> ( + TypedIxVec<BlockIx, SparseSet<Reg>>, + TypedIxVec<BlockIx, SparseSet<Reg>>, +) { + info!(" calc_def_and_use: begin"); + assert!(rvb.is_sanitized()); + let mut def_sets = TypedIxVec::new(); + let mut use_sets = TypedIxVec::new(); + for b in func.blocks() { + let mut def = SparseSet::empty(); + let mut uce = SparseSet::empty(); + for iix in func.block_insns(b) { + let bounds_for_iix = &rvb.bounds[iix]; + // Add to `uce`, any registers for which the first event in this block + // is a read. Dealing with the "first event" constraint is a bit + // tricky. In the next two loops, `u` and `m` is used (either read or + // modified) by the instruction. Whether or not we should consider it + // live-in for the block depends on whether it was been written earlier + // in the block. We can determine that by checking whether it is + // already in the def set for the block. + // FIXME: isn't thus just: + // uce union= (regs_u minus def) followed by + // uce union= (regs_m minus def) + for i in bounds_for_iix.uses_start as usize + ..bounds_for_iix.uses_start as usize + bounds_for_iix.uses_len as usize + { + let u = rvb.vecs.uses[i]; + if !def.contains(u) { + uce.insert(u); + } + } + for i in bounds_for_iix.mods_start as usize + ..bounds_for_iix.mods_start as usize + bounds_for_iix.mods_len as usize + { + let m = rvb.vecs.mods[i]; + if !def.contains(m) { + uce.insert(m); + } + } + + // Now add to `def`, all registers written by the instruction. + // This is simpler. + // FIXME: isn't this just: def union= (regs_d union regs_m) ? 
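// Editorial summary (not from the upstream file): the net effect of this
// per-instruction scan is the classic per-block summary
//
//     def(b) = registers written (def'd or mod'd) anywhere in b
//     use(b) = registers read (used or mod'd) in b before any write to them
//              within b
//
// For a block containing "v1 = v2 + v3; v2 = v1 + 1" this yields
// use = {v2, v3} and def = {v1, v2}: the read of v1 in the second insn is
// not upward-exposed, because v1 was already written earlier in the block.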
+ for i in bounds_for_iix.defs_start as usize + ..bounds_for_iix.defs_start as usize + bounds_for_iix.defs_len as usize + { + let d = rvb.vecs.defs[i]; + def.insert(d); + } + for i in bounds_for_iix.mods_start as usize + ..bounds_for_iix.mods_start as usize + bounds_for_iix.mods_len as usize + { + let m = rvb.vecs.mods[i]; + def.insert(m); + } + } + def_sets.push(def); + use_sets.push(uce); + } + + assert!(def_sets.len() == use_sets.len()); + + if log_enabled!(Level::Debug) { + let mut n = 0; + debug!(""); + for (def_set, use_set) in def_sets.iter().zip(use_sets.iter()) { + let mut first = true; + let mut defs_str = "".to_string(); + for def in def_set.to_vec() { + if !first { + defs_str = defs_str + &" ".to_string(); + } + first = false; + defs_str = defs_str + &def.show_with_rru(univ); + } + first = true; + let mut uses_str = "".to_string(); + for uce in use_set.to_vec() { + if !first { + uses_str = uses_str + &" ".to_string(); + } + first = false; + uses_str = uses_str + &uce.show_with_rru(univ); + } + debug!( + "{:<3?} def {{{}}} use {{{}}}", + BlockIx::new(n), + defs_str, + uses_str + ); + n += 1; + } + } + + info!(" calc_def_and_use: end"); + (def_sets, use_sets) +} + +//============================================================================= +// Data flow analysis: computation of per-block register live-in and live-out +// sets + +// Returned vectors contain one element per block +#[inline(never)] +pub fn calc_livein_and_liveout<F: Function>( + func: &F, + def_sets_per_block: &TypedIxVec<BlockIx, SparseSet<Reg>>, + use_sets_per_block: &TypedIxVec<BlockIx, SparseSet<Reg>>, + cfg_info: &CFGInfo, + univ: &RealRegUniverse, +) -> ( + TypedIxVec<BlockIx, SparseSet<Reg>>, + TypedIxVec<BlockIx, SparseSet<Reg>>, +) { + info!(" calc_livein_and_liveout: begin"); + let num_blocks = func.blocks().len() as u32; + let empty = SparseSet::<Reg>::empty(); + + let mut num_evals = 0; + let mut liveouts = TypedIxVec::<BlockIx, SparseSet<Reg>>::new(); + liveouts.resize(num_blocks, empty.clone()); + + // Initialise the work queue so as to do a reverse preorder traversal + // through the graph, after which blocks are re-evaluated on demand. + let mut work_queue = Queue::<BlockIx>::new(); + for i in 0..num_blocks { + // block_ix travels in "reverse preorder" + let block_ix = cfg_info.pre_ord[(num_blocks - 1 - i) as usize]; + work_queue.push_back(block_ix); + } + + // in_queue is an optimisation -- this routine works fine without it. in_queue is + // used to avoid inserting duplicate work items in work_queue. This avoids some + // number of duplicate re-evaluations and gets us to a fixed point faster. + // Very roughly, it reduces the number of evaluations per block from around + // 3 to around 2. + let mut in_queue = Vec::<bool>::new(); + in_queue.resize(num_blocks as usize, true); + + while let Some(block_ix) = work_queue.pop_front() { + let i = block_ix.get() as usize; + assert!(in_queue[i]); + in_queue[i] = false; + + // Compute a new value for liveouts[block_ix] + let mut set = SparseSet::<Reg>::empty(); + for block_j_ix in cfg_info.succ_map[block_ix].iter() { + let mut live_in_j = liveouts[*block_j_ix].clone(); + live_in_j.remove(&def_sets_per_block[*block_j_ix]); + live_in_j.union(&use_sets_per_block[*block_j_ix]); + set.union(&live_in_j); + } + num_evals += 1; + + if !set.equals(&liveouts[block_ix]) { + liveouts[block_ix] = set; + // Add `block_ix`'s predecessors to the work queue, since their + // liveout values might be affected. 
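// Editorial note: the fixed point being computed here is the standard
// backward liveness system
//
//     livein(b)  = use(b) union (liveout(b) minus def(b))
//     liveout(b) = union of livein(s) over all successors s of b
//
// Sets only ever grow during the iteration, so only the predecessors of a
// block whose liveout set just changed can be affected -- hence they are the
// only blocks pushed back onto the worklist here.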
+ for block_j_ix in cfg_info.pred_map[block_ix].iter() { + let j = block_j_ix.get() as usize; + if !in_queue[j] { + work_queue.push_back(*block_j_ix); + in_queue[j] = true; + } + } + } + } + + // The liveout values are done, but we need to compute the liveins + // too. + let mut liveins = TypedIxVec::<BlockIx, SparseSet<Reg>>::new(); + liveins.resize(num_blocks, empty.clone()); + for block_ix in BlockIx::new(0).dotdot(BlockIx::new(num_blocks)) { + let mut live_in = liveouts[block_ix].clone(); + live_in.remove(&def_sets_per_block[block_ix]); + live_in.union(&use_sets_per_block[block_ix]); + liveins[block_ix] = live_in; + } + + if false { + let mut sum_card_live_in = 0; + let mut sum_card_live_out = 0; + for bix in BlockIx::new(0).dotdot(BlockIx::new(num_blocks)) { + sum_card_live_in += liveins[bix].card(); + sum_card_live_out += liveouts[bix].card(); + } + println!( + "QQQQ calc_LI/LO: num_evals {}, tot LI {}, tot LO {}", + num_evals, sum_card_live_in, sum_card_live_out + ); + } + + let ratio: f32 = (num_evals as f32) / ((if num_blocks == 0 { 1 } else { num_blocks }) as f32); + info!( + " calc_livein_and_liveout: {} blocks, {} evals ({:<.2} per block)", + num_blocks, num_evals, ratio + ); + + if log_enabled!(Level::Debug) { + let mut n = 0; + debug!(""); + for (livein, liveout) in liveins.iter().zip(liveouts.iter()) { + let mut first = true; + let mut li_str = "".to_string(); + for li in livein.to_vec() { + if !first { + li_str = li_str + &" ".to_string(); + } + first = false; + li_str = li_str + &li.show_with_rru(univ); + } + first = true; + let mut lo_str = "".to_string(); + for lo in liveout.to_vec() { + if !first { + lo_str = lo_str + &" ".to_string(); + } + first = false; + lo_str = lo_str + &lo.show_with_rru(univ); + } + debug!( + "{:<3?} livein {{{}}} liveout {{{}}}", + BlockIx::new(n), + li_str, + lo_str + ); + n += 1; + } + } + + info!(" calc_livein_and_liveout: end"); + (liveins, liveouts) +} + +//============================================================================= +// Computation of RangeFrags (Live Range Fragments), aggregated per register. +// This does not produce complete live ranges. That is done later, by +// `merge_range_frags` below, using the information computed in this section +// by `get_range_frags`. + +// This is surprisingly complex, in part because of the need to correctly +// handle (1) live-in and live-out Regs, (2) dead writes, and (3) instructions +// that modify registers rather than merely reading or writing them. + +/// A ProtoRangeFrag carries information about a [write .. read] range, within a Block, which +/// we will later turn into a fully-fledged RangeFrag. It basically records the first and +/// last-known InstPoints for appearances of a Reg. +/// +/// ProtoRangeFrag also keeps count of the number of appearances of the Reg to which it +/// pertains, using `uses`. The counts get rolled into the resulting RangeFrags, and later are +/// used to calculate spill costs. +/// +/// The running state of this function is a map from Reg to ProtoRangeFrag. Only Regs that +/// actually appear in the Block (or are live-in to it) are mapped. This has the advantage of +/// economy, since most Regs will not appear in (or be live-in to) most Blocks. +#[derive(Clone)] +struct ProtoRangeFrag { + /// The InstPoint in this Block at which the associated Reg most recently became live (when + /// moving forwards though the Block). 
If this value is the first InstPoint for the Block + /// (the U point for the Block's lowest InstIx), that indicates the associated Reg is + /// live-in to the Block. + first: InstPoint, + + /// This is the InstPoint which is the end point (most recently observed read, in general) + /// for the current RangeFrag under construction. In general we will move `last` forwards + /// as we discover reads of the associated Reg. If this is the last InstPoint for the + /// Block (the D point for the Block's highest InstInx), that indicates that the associated + /// reg is live-out from the Block. + last: InstPoint, + + /// Number of mentions of the associated Reg in this ProtoRangeFrag. + num_mentions: u16, +} + +impl fmt::Debug for ProtoRangeFrag { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "{:?}x {:?} - {:?}", + self.num_mentions, self.first, self.last + ) + } +} + +// `fn get_range_frags` and `fn get_range_frags_for_block` below work with two vectors, +// `out_map` and `state`, that are indexed by register. This allows them to entirely avoid the +// use of hash-based `Map`s. However, it does give a problem in that real and virtual registers +// occupy separate, zero-based index spaces. To solve this, we map `Reg`s to a "unified index +// space" as follows: +// +// a `RealReg` is mapped to its `.get_index()` value +// +// a `VirtualReg` is mapped to its `.get_index()` value + the number of real registers +// +// To make this not too inconvenient, `fn reg_to_reg_ix` and `fn reg_ix_to_reg` convert `Reg`s +// to and from the unified index space. This has the annoying side effect that reconstructing a +// `Reg` from an index space value requires having available both the register universe, and a +// table specifying the class for each virtual register. +// +// Really, we ought to rework the `Reg`/`RealReg`/`VirtualReg` abstractions, so as to (1) impose +// a single index space for both register kinds, and (2) so as to separate the concepts of the +// register index from the `Reg` itself. This second point would have the additional benefit of +// making it feasible to represent sets of registers using bit sets. + +#[inline(always)] +pub(crate) fn reg_to_reg_ix(num_real_regs: u32, r: Reg) -> u32 { + if r.is_real() { + r.get_index_u32() + } else { + num_real_regs + r.get_index_u32() + } +} + +#[inline(always)] +pub(crate) fn reg_ix_to_reg( + reg_universe: &RealRegUniverse, + vreg_classes: &Vec</*vreg index,*/ RegClass>, + reg_ix: u32, +) -> Reg { + let reg_ix = reg_ix as usize; + let num_real_regs = reg_universe.regs.len(); + if reg_ix < num_real_regs { + reg_universe.regs[reg_ix].0.to_reg() + } else { + let vreg_ix = reg_ix - num_real_regs; + Reg::new_virtual(vreg_classes[vreg_ix], vreg_ix as u32) + } +} + +// HELPER FUNCTION +// Add to `out_map`, a binding from `reg` to the frags-and-metrics pair specified by `frag` and +// `frag_metrics`. As a space-saving optimisation, make some attempt to avoid creating +// duplicate entries in `out_frags` and `out_frag_metrics`. 
+#[inline(always)] +fn emit_range_frag( + out_map: &mut Vec</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>, + out_frags: &mut TypedIxVec<RangeFragIx, RangeFrag>, + out_frag_metrics: &mut TypedIxVec<RangeFragIx, RangeFragMetrics>, + num_real_regs: u32, + reg: Reg, + frag: &RangeFrag, + frag_metrics: &RangeFragMetrics, +) { + debug_assert!(out_frags.len() == out_frag_metrics.len()); + + // Allocate a new RangeFragIx for `frag`, except, make some minimal effort to avoid huge + // numbers of duplicates by inspecting the previous two entries, and using them if + // possible. + let mut new_fix = None; + + let num_out_frags = out_frags.len(); + if num_out_frags >= 2 { + let back_0 = RangeFragIx::new(num_out_frags - 1); + let back_1 = RangeFragIx::new(num_out_frags - 2); + if out_frags[back_0] == *frag && out_frag_metrics[back_0] == *frag_metrics { + new_fix = Some(back_0); + } else if out_frags[back_1] == *frag && out_frag_metrics[back_1] == *frag_metrics { + new_fix = Some(back_1); + } + } + + let new_fix = match new_fix { + Some(fix) => fix, + None => { + // We can't look back or there was no match; create a new one. + out_frags.push(frag.clone()); + out_frag_metrics.push(frag_metrics.clone()); + RangeFragIx::new(out_frags.len() as u32 - 1) + } + }; + + // And use the new RangeFragIx. + out_map[reg_to_reg_ix(num_real_regs, reg) as usize].push(new_fix); +} + +/// Calculate all the RangeFrags for `bix`. Add them to `out_frags` and corresponding metrics +/// data to `out_frag_metrics`. Add to `out_map`, the associated RangeFragIxs, segregated by +/// Reg. `bix`, `livein`, `liveout` and `rvb` are expected to be valid in the context of the +/// Func `f` (duh!). +#[inline(never)] +fn get_range_frags_for_block<F: Function>( + // Constants + func: &F, + rvb: &RegVecsAndBounds, + reg_universe: &RealRegUniverse, + vreg_classes: &Vec</*vreg index,*/ RegClass>, + bix: BlockIx, + livein: &SparseSet<Reg>, + liveout: &SparseSet<Reg>, + // Preallocated storage for use in this function. They do not carry any useful information + // in between calls here. + visited: &mut Vec<u32>, + state: &mut Vec</*rreg index, then vreg index, */ Option<ProtoRangeFrag>>, + // These accumulate the results of RangeFrag/RangeFragMetrics across multiple calls here. + out_map: &mut Vec</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>, + out_frags: &mut TypedIxVec<RangeFragIx, RangeFrag>, + out_frag_metrics: &mut TypedIxVec<RangeFragIx, RangeFragMetrics>, +) { + #[inline(always)] + fn plus1(n: u16) -> u16 { + if n == 0xFFFFu16 { + n + } else { + n + 1 + } + } + + // Invariants for the preallocated storage: + // + // * `visited` is always irrelevant (and cleared) at the start + // + // * `state` always has size (# real regs + # virtual regs). However, all its entries + // should be `None` in between calls here. + + // We use `visited` to keep track of which `state` entries need processing at the end of + // this function. Since `state` is indexed by unified-reg-index, it follows that `visited` + // is a vector of unified-reg-indices. We add an entry to `visited` whenever we change a + // `state` entry from `None` to `Some`. This guarantees that we can find all the `Some` + // `state` entries at the end of the function, change them back to `None`, and emit the + // corresponding fragment. + visited.clear(); + + // Some handy constants. 
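// Illustrative note (editorial, not from the vendored source): for this
// analysis each instruction iN has a use point iN.U (where its reads are
// considered to happen) which precedes its def point iN.D (where its writes
// happen). A ProtoRangeFrag therefore spans, within this block, from the D
// point of the defining insn (or the block's first U point, for a live-in
// reg) to the point of its last read or modify (or the block's last D point,
// for a live-out reg). For example, in
//
//     i3: v1 = ...          (frag for v1 starts at i3.D)
//     i4: ... = v1 + ...    (last read extends it to i4.U)
//
// the fragment eventually emitted for v1 is [i3.D .. i4.U].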
+ assert!(func.block_insns(bix).len() >= 1); + let first_iix_in_block = func.block_insns(bix).first(); + let last_iix_in_block = func.block_insns(bix).last(); + let first_pt_in_block = InstPoint::new_use(first_iix_in_block); + let last_pt_in_block = InstPoint::new_def(last_iix_in_block); + let num_real_regs = reg_universe.regs.len() as u32; + + // First, set up `state` as if all of `livein` had been written just prior to the block. + for r in livein.iter() { + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + debug_assert!(state[r_state_ix].is_none()); + state[r_state_ix] = Some(ProtoRangeFrag { + num_mentions: 0, + first: first_pt_in_block, + last: first_pt_in_block, + }); + visited.push(r_state_ix as u32); + } + + // Now visit each instruction in turn, examining first the registers it reads, then those it + // modifies, and finally those it writes. + for iix in func.block_insns(bix) { + let bounds_for_iix = &rvb.bounds[iix]; + + // Examine reads. This is pretty simple. They simply extend an existing ProtoRangeFrag + // to the U point of the reading insn. + for i in + bounds_for_iix.uses_start..bounds_for_iix.uses_start + bounds_for_iix.uses_len as u32 + { + let r = rvb.vecs.uses[i as usize]; + let r_state_ix = reg_to_reg_ix(num_real_regs, r) as usize; + match &mut state[r_state_ix] { + // First event for `r` is a read, but it's not listed in `livein`, since otherwise + // `state` would have an entry for it. + None => panic!("get_range_frags_for_block: fail #1"), + Some(ref mut pf) => { + // This the first or subsequent read after a write. Note that the "write" can + // be either a real write, or due to the fact that `r` is listed in `livein`. + // We don't care here. + pf.num_mentions = plus1(pf.num_mentions); + let new_last = InstPoint::new_use(iix); + debug_assert!(pf.last <= new_last); + pf.last = new_last; + } + } + } + + // Examine modifies. These are handled almost identically to reads, except that they + // extend an existing ProtoRangeFrag down to the D point of the modifying insn. + for i in + bounds_for_iix.mods_start..bounds_for_iix.mods_start + bounds_for_iix.mods_len as u32 + { + let r = &rvb.vecs.mods[i as usize]; + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + match &mut state[r_state_ix] { + // First event for `r` is a read (really, since this insn modifies `r`), but it's + // not listed in `livein`, since otherwise `state` would have an entry for it. + None => panic!("get_range_frags_for_block: fail #2"), + Some(ref mut pf) => { + // This the first or subsequent modify after a write. Add two to the + // mentions count, as that reflects the implied spill cost increment more + // accurately than just adding one: if we spill the live range in which this + // ends up, we'll generate both a reload and a spill instruction. + pf.num_mentions = plus1(plus1(pf.num_mentions)); + let new_last = InstPoint::new_def(iix); + debug_assert!(pf.last <= new_last); + pf.last = new_last; + } + } + } + + // Examine writes (but not writes implied by modifies). The general idea is that a + // write causes us to terminate and emit the existing ProtoRangeFrag, if any, and start + // a new frag. + for i in + bounds_for_iix.defs_start..bounds_for_iix.defs_start + bounds_for_iix.defs_len as u32 + { + let r = &rvb.vecs.defs[i as usize]; + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + match &mut state[r_state_ix] { + // First mention of a Reg we've never heard of before. Start a new + // ProtoRangeFrag for it and keep going. 
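+ // The new frag starts and ends at this insn's D point, since a def takes
+ // effect at the D point of its instruction.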
+ None => { + let new_pt = InstPoint::new_def(iix); + let new_pf = ProtoRangeFrag { + num_mentions: 1, + first: new_pt, + last: new_pt, + }; + state[r_state_ix] = Some(new_pf); + visited.push(r_state_ix as u32); + } + + // There's already a ProtoRangeFrag for `r`. This write will start a new one, + // so emit the existing one and note this write. + Some(ProtoRangeFrag { + ref mut num_mentions, + ref mut first, + ref mut last, + }) => { + if first == last { + debug_assert!(*num_mentions == 1); + } + + let (frag, frag_metrics) = + RangeFrag::new_with_metrics(func, bix, *first, *last, *num_mentions); + emit_range_frag( + out_map, + out_frags, + out_frag_metrics, + num_real_regs, + *r, + &frag, + &frag_metrics, + ); + let new_pt = InstPoint::new_def(iix); + // Reuse the previous entry for this new definition of the same vreg. + *num_mentions = 1; + *first = new_pt; + *last = new_pt; + } + } + } + } + + // We are at the end of the block. We still have to deal with live-out Regs. We must also + // deal with ProtoRangeFrags in `state` that are for registers not listed as live-out. + + // Deal with live-out Regs. Treat each one as if it is read just after the block. + for r in liveout.iter() { + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + let state_elem_p = &mut state[r_state_ix]; + match state_elem_p { + // This can't happen. `r` is in `liveout`, but this implies that it is neither + // defined in the block nor present in `livein`. + None => panic!("get_range_frags_for_block: fail #3"), + Some(ref pf) => { + // `r` is written (or modified), either literally or by virtue of being present + // in `livein`, and may or may not subsequently be read -- we don't care, + // because it must be read "after" the block. Create a `LiveOut` or `Thru` frag + // accordingly. + let (frag, frag_metrics) = RangeFrag::new_with_metrics( + func, + bix, + pf.first, + last_pt_in_block, + pf.num_mentions, + ); + emit_range_frag( + out_map, + out_frags, + out_frag_metrics, + num_real_regs, + *r, + &frag, + &frag_metrics, + ); + // Remove the entry from `state` so that the following loop doesn't process it + // again. + *state_elem_p = None; + } + } + } + + // Finally, round up any remaining ProtoRangeFrags left in `state`. This is what `visited` + // is used for. + for r_state_ix in visited { + let state_elem_p = &mut state[*r_state_ix as usize]; + match state_elem_p { + None => {} + Some(pf) => { + if pf.first == pf.last { + debug_assert!(pf.num_mentions == 1); + } + let (frag, frag_metrics) = + RangeFrag::new_with_metrics(func, bix, pf.first, pf.last, pf.num_mentions); + let r = reg_ix_to_reg(reg_universe, vreg_classes, *r_state_ix); + emit_range_frag( + out_map, + out_frags, + out_frag_metrics, + num_real_regs, + r, + &frag, + &frag_metrics, + ); + // Maintain invariant that all `state` entries are `None` in between calls to + // this function. 
+ *state_elem_p = None; + } + } + } +} + +#[inline(never)] +pub fn get_range_frags<F: Function>( + func: &F, + rvb: &RegVecsAndBounds, + reg_universe: &RealRegUniverse, + livein_sets_per_block: &TypedIxVec<BlockIx, SparseSet<Reg>>, + liveout_sets_per_block: &TypedIxVec<BlockIx, SparseSet<Reg>>, +) -> ( + Vec</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>, + TypedIxVec<RangeFragIx, RangeFrag>, + TypedIxVec<RangeFragIx, RangeFragMetrics>, + Vec</*vreg index,*/ RegClass>, +) { + info!(" get_range_frags: begin"); + assert!(livein_sets_per_block.len() == func.blocks().len() as u32); + assert!(liveout_sets_per_block.len() == func.blocks().len() as u32); + assert!(rvb.is_sanitized()); + + // In order that we can work with unified-reg-indices (see comments above), we need to know + // the `RegClass` for each virtual register. That info is collected here. + let mut vreg_classes = vec![RegClass::INVALID; func.get_num_vregs()]; + for r in rvb + .vecs + .uses + .iter() + .chain(rvb.vecs.defs.iter()) + .chain(rvb.vecs.mods.iter()) + { + if r.is_real() { + continue; + } + let r_ix = r.get_index(); + // rustc 1.43.0 appears to have problems avoiding duplicate bounds checks for + // `vreg_classes[r_ix]`; hence give it a helping hand here. + let vreg_classes_ptr = &mut vreg_classes[r_ix]; + if *vreg_classes_ptr == RegClass::INVALID { + *vreg_classes_ptr = r.get_class(); + } else { + assert_eq!(*vreg_classes_ptr, r.get_class()); + } + } + + let num_real_regs = reg_universe.regs.len(); + let num_virtual_regs = vreg_classes.len(); + let num_regs = num_real_regs + num_virtual_regs; + + // A state variable that's reused across calls to `get_range_frags_for_block`. When not in + // a call to `get_range_frags_for_block`, all entries should be `None`. + let mut state = Vec::</*rreg index, then vreg index, */ Option<ProtoRangeFrag>>::new(); + state.resize(num_regs, None); + + // Scratch storage needed by `get_range_frags_for_block`. Doesn't carry any useful info in + // between calls. Start it off not-quite-empty since it will always get used at least a + // bit. + let mut visited = Vec::<u32>::with_capacity(32); + + // `RangeFrag`/`RangeFragMetrics` are collected across multiple calls to + // `get_range_frag_for_blocks` in these three vectors. In other words, they collect the + // overall results for this function. + let mut result_frags = TypedIxVec::<RangeFragIx, RangeFrag>::new(); + let mut result_frag_metrics = TypedIxVec::<RangeFragIx, RangeFragMetrics>::new(); + let mut result_map = + Vec::</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>::default(); + result_map.resize(num_regs, smallvec![]); + + for bix in func.blocks() { + get_range_frags_for_block( + func, + rvb, + reg_universe, + &vreg_classes, + bix, + &livein_sets_per_block[bix], + &liveout_sets_per_block[bix], + &mut visited, + &mut state, + &mut result_map, + &mut result_frags, + &mut result_frag_metrics, + ); + } + + assert!(state.len() == num_regs); + assert!(result_map.len() == num_regs); + assert!(vreg_classes.len() == num_virtual_regs); + // This is pretty cheap (once per fn) and any failure will be catastrophic since it means we + // may have forgotten some live range fragments. Hence `assert!` and not `debug_assert!`. 
+ for state_elem in &state { + assert!(state_elem.is_none()); + } + + if log_enabled!(Level::Debug) { + debug!(""); + let mut n = 0; + for frag in result_frags.iter() { + debug!("{:<3?} {:?}", RangeFragIx::new(n), frag); + n += 1; + } + + debug!(""); + for (reg_ix, frag_ixs) in result_map.iter().enumerate() { + if frag_ixs.len() == 0 { + continue; + } + let reg = reg_ix_to_reg(reg_universe, &vreg_classes, reg_ix as u32); + debug!( + "frags for {} {:?}", + reg.show_with_rru(reg_universe), + frag_ixs + ); + } + } + + info!(" get_range_frags: end"); + assert!(result_frags.len() == result_frag_metrics.len()); + (result_map, result_frags, result_frag_metrics, vreg_classes) +} + +//============================================================================= +// Auxiliary tasks involved in creating a single VirtualRange from its +// constituent RangeFragIxs: +// +// * The RangeFragIxs we are given here are purely within single blocks. +// Here, we "compress" them, that is, merge those pairs that flow from one +// block into the the one that immediately follows it in the instruction +// stream. This does not imply anything about control flow; it is purely a +// scheme for reducing the total number of fragments that need to be dealt +// with during interference detection (later on). +// +// * Computation of metrics for the VirtualRange. This is done by examining +// metrics of the individual fragments, and must be done before they are +// compressed. + +// HELPER FUNCTION +// Does `frag1` describe some range of instructions that is followed +// immediately by `frag2` ? Note that this assumes (and checks) that there +// are no spill or reload ranges in play at this point; there should not be. +// Note also, this is very conservative: it only merges the case where the two +// ranges are separated by a block boundary. From measurements, it appears that +// this is the only case where merging is actually a win, though. +fn frags_are_mergeable( + frag1: &RangeFrag, + frag1metrics: &RangeFragMetrics, + frag2: &RangeFrag, + frag2metrics: &RangeFragMetrics, +) -> bool { + assert!(frag1.first.pt().is_use_or_def()); + assert!(frag1.last.pt().is_use_or_def()); + assert!(frag2.first.pt().is_use_or_def()); + assert!(frag2.last.pt().is_use_or_def()); + + if frag1metrics.bix != frag2metrics.bix + && frag1.last.iix().plus(1) == frag2.first.iix() + && frag1.last.pt() == Point::Def + && frag2.first.pt() == Point::Use + { + assert!( + frag1metrics.kind == RangeFragKind::LiveOut || frag1metrics.kind == RangeFragKind::Thru + ); + assert!( + frag2metrics.kind == RangeFragKind::LiveIn || frag2metrics.kind == RangeFragKind::Thru + ); + return true; + } + + false +} + +// HELPER FUNCTION +// Create a compressed version of the fragments listed in `sorted_frag_ixs`, +// taking the opportunity to dereference them (look them up in `frag_env`) in +// the process. Assumes that `sorted_frag_ixs` is indeed ordered so that the +// dereferenced frag sequence is in instruction order. +#[inline(never)] +fn deref_and_compress_sorted_range_frag_ixs( + stats_num_vfrags_uncompressed: &mut usize, + stats_num_vfrags_compressed: &mut usize, + sorted_frag_ixs: &SortedRangeFragIxs, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + frag_metrics_env: &TypedIxVec<RangeFragIx, RangeFragMetrics>, +) -> SortedRangeFrags { + let mut res = SortedRangeFrags::empty(); + + let frag_ixs = &sorted_frag_ixs.frag_ixs; + let num_frags = frag_ixs.len(); + *stats_num_vfrags_uncompressed += num_frags; + + if num_frags == 1 { + // Nothing we can do. Shortcut. 
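+ // (A lone fragment has nothing adjacent to merge with, so just dereference it.)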
+ res.frags.push(frag_env[frag_ixs[0]].clone()); + *stats_num_vfrags_compressed += 1; + return res; + } + + // BEGIN merge this frag sequence as much as possible + assert!(num_frags > 1); + + let mut s = 0; // start point of current group + let mut e = 0; // end point of current group + loop { + if s >= num_frags { + break; + } + while e + 1 < num_frags + && frags_are_mergeable( + &frag_env[frag_ixs[e]], + &frag_metrics_env[frag_ixs[e]], + &frag_env[frag_ixs[e + 1]], + &frag_metrics_env[frag_ixs[e + 1]], + ) + { + e += 1; + } + // s to e inclusive is a maximal group + // emit (s, e) + if s == e { + // Can't compress this one + res.frags.push(frag_env[frag_ixs[s]].clone()); + } else { + let compressed_frag = RangeFrag { + first: frag_env[frag_ixs[s]].first, + last: frag_env[frag_ixs[e]].last, + }; + res.frags.push(compressed_frag); + } + // move on + s = e + 1; + e = s; + } + // END merge this frag sequence as much as possible + + *stats_num_vfrags_compressed += res.frags.len(); + res +} + +// HELPER FUNCTION +// Computes the `size`, `total_cost` and `spill_cost` values for a +// VirtualRange, while being very careful to avoid overflow. +fn calc_virtual_range_metrics( + sorted_frag_ixs: &SortedRangeFragIxs, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + frag_metrics_env: &TypedIxVec<RangeFragIx, RangeFragMetrics>, + estimated_frequencies: &TypedIxVec<BlockIx, u32>, +) -> (u16, u32, SpillCost) { + assert!(frag_env.len() == frag_metrics_env.len()); + + let mut tot_size: u32 = 0; + let mut tot_cost: u32 = 0; + + for fix in &sorted_frag_ixs.frag_ixs { + let frag = &frag_env[*fix]; + let frag_metrics = &frag_metrics_env[*fix]; + + // Add on the size of this fragment, but make sure we can't + // overflow a u32 no matter how many fragments there are. + let mut frag_size: u32 = frag.last.iix().get() - frag.first.iix().get() + 1; + frag_size = min(frag_size, 0xFFFFu32); + tot_size += frag_size; + tot_size = min(tot_size, 0xFFFFu32); + + // Here, tot_size <= 0xFFFF. frag.count is u16. estFreq[] is u32. + // We must be careful not to overflow tot_cost, which is u32. + let mut new_tot_cost: u64 = frag_metrics.count as u64; // at max 16 bits + new_tot_cost *= estimated_frequencies[frag_metrics.bix] as u64; // at max 48 bits + new_tot_cost += tot_cost as u64; // at max 48 bits + epsilon + new_tot_cost = min(new_tot_cost, 0xFFFF_FFFFu64); + + // Hence this is safe. + tot_cost = new_tot_cost as u32; + } + + debug_assert!(tot_size <= 0xFFFF); + let size = tot_size as u16; + let total_cost = tot_cost; + + // Divide tot_cost by the total length, so as to increase the apparent + // spill cost of short LRs. This is so as to give the advantage to + // short LRs in competition for registers. This seems a bit of a hack + // to me, but hey .. + debug_assert!(tot_size >= 1); + let spill_cost = SpillCost::finite(tot_cost as f32 / tot_size as f32); + + (size, total_cost, spill_cost) +} + +// MAIN FUNCTION in this section +#[inline(never)] +fn create_and_add_range( + stats_num_vfrags_uncompressed: &mut usize, + stats_num_vfrags_compressed: &mut usize, + result_real: &mut TypedIxVec<RealRangeIx, RealRange>, + result_virtual: &mut TypedIxVec<VirtualRangeIx, VirtualRange>, + reg: Reg, + sorted_frag_ixs: SortedRangeFragIxs, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + frag_metrics_env: &TypedIxVec<RangeFragIx, RangeFragMetrics>, + estimated_frequencies: &TypedIxVec<BlockIx, u32>, +) { + if reg.is_virtual() { + // First, compute the VirtualRange metrics. This has to be done + // before fragment compression. 
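+ // Compression discards the per-fragment `RangeFragMetrics` (the mention counts and
+ // block indices), which is exactly the information the metrics computation needs.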
+ let (size, total_cost, spill_cost) = calc_virtual_range_metrics( + &sorted_frag_ixs, + frag_env, + frag_metrics_env, + estimated_frequencies, + ); + + // Now it's safe to compress the fragments. + let sorted_frags = deref_and_compress_sorted_range_frag_ixs( + stats_num_vfrags_uncompressed, + stats_num_vfrags_compressed, + &sorted_frag_ixs, + frag_env, + frag_metrics_env, + ); + + result_virtual.push(VirtualRange { + vreg: reg.to_virtual_reg(), + rreg: None, + sorted_frags, + is_ref: false, // analysis_reftypes.rs may later change this + size, + total_cost, + spill_cost, + }); + } else { + result_real.push(RealRange { + rreg: reg.to_real_reg(), + sorted_frags: sorted_frag_ixs, + is_ref: false, // analysis_reftypes.rs may later change this + }); + } +} + +//============================================================================= +// Merging of RangeFrags, producing the final LRs, including metrication and +// compression + +// We need this in order to construct a UnionFind<usize>. +impl ToFromU32 for usize { + // 64 bit + #[cfg(target_pointer_width = "64")] + fn to_u32(x: usize) -> u32 { + if x < 0x1_0000_0000usize { + x as u32 + } else { + panic!("impl ToFromU32 for usize: to_u32: out of range") + } + } + #[cfg(target_pointer_width = "64")] + fn from_u32(x: u32) -> usize { + x as usize + } + // 32 bit + #[cfg(target_pointer_width = "32")] + fn to_u32(x: usize) -> u32 { + x as u32 + } + #[cfg(target_pointer_width = "32")] + fn from_u32(x: u32) -> usize { + x as usize + } +} + +#[inline(never)] +pub fn merge_range_frags( + frag_ix_vec_per_reg: &Vec</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + frag_metrics_env: &TypedIxVec<RangeFragIx, RangeFragMetrics>, + estimated_frequencies: &TypedIxVec<BlockIx, u32>, + cfg_info: &CFGInfo, + reg_universe: &RealRegUniverse, + vreg_classes: &Vec</*vreg index,*/ RegClass>, +) -> ( + TypedIxVec<RealRangeIx, RealRange>, + TypedIxVec<VirtualRangeIx, VirtualRange>, +) { + assert!(frag_env.len() == frag_metrics_env.len()); + let mut stats_num_total_incoming_frags = 0; + let mut stats_num_total_incoming_regs = 0; + for all_frag_ixs_for_reg in frag_ix_vec_per_reg { + stats_num_total_incoming_frags += all_frag_ixs_for_reg.len(); + if all_frag_ixs_for_reg.len() > 0 { + stats_num_total_incoming_regs += 1; + } + } + info!(" merge_range_frags: begin"); + info!(" in: {} in frag_env", frag_env.len()); + info!( + " in: {} regs containing in total {} frags", + stats_num_total_incoming_regs, stats_num_total_incoming_frags + ); + + let mut stats_num_single_grps = 0; + let mut stats_num_local_frags = 0; + + let mut stats_num_multi_grps_small = 0; + let mut stats_num_multi_grps_large = 0; + let mut stats_size_multi_grps_small = 0; + let mut stats_size_multi_grps_large = 0; + + let mut stats_num_vfrags_uncompressed = 0; + let mut stats_num_vfrags_compressed = 0; + + let mut result_real = TypedIxVec::<RealRangeIx, RealRange>::new(); + let mut result_virtual = TypedIxVec::<VirtualRangeIx, VirtualRange>::new(); + + // BEGIN per_reg_loop + for (reg_ix, all_frag_ixs_for_reg) in frag_ix_vec_per_reg.iter().enumerate() { + let n_frags_for_this_reg = all_frag_ixs_for_reg.len(); + + // The reg might never have been mentioned at all, especially if it's a real reg. + if n_frags_for_this_reg == 0 { + continue; + } + + let reg_ix = reg_ix as u32; + let reg = reg_ix_to_reg(reg_universe, vreg_classes, reg_ix); + + // Do some shortcutting. 
First off, if there's only one frag for this reg, we can directly + // give it its own live range, and have done. + if n_frags_for_this_reg == 1 { + create_and_add_range( + &mut stats_num_vfrags_uncompressed, + &mut stats_num_vfrags_compressed, + &mut result_real, + &mut result_virtual, + reg, + SortedRangeFragIxs::unit(all_frag_ixs_for_reg[0], frag_env), + frag_env, + frag_metrics_env, + estimated_frequencies, + ); + stats_num_single_grps += 1; + continue; + } + + // BEGIN merge `all_frag_ixs_for_reg` entries as much as possible. + // + // but .. if we come across independents (RangeKind::Local), pull them out immediately. + + // Try to avoid heap allocation if at all possible. Up to 100 entries are very + // common, so this is sized large to be effective. Each entry is definitely + // 16 bytes at most, so this will use 4KB stack at most, which is reasonable. + let mut triples = SmallVec::<[(RangeFragIx, RangeFragKind, BlockIx); 256]>::new(); + + // Create `triples`. We will use it to guide the merging phase, but it is immutable there. + for fix in all_frag_ixs_for_reg { + let frag_metrics = &frag_metrics_env[*fix]; + + if frag_metrics.kind == RangeFragKind::Local { + // This frag is Local (standalone). Give it its own Range and move on. This is an + // optimisation, but it's also necessary: the main fragment-merging logic below + // relies on the fact that the fragments it is presented with are all either + // LiveIn, LiveOut or Thru. + create_and_add_range( + &mut stats_num_vfrags_uncompressed, + &mut stats_num_vfrags_compressed, + &mut result_real, + &mut result_virtual, + reg, + SortedRangeFragIxs::unit(*fix, frag_env), + frag_env, + frag_metrics_env, + estimated_frequencies, + ); + stats_num_local_frags += 1; + continue; + } + + // This frag isn't Local (standalone) so we have to process it the slow way. + triples.push((*fix, frag_metrics.kind, frag_metrics.bix)); + } + + let triples_len = triples.len(); + + // This is the core of the merging algorithm. + // + // For each ix@(fix, kind, bix) in `triples` (order unimportant): + // + // (1) "Merge with blocks that are live 'downstream' from here": + // if fix is live-out or live-through: + // for b in succs[bix] + // for each ix2@(fix2, kind2, bix2) in `triples` + // if bix2 == b && kind2 is live-in or live-through: + // merge(ix, ix2) + // + // (2) "Merge with blocks that are live 'upstream' from here": + // if fix is live-in or live-through: + // for b in preds[bix] + // for each ix2@(fix2, kind2, bix2) in `triples` + // if bix2 == b && kind2 is live-out or live-through: + // merge(ix, ix2) + // + // `triples` remains unchanged. The equivalence class info is accumulated + // in `eclasses_uf` instead. `eclasses_uf` entries are indices into + // `triples`. + // + // Now, you might think it necessary to do both (1) and (2). But no, they + // are mutually redundant, since if two blocks are connected by a live + // flow from one to the other, then they are also connected in the other + // direction. Hence checking one of the directions is enough. + let mut eclasses_uf = UnionFind::<usize>::new(triples_len); + + // We have two schemes for group merging, one of which is N^2 in the + // length of triples, the other is N-log-N, but with higher constant + // factors. Some experimentation with the bz2 test on a Cortex A57 puts + // the optimal crossover point between 200 and 300; it's not critical. + // Having this protects us against bad behaviour for huge inputs whilst + // still being fast for small inputs. 
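+ // In either case the net effect is the same. For example: a register defined in block
+ // B1, live through its successor B2, and last used in B2's successor B3 contributes
+ // three `triples` entries -- LiveOut (B1), Thru (B2) and LiveIn (B3). Step (1) above
+ // unions the B1 and B2 entries, and the B2 and B3 entries, so all three end up in one
+ // equivalence class and hence in one live range.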
+ if triples_len <= 250 { + // The simple way, which is N^2 in the length of `triples`. + for (ix, (_fix, kind, bix)) in triples.iter().enumerate() { + // Deal with liveness flows outbound from `fix`. Meaning, (1) above. + if *kind == RangeFragKind::LiveOut || *kind == RangeFragKind::Thru { + for b in cfg_info.succ_map[*bix].iter() { + // Visit all entries in `triples` that are for `b`. + for (ix2, (_fix2, kind2, bix2)) in triples.iter().enumerate() { + if *bix2 != *b || *kind2 == RangeFragKind::LiveOut { + continue; + } + debug_assert!( + *kind2 == RangeFragKind::LiveIn || *kind2 == RangeFragKind::Thru + ); + // Now we know that liveness for this reg "flows" from `triples[ix]` to + // `triples[ix2]`. So those two frags must be part of the same live + // range. Note this. + if ix != ix2 { + eclasses_uf.union(ix, ix2); // Order of args irrelevant + } + } + } + } + } // outermost iteration over `triples` + + stats_num_multi_grps_small += 1; + stats_size_multi_grps_small += triples_len; + } else { + // The more complex way, which is N-log-N in the length of `triples`. This is the same + // as the simple way, except that the innermost loop, which is a linear search in + // `triples` to find entries for some block `b`, is replaced by a binary search. This + // means that `triples` first needs to be sorted by block index. + triples.sort_unstable_by_key(|(_, _, bix)| *bix); + + for (ix, (_fix, kind, bix)) in triples.iter().enumerate() { + // Deal with liveness flows outbound from `fix`. Meaning, (1) above. + if *kind == RangeFragKind::LiveOut || *kind == RangeFragKind::Thru { + for b in cfg_info.succ_map[*bix].iter() { + // Visit all entries in `triples` that are for `b`. Binary search + // `triples` to find the lowest-indexed entry for `b`. + let mut ix_left = 0; + let mut ix_right = triples_len; + while ix_left < ix_right { + let m = (ix_left + ix_right) >> 1; + if triples[m].2 < *b { + ix_left = m + 1; + } else { + ix_right = m; + } + } + + // It might be that there is no block for `b` in the sequence. That's + // legit; it just means that block `bix` jumps to a successor where the + // associated register isn't live-in/thru. A failure to find `b` can be + // indicated one of two ways: + // + // * ix_left == triples_len + // * ix_left < triples_len and b < triples[ix_left].b + // + // In both cases I *think* the 'loop_over_entries_for_b below will not do + // anything. But this is all a bit hairy, so let's convert the second + // variant into the first, so as to make it obvious that the loop won't do + // anything. + + // ix_left now holds the lowest index of any `triples` entry for block `b`. + // Assert this. + if ix_left < triples_len && *b < triples[ix_left].2 { + ix_left = triples_len; + } + if ix_left < triples_len { + assert!(ix_left == 0 || triples[ix_left - 1].2 < *b); + } + + // ix2 plays the same role as in the quadratic version. ix_left and + // ix_right are not used after this point. + let mut ix2 = ix_left; + loop { + let (_fix2, kind2, bix2) = match triples.get(ix2) { + None => break, + Some(triple) => *triple, + }; + if *b < bix2 { + // We've come to the end of the sequence of `b`-blocks. + break; + } + debug_assert!(*b == bix2); + if kind2 == RangeFragKind::LiveOut { + ix2 += 1; + continue; + } + // Now we know that liveness for this reg "flows" from `triples[ix]` to + // `triples[ix2]`. So those two frags must be part of the same live + // range. Note this. 
+ eclasses_uf.union(ix, ix2); + ix2 += 1; + } + + if ix2 + 1 < triples_len { + debug_assert!(*b < triples[ix2 + 1].2); + } + } + } + } + + stats_num_multi_grps_large += 1; + stats_size_multi_grps_large += triples_len; + } + + // Now `eclasses_uf` contains the results of the merging-search. Visit each of its + // equivalence classes in turn, and convert each into a virtual or real live range as + // appropriate. + let eclasses = eclasses_uf.get_equiv_classes(); + for leader_triple_ix in eclasses.equiv_class_leaders_iter() { + // `leader_triple_ix` is an eclass leader. Enumerate the whole eclass. + let mut frag_ixs = SmallVec::<[RangeFragIx; 4]>::new(); + for triple_ix in eclasses.equiv_class_elems_iter(leader_triple_ix) { + frag_ixs.push(triples[triple_ix].0 /*first field is frag ix*/); + } + let sorted_frags = SortedRangeFragIxs::new(frag_ixs, &frag_env); + create_and_add_range( + &mut stats_num_vfrags_uncompressed, + &mut stats_num_vfrags_compressed, + &mut result_real, + &mut result_virtual, + reg, + sorted_frags, + frag_env, + frag_metrics_env, + estimated_frequencies, + ); + } + // END merge `all_frag_ixs_for_reg` entries as much as possible + } // END per reg loop + + info!(" in: {} single groups", stats_num_single_grps); + info!( + " in: {} local frags in multi groups", + stats_num_local_frags + ); + info!( + " in: {} small multi groups, {} small multi group total size", + stats_num_multi_grps_small, stats_size_multi_grps_small + ); + info!( + " in: {} large multi groups, {} large multi group total size", + stats_num_multi_grps_large, stats_size_multi_grps_large + ); + info!( + " out: {} VLRs, {} RLRs", + result_virtual.len(), + result_real.len() + ); + info!( + " compress vfrags: in {}, out {}", + stats_num_vfrags_uncompressed, stats_num_vfrags_compressed + ); + info!(" merge_range_frags: end"); + + (result_real, result_virtual) +} + +//============================================================================= +// Auxiliary activities that mostly fall under the category "dataflow analysis", but are not +// part of the main dataflow analysis pipeline. + +// Dataflow and liveness together create vectors of VirtualRanges and RealRanges. These define +// (amongst other things) mappings from VirtualRanges to VirtualRegs and from RealRanges to +// RealRegs. However, we often need the inverse mappings: from VirtualRegs to (sets of +// VirtualRanges) and from RealRegs to (sets of) RealRanges. This function computes those +// inverse mappings. They are used by BT's coalescing analysis, and for the dataflow analysis +// that supports reftype handling. +#[inline(never)] +pub fn compute_reg_to_ranges_maps<F: Function>( + func: &F, + univ: &RealRegUniverse, + rlr_env: &TypedIxVec<RealRangeIx, RealRange>, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, +) -> RegToRangesMaps { + // Arbitrary, but chosen after quite some profiling, so as to minimise both instruction + // count and number of `malloc` calls. Don't mess with this without first collecting + // comprehensive measurements. Note that if you set this above 255, the type of + // `r/vreg_approx_frag_counts` below will need to change accordingly. + const MANY_FRAGS_THRESH: u8 = 200; + + // Adds `to_add` to `*counter`, taking care not to overflow it in the process. 
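+ // Saturating at 0xFF is fine: the counts are only ever compared against
+ // MANY_FRAGS_THRESH (200), which is below the saturation point, so a saturated count
+ // is still classified correctly.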
+ let add_u8_usize_saturate_to_u8 = |counter: &mut u8, mut to_add: usize| { + if to_add > 0xFF { + to_add = 0xFF; + } + let mut n = *counter as usize; + n += to_add as usize; + // n is at max 0x1FE (510) + if n > 0xFF { + n = 0xFF; + } + *counter = n as u8; + }; + + // We have in hand the virtual live ranges. Each of these carries its + // associated vreg. So in effect we have a VLR -> VReg mapping. We now + // invert that, so as to generate a mapping from VRegs to their containing + // VLRs. + // + // Note that multiple VLRs may map to the same VReg. So the inverse mapping + // will actually be from VRegs to a set of VLRs. In most cases, we expect + // the virtual-registerised-code given to this allocator to be derived from + // SSA, in which case each VReg will have only one VLR. So in this case, + // the cost of first creating the mapping, and then looking up all the VRegs + // in moves in it, will have cost linear in the size of the input function. + // + // NB re the SmallVec. That has set semantics (no dups). + + let num_vregs = func.get_num_vregs(); + let num_rregs = univ.allocable; + + let mut vreg_approx_frag_counts = vec![0u8; num_vregs]; + let mut vreg_to_vlrs_map = vec![SmallVec::<[VirtualRangeIx; 3]>::new(); num_vregs]; + for (vlr, n) in vlr_env.iter().zip(0..) { + let vlrix = VirtualRangeIx::new(n); + let vreg: VirtualReg = vlr.vreg; + // Now we know that there's a VLR `vlr` that is for VReg `vreg`. Update the inverse + // mapping accordingly. We know we are stepping sequentially through the VLR (index) + // space, so we'll never see the same VLRIx twice. Hence there's no need to check for + // dups when adding a VLR index to an existing binding for a VReg. + // + // If this array-indexing fails, it means the client's `.get_num_vregs()` function + // claims there are fewer virtual regs than we actually observe in the code it gave us. + // So it's a bug in the client. + let vreg_index = vreg.get_index(); + vreg_to_vlrs_map[vreg_index].push(vlrix); + + let vlr_num_frags = vlr.sorted_frags.frags.len(); + add_u8_usize_saturate_to_u8(&mut vreg_approx_frag_counts[vreg_index], vlr_num_frags); + } + + // Same for the real live ranges. + let mut rreg_approx_frag_counts = vec![0u8; num_rregs]; + let mut rreg_to_rlrs_map = vec![SmallVec::<[RealRangeIx; 6]>::new(); num_rregs]; + for (rlr, n) in rlr_env.iter().zip(0..) { + let rlrix = RealRangeIx::new(n); + let rreg: RealReg = rlr.rreg; + // If this array-indexing fails, it means something has gone wrong with sanitisation of + // real registers -- that should ensure that we never see a real register with an index + // greater than `univ.allocable`. So it's a bug in the allocator's analysis phases. + let rreg_index = rreg.get_index(); + rreg_to_rlrs_map[rreg_index].push(rlrix); + + let rlr_num_frags = rlr.sorted_frags.frag_ixs.len(); + add_u8_usize_saturate_to_u8(&mut rreg_approx_frag_counts[rreg_index], rlr_num_frags); + } + + // Create sets indicating which regs have "many" live ranges. Hopefully very few. + // Since the `push`ed-in values are supplied by the `zip(0..)` iterator, they are + // guaranteed duplicate-free, as required by the defn of `RegToRangesMaps`. + let mut vregs_with_many_frags = Vec::<u32 /*VirtualReg index*/>::with_capacity(16); + for (count, vreg_ix) in vreg_approx_frag_counts.iter().zip(0..) 
{ + if *count >= MANY_FRAGS_THRESH { + vregs_with_many_frags.push(vreg_ix); + } + } + + let mut rregs_with_many_frags = Vec::<u32 /*RealReg index*/>::with_capacity(64); + for (count, rreg_ix) in rreg_approx_frag_counts.iter().zip(0..) { + if *count >= MANY_FRAGS_THRESH { + rregs_with_many_frags.push(rreg_ix); + } + } + + RegToRangesMaps { + rreg_to_rlrs_map, + vreg_to_vlrs_map, + rregs_with_many_frags, + vregs_with_many_frags, + many_frags_thresh: MANY_FRAGS_THRESH as usize, + } +} + +// Collect info about registers that are connected by moves. +#[inline(never)] +pub fn collect_move_info<F: Function>( + func: &F, + reg_vecs_and_bounds: &RegVecsAndBounds, + est_freqs: &TypedIxVec<BlockIx, u32>, +) -> MoveInfo { + let mut moves = Vec::<MoveInfoElem>::new(); + for b in func.blocks() { + let block_eef = est_freqs[b]; + for iix in func.block_insns(b) { + let insn = &func.get_insn(iix); + let im = func.is_move(insn); + match im { + None => {} + Some((wreg, reg)) => { + let iix_bounds = ®_vecs_and_bounds.bounds[iix]; + // It might seem strange to assert that `defs_len` and/or + // `uses_len` is <= 1 rather than == 1. The reason is + // that either or even both registers might be ones which + // are not available to the allocator. Hence they will + // have been removed by the sanitisation machinery before + // we get to this point. If either is missing, we + // unfortunately can't coalesce the move away, and just + // have to live with it. + // + // If any of the following five assertions fail, the + // client's `is_move` is probably lying to us. + assert!(iix_bounds.uses_len <= 1); + assert!(iix_bounds.defs_len <= 1); + assert!(iix_bounds.mods_len == 0); + if iix_bounds.uses_len == 1 && iix_bounds.defs_len == 1 { + let reg_vecs = ®_vecs_and_bounds.vecs; + assert!(reg_vecs.uses[iix_bounds.uses_start as usize] == reg); + assert!(reg_vecs.defs[iix_bounds.defs_start as usize] == wreg.to_reg()); + let dst = wreg.to_reg(); + let src = reg; + let est_freq = block_eef; + moves.push(MoveInfoElem { + dst, + src, + iix, + est_freq, + }); + } + } + } + } + } + + MoveInfo { moves } +} diff --git a/third_party/rust/regalloc/src/analysis_main.rs b/third_party/rust/regalloc/src/analysis_main.rs new file mode 100644 index 0000000000..105ab338de --- /dev/null +++ b/third_party/rust/regalloc/src/analysis_main.rs @@ -0,0 +1,317 @@ +//! Top level module for all analysis activities. + +use log::{debug, info}; + +use crate::analysis_control_flow::{CFGInfo, InstIxToBlockIxMap}; +use crate::analysis_data_flow::{ + calc_def_and_use, calc_livein_and_liveout, collect_move_info, compute_reg_to_ranges_maps, + get_range_frags, get_sanitized_reg_uses_for_func, merge_range_frags, +}; +use crate::analysis_reftypes::do_reftypes_analysis; +use crate::data_structures::{ + BlockIx, MoveInfo, RangeFrag, RangeFragIx, RangeFragMetrics, RealRange, RealRangeIx, RealReg, + RealRegUniverse, RegClass, RegToRangesMaps, RegVecsAndBounds, TypedIxVec, VirtualRange, + VirtualRangeIx, VirtualReg, +}; +use crate::sparse_set::SparseSet; +use crate::AlgorithmWithDefaults; +use crate::{Function, Reg}; + +//============================================================================= +// Overall analysis return results, for both control- and data-flow analyses. +// All of these failures refer to various problems with the code that the +// client (caller) supplied to us. + +#[derive(Clone, Debug)] +pub enum AnalysisError { + /// A critical edge from "from" to "to" has been found, and should have been + /// removed by the caller in the first place. 
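+ /// (A critical edge is one whose source block has more than one successor and whose
+ /// target block has more than one predecessor; it must be split so that there is a
+ /// block in which edge-specific fixup code can be placed.)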
+ CriticalEdge { from: BlockIx, to: BlockIx },
+
+ /// Some values in the entry block are live-in to the function, but are not
+ /// declared as such.
+ EntryLiveinValues(Vec<Reg>),
+
+ /// The incoming code has an explicit or implicit mention (use, def or mod)
+ /// of a real register, which either (1) isn't listed in the universe at
+ /// all, or (2) is one of the `suggested_scratch` registers in the universe.
+ /// (1) isn't allowed because the client must mention *all* real registers
+ /// in the universe. (2) isn't allowed because the client promises to us
+ /// that the `suggested_scratch` registers really are completely unused in
+ /// the incoming code, so that the allocator can use them at literally any
+ /// point it wants.
+ IllegalRealReg(RealReg),
+
+ /// At least one block is dead.
+ UnreachableBlocks,
+
+ /// Implementation limits exceeded. The incoming function is too big. It
+ /// may contain at most 1 million basic blocks and 16 million instructions.
+ ImplementationLimitsExceeded,
+
+ /// Currently LSRA can't generate stackmaps, but the client has requested LSRA *and*
+ /// stackmaps.
+ LSRACantDoStackmaps,
+}
+
+impl ToString for AnalysisError {
+ fn to_string(&self) -> String {
+ match self {
+ AnalysisError::CriticalEdge { from, to } => {
+ format!("critical edge detected, from {:?} to {:?}", from, to)
+ }
+ AnalysisError::EntryLiveinValues(regs) => {
+ let regs_string = regs.iter().map(|reg| format!("{:?}", reg)).collect::<Vec<_>>().join(", ");
+ format!("entry block has live-in value(s) not present in function liveins: {}", regs_string)
+ }
+ AnalysisError::IllegalRealReg(reg) => {
+ format!("instructions mention real register {:?}, which either isn't defined in the register universe, or is a 'suggested_scratch' register", reg)
+ }
+ AnalysisError::UnreachableBlocks => {
+ "at least one block is unreachable".to_string()
+ }
+ AnalysisError::ImplementationLimitsExceeded => {
+ "implementation limits exceeded (more than 1 million blocks or 16 million insns)".to_string()
+ }
+ AnalysisError::LSRACantDoStackmaps => {
+ "LSRA *and* stackmap creation requested, but this combination is not yet supported".to_string()
+ }
+ }
+ }
+}
+
+//=============================================================================
+// Top level for all analysis activities.
+
+pub struct AnalysisInfo {
+ /// The sanitized per-insn reg-use info
+ pub(crate) reg_vecs_and_bounds: RegVecsAndBounds,
+ /// The real-reg live ranges
+ pub(crate) real_ranges: TypedIxVec<RealRangeIx, RealRange>,
+ /// The virtual-reg live ranges
+ pub(crate) virtual_ranges: TypedIxVec<VirtualRangeIx, VirtualRange>,
+ /// The fragment table
+ pub(crate) range_frags: TypedIxVec<RangeFragIx, RangeFrag>,
+ /// The fragment metrics table
+ pub(crate) range_metrics: TypedIxVec<RangeFragIx, RangeFragMetrics>,
+ /// Estimated execution frequency per block
+ pub(crate) estimated_frequencies: TypedIxVec<BlockIx, u32>,
+ /// Maps InstIxs to BlockIxs
+ pub(crate) inst_to_block_map: InstIxToBlockIxMap,
+ /// Maps from RealRegs to sets of RealRanges and VirtualRegs to sets of VirtualRanges
+ /// (all operating on indices, not the actual objects). This is only generated in
+ /// situations where we need it, hence the `Option`.
+ pub(crate) reg_to_ranges_maps: Option<RegToRangesMaps>,
+ /// Information about registers connected by moves. This is only generated in situations
+ /// where we need it, hence the `Option`.
+ pub(crate) move_info: Option<MoveInfo>, +} + +#[inline(never)] +pub fn run_analysis<F: Function>( + func: &F, + reg_universe: &RealRegUniverse, + algorithm: AlgorithmWithDefaults, + client_wants_stackmaps: bool, + reftype_class: RegClass, + reftyped_vregs: &Vec<VirtualReg>, // as supplied by the client +) -> Result<AnalysisInfo, AnalysisError> { + info!("run_analysis: begin"); + info!( + " run_analysis: {} blocks, {} insns", + func.blocks().len(), + func.insns().len() + ); + + // LSRA can't do reftypes yet. That should have been checked at the top level already. + if client_wants_stackmaps { + assert!(algorithm != AlgorithmWithDefaults::LinearScan); + } + + info!(" run_analysis: begin control flow analysis"); + + // First do control flow analysis. This is (relatively) simple. Note that + // this can fail, for various reasons; we propagate the failure if so. + let cfg_info = CFGInfo::create(func)?; + + // Create the InstIx-to-BlockIx map. This isn't really control-flow + // analysis, but needs to be done at some point. + let inst_to_block_map = InstIxToBlockIxMap::new(func); + + // Annotate each Block with its estimated execution frequency + let mut estimated_frequencies = TypedIxVec::new(); + for bix in func.blocks() { + let mut estimated_frequency = 1; + let depth = u32::min(cfg_info.depth_map[bix], 3); + for _ in 0..depth { + estimated_frequency *= 10; + } + assert!(bix == BlockIx::new(estimated_frequencies.len())); + estimated_frequencies.push(estimated_frequency); + } + + info!(" run_analysis: end control flow analysis"); + + // Now perform dataflow analysis. This is somewhat more complex. + info!(" run_analysis: begin data flow analysis"); + + // See `get_sanitized_reg_uses_for_func` for the meaning of "sanitized". + let reg_vecs_and_bounds = get_sanitized_reg_uses_for_func(func, reg_universe) + .map_err(|reg| AnalysisError::IllegalRealReg(reg))?; + assert!(reg_vecs_and_bounds.is_sanitized()); + + // Calculate block-local def/use sets. + let (def_sets_per_block, use_sets_per_block) = + calc_def_and_use(func, ®_vecs_and_bounds, ®_universe); + debug_assert!(def_sets_per_block.len() == func.blocks().len() as u32); + debug_assert!(use_sets_per_block.len() == func.blocks().len() as u32); + + // Calculate live-in and live-out sets per block, using the traditional + // iterate-to-a-fixed-point scheme. + + // `liveout_sets_per_block` is amended below for return blocks, hence `mut`. + let (livein_sets_per_block, mut liveout_sets_per_block) = calc_livein_and_liveout( + func, + &def_sets_per_block, + &use_sets_per_block, + &cfg_info, + ®_universe, + ); + debug_assert!(livein_sets_per_block.len() == func.blocks().len() as u32); + debug_assert!(liveout_sets_per_block.len() == func.blocks().len() as u32); + + // Verify livein set of entry block against liveins specified by function + // (e.g., ABI params). + let func_liveins = SparseSet::from_vec( + func.func_liveins() + .to_vec() + .into_iter() + .map(|rreg| rreg.to_reg()) + .collect(), + ); + if !livein_sets_per_block[func.entry_block()].is_subset_of(&func_liveins) { + let mut regs = livein_sets_per_block[func.entry_block()].clone(); + regs.remove(&func_liveins); + return Err(AnalysisError::EntryLiveinValues(regs.to_vec())); + } + + // Add function liveouts to every block ending in a return. 
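+ // Without this, a write to (say) an ABI return-value register in a returning block would
+ // appear to be dead, and its live range would be lost.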
+ let func_liveouts = SparseSet::from_vec( + func.func_liveouts() + .to_vec() + .into_iter() + .map(|rreg| rreg.to_reg()) + .collect(), + ); + for block in func.blocks() { + let last_iix = func.block_insns(block).last(); + if func.is_ret(last_iix) { + liveout_sets_per_block[block].union(&func_liveouts); + } + } + + info!(" run_analysis: end data flow analysis"); + + // Dataflow analysis is now complete. Now compute the virtual and real live + // ranges, in two steps: (1) compute RangeFrags, and (2) merge them + // together, guided by flow and liveness info, so as to create the final + // VirtualRanges and RealRanges. + info!(" run_analysis: begin liveness analysis"); + + let (frag_ixs_per_reg, frag_env, frag_metrics_env, vreg_classes) = get_range_frags( + func, + ®_vecs_and_bounds, + ®_universe, + &livein_sets_per_block, + &liveout_sets_per_block, + ); + + // These have to be mut because they may get changed below by the call to + // `to_reftypes_analysis`. + let (mut rlr_env, mut vlr_env) = merge_range_frags( + &frag_ixs_per_reg, + &frag_env, + &frag_metrics_env, + &estimated_frequencies, + &cfg_info, + ®_universe, + &vreg_classes, + ); + + debug_assert!(liveout_sets_per_block.len() == estimated_frequencies.len()); + + debug!(""); + let mut n = 0; + for rlr in rlr_env.iter() { + debug!( + "{:<4?} {}", + RealRangeIx::new(n), + rlr.show_with_rru(®_universe) + ); + n += 1; + } + + debug!(""); + n = 0; + for vlr in vlr_env.iter() { + debug!("{:<4?} {:?}", VirtualRangeIx::new(n), vlr); + n += 1; + } + + // Now a bit of auxiliary info collection, which isn't really either control- or data-flow + // analysis. + + // For BT and/or reftypes, we'll also need the reg-to-ranges maps. + let reg_to_ranges_maps = + if client_wants_stackmaps || algorithm == AlgorithmWithDefaults::Backtracking { + Some(compute_reg_to_ranges_maps( + func, + ®_universe, + &rlr_env, + &vlr_env, + )) + } else { + None + }; + + // For BT and/or reftypes, we'll also need information about moves. + let move_info = if client_wants_stackmaps || algorithm == AlgorithmWithDefaults::Backtracking { + Some(collect_move_info( + func, + ®_vecs_and_bounds, + &estimated_frequencies, + )) + } else { + None + }; + + info!(" run_analysis: end liveness analysis"); + + if client_wants_stackmaps { + info!(" run_analysis: begin reftypes analysis"); + do_reftypes_analysis( + &mut rlr_env, + &mut vlr_env, + &frag_env, + reg_to_ranges_maps.as_ref().unwrap(), /* safe because of logic just above */ + &move_info.as_ref().unwrap(), /* ditto */ + reftype_class, + reftyped_vregs, + ); + info!(" run_analysis: end reftypes analysis"); + } + + info!("run_analysis: end"); + + Ok(AnalysisInfo { + reg_vecs_and_bounds, + real_ranges: rlr_env, + virtual_ranges: vlr_env, + range_frags: frag_env, + range_metrics: frag_metrics_env, + estimated_frequencies, + inst_to_block_map, + reg_to_ranges_maps, + move_info, + }) +} diff --git a/third_party/rust/regalloc/src/analysis_reftypes.rs b/third_party/rust/regalloc/src/analysis_reftypes.rs new file mode 100644 index 0000000000..2a0aafa0d2 --- /dev/null +++ b/third_party/rust/regalloc/src/analysis_reftypes.rs @@ -0,0 +1,137 @@ +//! Performs a simple taint analysis, to find all live ranges that are reftyped. 
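+//!
+//! The propagation performed below is a plain transitive closure over a "copied-into" graph,
+//! seeded with the ranges of the vregs the client declared as reftyped. A minimal standalone
+//! sketch of the same idea (illustrative only; the names here are hypothetical and not part of
+//! this crate):
+//!
+//! ```ignore
+//! use std::collections::{HashMap, HashSet};
+//!
+//! /// `moves` maps a source range id to the range ids it is copied into.
+//! fn propagate_refness(seeds: &[u32], moves: &HashMap<u32, Vec<u32>>) -> HashSet<u32> {
+//!     // Everything in the seed set is reftyped by definition.
+//!     let mut reffy: HashSet<u32> = seeds.iter().copied().collect();
+//!     let mut stack: Vec<u32> = seeds.to_vec();
+//!     // Depth-first search: refness flows along every move edge.
+//!     while let Some(src) = stack.pop() {
+//!         for &dst in moves.get(&src).into_iter().flatten() {
+//!             if reffy.insert(dst) {
+//!                 stack.push(dst);
+//!             }
+//!         }
+//!     }
+//!     reffy
+//! }
+//! ```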
+ +use crate::data_structures::{ + InstPoint, Map, MoveInfo, MoveInfoElem, RangeFrag, RangeFragIx, RangeId, RealRange, + RealRangeIx, Reg, RegClass, RegToRangesMaps, TypedIxVec, VirtualRange, VirtualRangeIx, + VirtualReg, +}; +use crate::sparse_set::{SparseSet, SparseSetU}; + +use log::debug; +use smallvec::SmallVec; + +pub fn do_reftypes_analysis( + // From dataflow/liveness analysis. Modified by setting their is_ref bit. + rlr_env: &mut TypedIxVec<RealRangeIx, RealRange>, + vlr_env: &mut TypedIxVec<VirtualRangeIx, VirtualRange>, + // From dataflow analysis + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + reg_to_ranges_maps: &RegToRangesMaps, + move_info: &MoveInfo, + // As supplied by the client + reftype_class: RegClass, + reftyped_vregs: &Vec<VirtualReg>, +) { + // Helper: find the RangeId (RealRange or VirtualRange) for a register at an InstPoint. + let find_range_id_for_reg = |pt: InstPoint, reg: Reg| -> RangeId { + if reg.is_real() { + for &rlrix in ®_to_ranges_maps.rreg_to_rlrs_map[reg.get_index() as usize] { + if rlr_env[rlrix].sorted_frags.contains_pt(frag_env, pt) { + return RangeId::new_real(rlrix); + } + } + } else { + for &vlrix in ®_to_ranges_maps.vreg_to_vlrs_map[reg.get_index() as usize] { + if vlr_env[vlrix].sorted_frags.contains_pt(pt) { + return RangeId::new_virtual(vlrix); + } + } + } + panic!("do_reftypes_analysis::find_range_for_reg: can't find range"); + }; + + // The game here is: starting with `reftyped_vregs`, find *all* the VirtualRanges and + // RealRanges to which refness can flow, via instructions which the client's `is_move` + // function considers to be moves. + + // This is done in three stages: + // + // (1) Create a mapping from source (virtual or real) ranges to sets of destination ranges. + // We have `move_info`, which tells us which (virtual or real) regs are connected by + // moves. However, that's not directly useful -- we need to know which *ranges* are + // connected by moves. `move_info` as supplied helpfully indicates both source and + // destination regs and ranges, so we can simply use that. + // + // (2) Similarly, convert `reftyped_vregs` into a set of reftyped ranges by consulting + // `reg_to_ranges_maps`. + // + // (3) Compute the transitive closure of (1) starting from the ranges in (2). This is done + // by a depth first search of the graph implied by (1). + + // ====== Compute (1) above ====== + // Each entry in `succ` maps from `src` to a `SparseSet<dsts>`, so to speak. That is, for + // `d1`, `d2`, etc, in `dsts`, the function contains moves `d1 := src`, `d2 := src`, etc. + let mut succ = Map::<RangeId, SparseSetU<[RangeId; 4]>>::default(); + for &MoveInfoElem { dst, src, iix, .. } in &move_info.moves { + // Don't waste time processing moves which can't possibly be of reftyped values. + debug_assert!(dst.get_class() == src.get_class()); + if dst.get_class() != reftype_class { + continue; + } + let src_range = find_range_id_for_reg(InstPoint::new_use(iix), src); + let dst_range = find_range_id_for_reg(InstPoint::new_def(iix), dst); + debug!( + "move from {:?} (range {:?}) to {:?} (range {:?}) at inst {:?}", + src, src_range, dst, dst_range, iix + ); + match succ.get_mut(&src_range) { + Some(dst_ranges) => dst_ranges.insert(dst_range), + None => { + // Re `; 4`: we expect most copies copy a register to only a few destinations. 
+ let mut dst_ranges = SparseSetU::<[RangeId; 4]>::empty(); + dst_ranges.insert(dst_range); + let r = succ.insert(src_range, dst_ranges); + assert!(r.is_none()); + } + } + } + + // ====== Compute (2) above ====== + let mut reftyped_ranges = SparseSet::<RangeId>::empty(); + for vreg in reftyped_vregs { + // If this fails, the client has been telling is that some virtual reg is reftyped, yet + // it doesn't belong to the class of regs that it claims can carry refs. So the client + // is buggy. + debug_assert!(vreg.get_class() == reftype_class); + for vlrix in ®_to_ranges_maps.vreg_to_vlrs_map[vreg.get_index()] { + debug!("range {:?} is reffy due to reffy vreg {:?}", vlrix, vreg); + reftyped_ranges.insert(RangeId::new_virtual(*vlrix)); + } + } + + // ====== Compute (3) above ====== + // Almost all chains of copies will be less than 64 long, I would guess. + let mut stack = SmallVec::<[RangeId; 64]>::new(); + let mut visited = reftyped_ranges.clone(); + for start_point_range in reftyped_ranges.iter() { + // Perform DFS from `start_point_range`. + stack.clear(); + stack.push(*start_point_range); + while let Some(src_range) = stack.pop() { + visited.insert(src_range); + if let Some(dst_ranges) = succ.get(&src_range) { + for dst_range in dst_ranges.iter() { + if !visited.contains(*dst_range) { + stack.push(*dst_range); + } + } + } + } + } + + // Finally, annotate rlr_env/vlr_env with the results of the analysis. (That was the whole + // point!) + for range in visited.iter() { + if range.is_real() { + let rrange = &mut rlr_env[range.to_real()]; + debug_assert!(!rrange.is_ref); + debug!(" -> rrange {:?} is reffy", range.to_real()); + rrange.is_ref = true; + } else { + let vrange = &mut vlr_env[range.to_virtual()]; + debug_assert!(!vrange.is_ref); + debug!(" -> rrange {:?} is reffy", range.to_virtual()); + vrange.is_ref = true; + } + } +} diff --git a/third_party/rust/regalloc/src/avl_tree.rs b/third_party/rust/regalloc/src/avl_tree.rs new file mode 100644 index 0000000000..e42208425f --- /dev/null +++ b/third_party/rust/regalloc/src/avl_tree.rs @@ -0,0 +1,1281 @@ +//! AVL trees with a private allocation pool. +//! +//! AVL tree internals are public, so that backtracking.rs can do custom +//! traversals of the tree as it wishes. + +use smallvec::SmallVec; +use std::cmp::Ordering; + +//============================================================================= +// Data structures for AVLTree + +#[derive(Clone, PartialEq)] +pub enum AVLTag { + Free, // This pool entry is not in use + None, // This pool entry is in use. Neither subtree is higher. + Left, // This pool entry is in use. The left subtree is higher. + Right, // This pool entry is in use. The right subtree is higher. +} + +#[derive(Clone)] +pub struct AVLNode<T> { + pub tag: AVLTag, + pub left: u32, + pub right: u32, + pub item: T, +} +impl<T> AVLNode<T> { + fn new(tag: AVLTag, left: u32, right: u32, item: T) -> Self { + Self { + tag, + left, + right, + item, + } + } +} + +pub const AVL_NULL: u32 = 0xFFFF_FFFF; + +pub struct AVLTree<T> { + // The storage area. There can be at most 2^32-2 entries, since AVL_NULL + // (== 2^32-1) is used to mean "the null pointer". + pub pool: Vec<AVLNode<T>>, + // A default value for the stored item. We don't care what this is; + // unfortunately Rust forces us to have one so that additions to the free + // list will be fully initialised. + default: T, + // The freelist head. This is a list of available entries. 
Each item on + // the freelist must have its .tag be AVLTag::Free, and will use its .left + // field as the link to the next freelist item. A freelist link value of + // AVL_NULL denotes the end of the list. If `freelist` itself is AVL_NULL + // then the list is empty. + freelist: u32, + // Last but not least, the root node. + pub root: u32, +} + +//============================================================================= +// Storage management functions for AVLTree + +impl<T: Clone> AVLTree<T> { + // Create a new tree and its associated storage pool. This requires knowing + // the default item value. + pub fn new(default: T) -> Self { + // Pre-allocate a few entries so as to save a few reallocs later, on the + // assumption that most trees will get quite large. + let pool = Vec::with_capacity(16); + let freelist = AVL_NULL; + let root = AVL_NULL; + Self { + pool, + default, + freelist, + root, + } + } + + // Private function: free a tree node and put it back on the storage pool's + // freelist. + fn free(&mut self, index: u32) { + assert!(index != AVL_NULL); + assert!(self.pool[index as usize].tag != AVLTag::Free); + self.pool[index as usize] = + AVLNode::new(AVLTag::Free, self.freelist, AVL_NULL, self.default.clone()); + self.freelist = index; + } + + // Private function: allocate a tree node from the storage pool, resizing + // the pool if necessary. This will decline to expand the tree past about + // 1.75 billion items. + fn alloc(&mut self) -> u32 { + // Check to see if the freelist is empty, and if so allocate a bunch more + // slots. + if self.freelist == AVL_NULL { + let start = self.pool.len(); + let fill_item = AVLNode::new(AVLTag::Free, AVL_NULL, AVL_NULL, self.default.clone()); + // What happens if this OOMs? At least guard against u32 overflow by + // doing this: + if start >= 0x7000_0000 { + // 1.75 billion elements in the tree should be enough for any + // reasonable use of this register allocator. + panic!("AVLTree<T>::alloc: too many items"); + } + self.pool.resize(2 * start + 2, fill_item); + let end_plus_1 = self.pool.len(); + debug_assert!(end_plus_1 >= 2); + self.pool[end_plus_1 - 1].left = self.freelist; + let mut i = end_plus_1 - 2; + while i >= start { + // The entry is already marked as free, but we must set the link. + self.pool[i].left = i as u32 + 1; + if i == 0 { + break; + } + i -= 1; + } + self.freelist = start as u32; + debug_assert!(self.freelist != AVL_NULL); + } + // And now allocate. + let new = self.freelist; + assert!(self.pool[new as usize].tag == AVLTag::Free); + // The caller is responsible for filling in the entry. But at least set + // the tag to non-Free, for sanity. + self.pool[new as usize].tag = AVLTag::None; + self.freelist = self.pool[new as usize].left; + new + } +} + +//============================================================================= +// Tree-wrangling machinery for AVLTree (private) + +// For the public interface, see below. + +// The functions 'insert' and 'delete', and all supporting functions reachable +// from them, are derived from a public domain implementation by Georg Kraml. +// Unfortunately the relevant web site is long gone, and can only be found on +// the Wayback Machine. 
+// +// https://web.archive.org/web/20010419134337/ +// http://www.kraml.at/georg/avltree/index.html +// +// https://web.archive.org/web/20030926063347/ +// http://www.kraml.at/georg/avltree/avlmonolithic.c +// +// https://web.archive.org/web/20030401124003/http://www.kraml.at/src/howto/ +// +// For relicensing clearance, see Mozilla bug 1620332, at +// https://bugzilla.mozilla.org/show_bug.cgi?id=1620332. + +// Did a given insertion/deletion succeed, and what do we do next? +#[derive(Clone, Copy, PartialEq)] +enum AVLRes { + Error, + OK, + Balance, +} + +impl<T: Clone + PartialOrd> AVLTree<T> { + // Private function: rotleft: perform counterclockwise rotation + // Takes the root of the tree to rotate, returns the new root + fn rotleft(&mut self, old_root: u32) -> u32 { + let new_root = self.pool[old_root as usize].right; + self.pool[old_root as usize].right = self.pool[new_root as usize].left; + self.pool[new_root as usize].left = old_root; + new_root + } + + // Private function: rotright: perform clockwise rotation + // Takes the root of the tree to rotate, returns the new root + fn rotright(&mut self, old_root: u32) -> u32 { + let new_root = self.pool[old_root as usize].left; + self.pool[old_root as usize].left = self.pool[new_root as usize].right; + self.pool[new_root as usize].right = old_root; + new_root + } + + // Private function: leftgrown: helper function for `insert` + // + // Parameters: + // + // root Root of a tree. This node's left + // subtree has just grown due to item insertion; its + // "tag" flag needs adjustment, and the local tree + // (the subtree of which this node is the root node) may + // have become unbalanced. + // + // Return values: + // + // The new root of the subtree, plus either: + // + // OK The local tree could be rebalanced or was balanced + // from the start. The parent activations of the avlinsert + // activation that called this function may assume the + // entire tree is valid. + // or + // BALANCE The local tree was balanced, but has grown in height. + // Do not assume the entire tree is valid. + // + // This function has been split into two pieces: `leftgrown`, which is small and hot, and is + // marked always-inline, and `leftgrown_left`, which handles a more complex and less + // frequent case, and is marked never-inline. The intent is to have the common case always + // inlined without having to deal with the extra register pressure from inlining the less + // frequent code. The dual function `rightgrown` is split similarly. 
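// --- Editorial sketch, not part of the vendored source ---
// The hot/cold split described in the comment above, reduced to a minimal
// standalone form: the cheap common case stays in a tiny #[inline(always)]
// wrapper, while the rarer complex case is pushed into an #[inline(never)]
// helper so its register pressure is not paid at every call site. The names
// and bodies below are purely illustrative.
#[inline(never)]
fn cold_case(x: u32) -> u32 {
    // stands in for the more complex, less frequent rebalancing work
    x.rotate_left(7).wrapping_mul(0x9E37_79B9)
}

#[inline(always)]
fn hot_wrapper(x: u32) -> u32 {
    if x & 1 == 0 {
        x + 1 // common case, handled inline
    } else {
        cold_case(x) // rare case, handled out of line
    }
}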
+ #[inline(never)] + fn leftgrown_left(&mut self, mut root: u32) -> (u32, AVLRes) { + if self.pool[self.pool[root as usize].left as usize].tag == AVLTag::Left { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].left; + self.pool[t as usize].tag = AVLTag::None; + root = self.rotright(root); + } else { + match self.pool[self.pool[self.pool[root as usize].left as usize].right as usize].tag { + AVLTag::Left => { + self.pool[root as usize].tag = AVLTag::Right; + let t = self.pool[root as usize].left; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Right => { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].left; + self.pool[t as usize].tag = AVLTag::Left; + } + AVLTag::None => { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].left; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Free => panic!("AVLTree::leftgrown_left: unallocated node in tree"), + } + let t = self.pool[self.pool[root as usize].left as usize].right; + self.pool[t as usize].tag = AVLTag::None; + self.pool[root as usize].left = self.rotleft(self.pool[root as usize].left); + root = self.rotright(root); + } + return (root, AVLRes::OK); + } + + #[inline(always)] + fn leftgrown(&mut self, root: u32) -> (u32, AVLRes) { + let root_node = &mut self.pool[root as usize]; + match root_node.tag { + AVLTag::Left => self.leftgrown_left(root), + AVLTag::Right => { + root_node.tag = AVLTag::None; + return (root, AVLRes::OK); + } + AVLTag::None => { + root_node.tag = AVLTag::Left; + return (root, AVLRes::Balance); + } + AVLTag::Free => panic!("AVLTree::leftgrown: unallocated node in tree"), + } + } + + // Private function: rightgrown: helper function for `insert` + // + // See leftgrown for details. 
+ #[inline(never)] + fn rightgrown_right(&mut self, mut root: u32) -> (u32, AVLRes) { + if self.pool[self.pool[root as usize].right as usize].tag == AVLTag::Right { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].right as usize; + self.pool[t].tag = AVLTag::None; + root = self.rotleft(root); + } else { + match self.pool[self.pool[self.pool[root as usize].right as usize].left as usize].tag { + AVLTag::Right => { + self.pool[root as usize].tag = AVLTag::Left; + let t = self.pool[root as usize].right; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Left => { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].right; + self.pool[t as usize].tag = AVLTag::Right; + } + AVLTag::None => { + self.pool[root as usize].tag = AVLTag::None; + let t = self.pool[root as usize].right; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Free => panic!("AVLTree::rightgrown_right: unallocated node in tree"), + } + let t = self.pool[self.pool[root as usize].right as usize].left; + self.pool[t as usize].tag = AVLTag::None; + self.pool[root as usize].right = self.rotright(self.pool[root as usize].right); + root = self.rotleft(root); + } + return (root, AVLRes::OK); + } + + #[inline(always)] + fn rightgrown(&mut self, root: u32) -> (u32, AVLRes) { + match self.pool[root as usize].tag { + AVLTag::Left => { + self.pool[root as usize].tag = AVLTag::None; + return (root, AVLRes::OK); + } + AVLTag::Right => self.rightgrown_right(root), + AVLTag::None => { + self.pool[root as usize].tag = AVLTag::Right; + return (root, AVLRes::Balance); + } + AVLTag::Free => panic!("AVLTree::rightgrown: unallocated node in tree"), + } + } + + // Private function: insert_wrk: insert a node into the AVL tree + // (worker function) + // + // Parameters: + // + // root Root of the tree in whch to insert `d`. + // + // item Item to be inserted. + // + // Returns AVL_NULL if the value is already in the tree. Otherwise returns the index of the + // new root (which is obviously always non-AVL_NULL). This is infallible in the sense that, + // if allocation of a new node fails, it won't return -- `self.alloc()` will panic. + // + // This function relies on the fact that any non-AVL_NULL value will have its top bit (bit + // 31) clear, since that bit is used as a boolean in the `stack`. That property is + // guaranteed us by `fn alloc`, which ensures that the max number of nodes in the tree is + // 0x70000000. + fn insert_wrk<F>(&mut self, mut root: u32, item: T, mb_cmp: Option<&F>) -> u32 + where + F: Fn(T, T) -> Option<Ordering>, + { + #[inline(always)] + fn stack_entry_set_is_left(node: u32) -> u32 { + node | 0x8000_0000 + } + #[inline(always)] + fn stack_entry_get_is_left(ent: u32) -> u32 { + ent & 0x8000_0000 + } + #[inline(always)] + fn stack_entry_get_node(ent: u32) -> u32 { + ent & 0x7FFF_FFFF + } + + // The stack will hold a root-leaf path. Give that the max number of elements allowed + // in the tree is 0x70000000, which is (7/8) * 2^31, and that the max depth is 1.44 * + // log2(elems), a 64 entry stack should always be sufficient. Hence there should never + // be any dynamic allocation here. In fact a 48 entry stack would also suffice, but + // SmallVec doesn't provide that size. + let mut stack = SmallVec::<[u32; 64]>::new(); + + // In the first phase, walk down the tree to find the place where the new node should be + // inserted. This loop is cloned so as to allow the test on `mb_cmp` to be done just + // once. 
+ match mb_cmp { + None => { + while root != AVL_NULL { + let cmp_loc_right = &self.pool[root as usize]; + let cmp_arg_right: T = cmp_loc_right.item.clone(); + let cmp_arg_left: T = item.clone(); + debug_assert!(stack_entry_get_is_left(root) == 0); + match cmp_arg_left.partial_cmp(&cmp_arg_right) { + None => panic!("AVLTree::insert_wrk: unordered elements(1)"), + Some(Ordering::Less) => { + stack.push(stack_entry_set_is_left(root)); + root = cmp_loc_right.left; + } + Some(Ordering::Greater) => { + stack.push(root); + root = cmp_loc_right.right; + } + Some(Ordering::Equal) => { + // Item is already in the tree. + return AVL_NULL; + } + } + } + } + Some(cmp) => { + while root != AVL_NULL { + let cmp_loc_right = &self.pool[root as usize]; + let cmp_arg_right: T = cmp_loc_right.item.clone(); + let cmp_arg_left: T = item.clone(); + debug_assert!(stack_entry_get_is_left(root) == 0); + match cmp(cmp_arg_left, cmp_arg_right) { + None => panic!("AVLTree::insert_wrk: unordered elements(2)"), + Some(Ordering::Less) => { + stack.push(stack_entry_set_is_left(root)); + root = cmp_loc_right.left; + } + Some(Ordering::Greater) => { + stack.push(root); + root = cmp_loc_right.right; + } + Some(Ordering::Equal) => { + // Item is already in the tree. + return AVL_NULL; + } + } + } + } + } + + // Now allocate the new node. + debug_assert!(root == AVL_NULL); + let new_node = self.alloc(); + self.pool[new_node as usize] = AVLNode::new(AVLTag::None, AVL_NULL, AVL_NULL, item.clone()); + + // And unwind the stack, back to the root, rebalancing as we go. Once get to a place + // where the new subtree doesn't need to be rebalanced, we can stop this upward scan, + // because no nodes above it will need to be rebalanced either. + let mut curr_node = new_node; + let mut curr_node_action = AVLRes::Balance; + + while let Some(parent_node_tagged) = stack.pop() { + let parent_node = stack_entry_get_node(parent_node_tagged); + if stack_entry_get_is_left(parent_node_tagged) != 0 { + self.pool[parent_node as usize].left = curr_node; + if curr_node_action == AVLRes::Balance { + let pair = self.leftgrown(parent_node); + curr_node = pair.0; + curr_node_action = pair.1; + } else { + curr_node = parent_node; + break; + } + } else { + self.pool[parent_node as usize].right = curr_node; + if curr_node_action == AVLRes::Balance { + let pair = self.rightgrown(parent_node); + curr_node = pair.0; + curr_node_action = pair.1; + } else { + curr_node = parent_node; + break; + } + } + } + + if !stack.is_empty() { + curr_node = stack_entry_get_node(stack[0]); + } + + debug_assert!(curr_node != AVL_NULL); + return curr_node; + } + + // Private function: leftshrunk: helper function for delete and + // findlowest + // + // Parameters: + // + // n Address of a pointer to a node. The node's left + // subtree has just shrunk due to item removal; its + // "skew" flag needs adjustment, and the local tree + // (the subtree of which this node is the root node) may + // have become unbalanced. + // + // Return values: + // + // OK The parent activation of the delete activation + // that called this function may assume the entire + // tree is valid. + // + // BALANCE Do not assume the entire tree is valid. 
+ fn leftshrunk(&mut self, mut n: u32) -> (u32, AVLRes) { + match self.pool[n as usize].tag { + AVLTag::Left => { + self.pool[n as usize].tag = AVLTag::None; + return (n, AVLRes::Balance); + } + AVLTag::Right => { + if self.pool[self.pool[n as usize].right as usize].tag == AVLTag::Right { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].right; + self.pool[t as usize].tag = AVLTag::None; + n = self.rotleft(n); + return (n, AVLRes::Balance); + } else if self.pool[self.pool[n as usize].right as usize].tag == AVLTag::None { + self.pool[n as usize].tag = AVLTag::Right; + let t = self.pool[n as usize].right; + self.pool[t as usize].tag = AVLTag::Left; + n = self.rotleft(n); + return (n, AVLRes::OK); + } else { + match self.pool[self.pool[self.pool[n as usize].right as usize].left as usize] + .tag + { + AVLTag::Left => { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].right; + self.pool[t as usize].tag = AVLTag::Right; + } + AVLTag::Right => { + self.pool[n as usize].tag = AVLTag::Left; + let t = self.pool[n as usize].right; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::None => { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].right; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Free => { + panic!("AVLTree::leftshrunk(1): unallocated node in tree"); + } + } + { + let t = self.pool[self.pool[n as usize].right as usize].left; + self.pool[t as usize].tag = AVLTag::None; + } + { + let t = self.rotright(self.pool[n as usize].right); + self.pool[n as usize].right = t; + } + n = self.rotleft(n); + return (n, AVLRes::Balance); + } + } + AVLTag::None => { + self.pool[n as usize].tag = AVLTag::Right; + return (n, AVLRes::OK); + } + AVLTag::Free => { + panic!("AVLTree::leftshrunk(2): unallocated node in tree"); + } + } + } + + // Private function: rightshrunk: helper function for delete and + // findhighest + // + // See leftshrunk for details. 
+ fn rightshrunk(&mut self, mut n: u32) -> (u32, AVLRes) { + match self.pool[n as usize].tag { + AVLTag::Right => { + self.pool[n as usize].tag = AVLTag::None; + return (n, AVLRes::Balance); + } + AVLTag::Left => { + if self.pool[self.pool[n as usize].left as usize].tag == AVLTag::Left { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].left; + self.pool[t as usize].tag = AVLTag::None; + n = self.rotright(n); + return (n, AVLRes::Balance); + } else if self.pool[self.pool[n as usize].left as usize].tag == AVLTag::None { + self.pool[n as usize].tag = AVLTag::Left; + let t = self.pool[n as usize].left; + self.pool[t as usize].tag = AVLTag::Right; + n = self.rotright(n); + return (n, AVLRes::OK); + } else { + match self.pool[self.pool[self.pool[n as usize].left as usize].right as usize] + .tag + { + AVLTag::Left => { + self.pool[n as usize].tag = AVLTag::Right; + let t = self.pool[n as usize].left; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Right => { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].left; + self.pool[t as usize].tag = AVLTag::Left; + } + AVLTag::None => { + self.pool[n as usize].tag = AVLTag::None; + let t = self.pool[n as usize].left; + self.pool[t as usize].tag = AVLTag::None; + } + AVLTag::Free => { + panic!("AVLTree::rightshrunk(1): unallocated node in tree"); + } + } + { + let t = self.pool[self.pool[n as usize].left as usize].right; + self.pool[t as usize].tag = AVLTag::None; + } + { + let t = self.rotleft(self.pool[n as usize].left); + self.pool[n as usize].left = t; + } + n = self.rotright(n); + return (n, AVLRes::Balance); + } + } + AVLTag::None => { + self.pool[n as usize].tag = AVLTag::Left; + return (n, AVLRes::OK); + } + AVLTag::Free => { + panic!("AVLTree::rightshrunk(2): unallocated node in tree"); + } + } + } + + // Private function: findhighest: replace a node with a subtree's + // highest-ranking item. + // + // Parameters: + // + // target Pointer to node to be replaced. + // + // n Address of pointer to subtree. + // + // res Pointer to variable used to tell the caller whether + // further checks are necessary; analog to the return + // values of leftgrown and leftshrunk (see there). + // + // Return values: + // + // True A node was found; the target node has been replaced. + // + // False The target node could not be replaced because + // the subtree provided was empty. + // + fn findhighest(&mut self, target: u32, mut n: u32) -> Option<(u32, AVLRes)> { + if n == AVL_NULL { + return None; + } + let mut res = AVLRes::Balance; + if self.pool[n as usize].right != AVL_NULL { + let rec = self.findhighest(target, self.pool[n as usize].right); + if let Some((new_n_right, new_res)) = rec { + self.pool[n as usize].right = new_n_right; + res = new_res; + if res == AVLRes::Balance { + let (new_n, new_res) = self.rightshrunk(n); + n = new_n; + res = new_res; + } + return Some((n, res)); + } else { + return None; + } + } + self.pool[target as usize].item = self.pool[n as usize].item.clone(); + let tmp = n; + n = self.pool[n as usize].left; + self.free(tmp); + Some((n, res)) + } + + // Private function: findlowest: replace node with a subtree's + // lowest-ranking item. + // + // See findhighest for the details. 
+ fn findlowest(&mut self, target: u32, mut n: u32) -> Option<(u32, AVLRes)> { + if n == AVL_NULL { + return None; + } + let mut res = AVLRes::Balance; + if self.pool[n as usize].left != AVL_NULL { + let rec = self.findlowest(target, self.pool[n as usize].left); + if let Some((new_n_left, new_res)) = rec { + self.pool[n as usize].left = new_n_left; + res = new_res; + if res == AVLRes::Balance { + let (new_n, new_res) = self.leftshrunk(n); + n = new_n; + res = new_res; + } + return Some((n, res)); + } else { + return None; + } + } + self.pool[target as usize].item = self.pool[n as usize].item.clone(); + let tmp = n; + n = self.pool[n as usize].right; + self.free(tmp); + Some((n, res)) + } + + // Private function: delete_wrk: delete an item from the tree. + // (worker function) + // + // Parameters: + // + // n Address of a pointer to a node. + // + // key AVLKEY of item to be removed. + // + // Return values: + // + // nonzero The item has been removed. The exact value of + // nonzero yields if of no concern to user code; when + // delete recursively calls itself, the number + // returned tells the parent activation if the AVL tree + // may have become unbalanced; specifically: + // + // OK None of the subtrees of the node that n points to + // has shrunk, the AVL tree is valid. + // + // BALANCE One of the subtrees of the node that n points to + // has shrunk, the node's "skew" flag needs adjustment, + // and the AVL tree may have become unbalanced. + // + // zero The tree does not contain an item yielding the + // AVLKEY value provided by the caller. + fn delete_wrk<F>(&mut self, mut root: u32, item: T, mb_cmp: Option<&F>) -> (u32, AVLRes) + where + F: Fn(T, T) -> Option<Ordering>, + { + let mut tmp = AVLRes::Balance; + if root == AVL_NULL { + return (root, AVLRes::Error); + } + + let cmp_arg_left: T = item.clone(); + let cmp_arg_right: T = self.pool[root as usize].item.clone(); + let cmp_res = match mb_cmp { + None => cmp_arg_left.partial_cmp(&cmp_arg_right), + Some(cmp) => cmp(cmp_arg_left, cmp_arg_right), + }; + match cmp_res { + None => panic!("AVLTree::delete_wrk: unordered elements"), + Some(Ordering::Less) => { + let root_left = self.pool[root as usize].left; + let (new_root_left, new_tmp) = self.delete_wrk(root_left, item, mb_cmp); + self.pool[root as usize].left = new_root_left; + tmp = new_tmp; + if tmp == AVLRes::Balance { + let (new_root, new_res) = self.leftshrunk(root); + root = new_root; + tmp = new_res; + } + return (root, tmp); + } + Some(Ordering::Greater) => { + let root_right = self.pool[root as usize].right; + let (new_root_right, new_tmp) = self.delete_wrk(root_right, item, mb_cmp); + self.pool[root as usize].right = new_root_right; + tmp = new_tmp; + if tmp == AVLRes::Balance { + let (new_root, new_res) = self.rightshrunk(root); + root = new_root; + tmp = new_res; + } + return (root, tmp); + } + Some(Ordering::Equal) => { + if self.pool[root as usize].left != AVL_NULL { + let root_left = self.pool[root as usize].left; + if let Some((new_root_left, new_tmp)) = self.findhighest(root, root_left) { + self.pool[root as usize].left = new_root_left; + tmp = new_tmp; + if new_tmp == AVLRes::Balance { + let (new_root, new_res) = self.leftshrunk(root); + root = new_root; + tmp = new_res; + } + } + return (root, tmp); + } + if self.pool[root as usize].right != AVL_NULL { + let root_right = self.pool[root as usize].right; + if let Some((new_root_right, new_tmp)) = self.findlowest(root, root_right) { + self.pool[root as usize].right = new_root_right; + tmp = new_tmp; + if 
new_tmp == AVLRes::Balance { + let (new_root, new_res) = self.rightshrunk(root); + root = new_root; + tmp = new_res; + } + } + return (root, tmp); + } + self.free(root); + root = AVL_NULL; + return (root, AVLRes::Balance); + } + } + } + + // Private fn: count the number of items in the tree. Warning: costs O(N) ! + #[cfg(test)] + fn count_wrk(&self, n: u32) -> usize { + if n == AVL_NULL { + return 0; + } + 1 + self.count_wrk(self.pool[n as usize].left) + self.count_wrk(self.pool[n as usize].right) + } + + // Private fn: find the max depth of the tree. Warning: costs O(N) ! + #[cfg(test)] + fn depth_wrk(&self, n: u32) -> usize { + if n == AVL_NULL { + return 0; + } + let d_left = self.depth_wrk(self.pool[n as usize].left); + let d_right = self.depth_wrk(self.pool[n as usize].right); + 1 + if d_left > d_right { d_left } else { d_right } + } +} + +// Machinery for iterating over the tree, enumerating nodes in ascending order. +// Unfortunately AVLTreeIter has to be public. +pub struct AVLTreeIter<'t, 's, T> { + tree: &'t AVLTree<T>, + stack: &'s mut Vec<u32>, +} + +impl<'t, 's, T> AVLTreeIter<'t, 's, T> { + #[allow(dead_code)] + fn new(tree: &'t AVLTree<T>, stack: &'s mut Vec<u32>) -> Self { + let mut iter = AVLTreeIter { tree, stack }; + if tree.root != AVL_NULL { + iter.stack.push(tree.root); + iter.visit_left_children(tree.root); + } + iter + } + + fn visit_left_children(&mut self, root: u32) { + let mut cur = root; + loop { + let left = self.tree.pool[cur as usize].left; + if left == AVL_NULL { + break; + } + self.stack.push(left); + cur = left; + } + } +} + +impl<'s, 't, T: Copy> Iterator for AVLTreeIter<'s, 't, T> { + type Item = T; + fn next(&mut self) -> Option<Self::Item> { + let ret = match self.stack.pop() { + Some(ret) => ret, + None => return None, + }; + let right = self.tree.pool[ret as usize].right; + if right != AVL_NULL { + self.stack.push(right); + self.visit_left_children(right); + } + Some(self.tree.pool[ret as usize].item) + } +} + +//============================================================================= +// Public interface for AVLTree + +impl<T: Clone + PartialOrd> AVLTree<T> { + // The core functions (insert, delete, contains) take a comparator argument + // + // mb_cmp: Option<&F> + // where + // F: Fn(T, T) -> Option<Ordering> + // + // which allows control over how node comparison is done. If this is None, + // then comparison is done directly using PartialOrd for the T values. + // + // If this is Some(cmp), then comparison is done by passing the two T values + // to `cmp`. In this case, the routines will complain (panic) if `cmp` + // indicates that its arguments are unordered. + + // Insert a value in the tree. Returns true if an insert happened, false if + // the item was already present. + pub fn insert<F>(&mut self, item: T, mb_cmp: Option<&F>) -> bool + where + F: Fn(T, T) -> Option<Ordering>, + { + let new_root = self.insert_wrk(self.root, item, mb_cmp); + if new_root == AVL_NULL { + return false; // already in tree + } else { + self.root = new_root; + return true; + } + } + + // Remove an item from the tree. Returns a bool which indicates whether the + // value was in there in the first place. (meaning, true == a removal + // actually occurred). 
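// --- Editorial sketch, not part of the vendored source ---
// Minimal use of the public interface documented above, assuming AVLTree is
// in scope: first with the default PartialOrd-based ordering (a None
// comparator, pinned down with a turbofish as in the tests below), then with
// a custom comparator that reverses the order. Values and names are invented
// for illustration only.
fn avl_usage_sketch() {
    use std::cmp::Ordering;

    let mut tree = AVLTree::<u32>::new(0);
    assert!(tree.insert::<fn(u32, u32) -> Option<Ordering>>(10, None));
    assert!(tree.insert::<fn(u32, u32) -> Option<Ordering>>(5, None));
    // A duplicate insert reports false and leaves the tree unchanged.
    assert!(!tree.insert::<fn(u32, u32) -> Option<Ordering>>(10, None));
    assert_eq!(tree.to_vec(), vec![5, 10]);

    // With a reversing comparator, larger items sort first.
    let rev = |a: u32, b: u32| b.partial_cmp(&a);
    let mut rtree = AVLTree::<u32>::new(0);
    for n in &[3u32, 1, 2] {
        rtree.insert(*n, Some(&rev));
    }
    assert_eq!(rtree.to_vec(), vec![3, 2, 1]);
    assert!(rtree.delete(2, Some(&rev)));
    assert_eq!(rtree.to_vec(), vec![3, 1]);
}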
+ pub fn delete<F>(&mut self, item: T, mb_cmp: Option<&F>) -> bool + where + F: Fn(T, T) -> Option<Ordering>, + { + let (new_root, res) = self.delete_wrk(self.root, item, mb_cmp); + if res == AVLRes::Error { + return false; + } else { + self.root = new_root; + return true; + } + } + + // Find `item` in the tree, and replace it with `replacement`. `item` and `replacement` + // must compare equal per the comparison function `cmp`. Returns a bool indicating whether + // `item` was found (and hence, replaced). There's no comparison fast-path here + // (meaning, `cmp` is `&F` and not `Option<&F>`) only because so far there is no use case + // for it. + pub fn find_and_replace<F>(&mut self, item: T, replacement: T, cmp: &F) -> bool + where + F: Fn(T, T) -> Option<Ordering>, + { + let mut n = self.root; + loop { + if n == AVL_NULL { + return false; + } + let cmp_arg_left: T = item.clone(); + let cmp_arg_right: T = self.pool[n as usize].item.clone(); + match cmp(cmp_arg_left, cmp_arg_right) { + Some(Ordering::Less) => { + n = self.pool[n as usize].left; + } + Some(Ordering::Greater) => { + n = self.pool[n as usize].right; + } + Some(Ordering::Equal) => { + // Do what we can to ensure the caller can't mess up the total ordering in + // the tree. This is more restrictive than it needs to be, but loosening + // it requires finding the largest item below `item` and the smallest one + // above it, which is expensive. + assert!(cmp(item, replacement.clone()) == Some(Ordering::Equal)); + self.pool[n as usize].item = replacement.clone(); + return true; + } + None => { + panic!("AVLTree::find_and_replace: unordered elements in search!"); + } + } + } + } + + // Determine whether an item is in the tree. + // sewardj 2020Mar31: this is not used; I assume all users of the trees + // do their own custom traversals. Remove #[cfg(test)] if any real uses + // appear. + #[cfg(test)] + pub fn contains<F>(&self, item: T, mb_cmp: Option<&F>) -> bool + where + F: Fn(T, T) -> Option<Ordering>, + { + let mut n = self.root; + // Lookup needs to be really fast, so have two versions of the loop, one + // for direct comparison, one for indirect. + match mb_cmp { + None => { + // Do comparisons directly on the items. + loop { + if n == AVL_NULL { + return false; + } + let cmp_arg_left: T = item.clone(); + let cmp_arg_right: T = self.pool[n as usize].item.clone(); + match cmp_arg_left.partial_cmp(&cmp_arg_right) { + Some(Ordering::Less) => { + n = self.pool[n as usize].left; + } + Some(Ordering::Greater) => { + n = self.pool[n as usize].right; + } + Some(Ordering::Equal) => { + return true; + } + None => { + panic!("AVLTree::contains(1): unordered elements in search!"); + } + } + } + } + Some(cmp) => { + // Do comparisons by handing off to the supplied function. + loop { + if n == AVL_NULL { + return false; + } + let cmp_arg_left: T = item.clone(); + let cmp_arg_right: T = self.pool[n as usize].item.clone(); + match cmp(cmp_arg_left, cmp_arg_right) { + Some(Ordering::Less) => { + n = self.pool[n as usize].left; + } + Some(Ordering::Greater) => { + n = self.pool[n as usize].right; + } + Some(Ordering::Equal) => { + return true; + } + None => { + panic!("AVLTree::contains(2): unordered elements in search!"); + } + } + } + } + } + } + + // Count the number of items in the tree. Warning: costs O(N) ! + #[cfg(test)] + fn count(&self) -> usize { + self.count_wrk(self.root) + } + + // Private fn: find the max depth of the tree. Warning: costs O(N) ! 
+ #[cfg(test)] + fn depth(&self) -> usize { + self.depth_wrk(self.root) + } + + pub fn to_vec(&self) -> Vec<T> { + // BEGIN helper fn + fn walk<U: Clone>(res: &mut Vec<U>, root: u32, pool: &Vec<AVLNode<U>>) { + let root_left = pool[root as usize].left; + if root_left != AVL_NULL { + walk(res, root_left, pool); + } + res.push(pool[root as usize].item.clone()); + let root_right = pool[root as usize].right; + if root_right != AVL_NULL { + walk(res, root_right, pool); + } + } + // END helper fn + + let mut res = Vec::<T>::new(); + if self.root != AVL_NULL { + walk(&mut res, self.root, &self.pool); + } + res + } + + #[allow(dead_code)] + pub fn iter<'t, 's>(&'t self, storage: &'s mut Vec<u32>) -> AVLTreeIter<'t, 's, T> { + storage.clear(); + AVLTreeIter::new(self, storage) + } + + // Show the tree. (For debugging only.) + //pub fn show(&self, depth: i32, node: u32) { + // if node != AVL_NULL { + // self.show(depth + 1, self.pool[node as usize].left); + // for _ in 0..depth { + // print!(" "); + // } + // println!("{}", ToFromU32::to_u32(self.pool[node as usize].item)); + // self.show(depth + 1, self.pool[node as usize].right); + // } + //} +} + +//============================================================================= +// Testing machinery for AVLTree + +#[cfg(test)] +mod avl_tree_test_utils { + use crate::data_structures::Set; + use std::cmp::Ordering; + + // Perform various checks on the tree, and assert if it is not OK. + pub fn check_tree( + tree: &super::AVLTree<u32>, + should_be_in_tree: &Set<u32>, + univ_min: u32, + univ_max: u32, + ) { + // Same cardinality + let n_in_set = should_be_in_tree.card(); + let n_in_tree = tree.count(); + assert!(n_in_set == n_in_tree); + + // Tree is not wildly out of balance. Depth should not exceed 1.44 * + // log2(size). + let tree_depth = tree.depth(); + let mut log2_size = 0; + { + let mut n: usize = n_in_tree; + while n > 0 { + n = n >> 1; + log2_size += 1; + } + } + // Actually a tighter limit than stated above. For these test cases, the + // tree is either perfectly balanced or within one level of being so + // (hence the +1). + assert!(tree_depth <= log2_size + 1); + + // Check that everything that should be in the tree is in it, and vice + // versa. + for i in univ_min..univ_max { + let should_be_in = should_be_in_tree.contains(i); + + // Look it up with a null comparator (so `contains` compares + // directly) + let is_in = tree.contains::<fn(u32, u32) -> Option<Ordering>>(i, None); + assert!(is_in == should_be_in); + + // We should get the same result with a custom comparator that does the + // same as the null comparator. + let is_in_w_cmp = tree.contains( + i, + Some(&(|x_left: u32, x_right: u32| x_left.partial_cmp(&x_right))), + ); + assert!(is_in_w_cmp == is_in); + + // And even when the comparator is actually a closure + let forty_two: u32 = 52; + let is_in_w_cmp_closure = tree.contains( + i, + Some( + &(|x_left: u32, x_right: u32| { + (x_left + forty_two).partial_cmp(&(x_right + forty_two)) + }), + ), + ); + assert!(is_in_w_cmp_closure == is_in); + } + + // We could even test that the tree items are in-order, but it hardly + // seems worth the hassle, since the previous test would surely have + // failed if that wasn't the case. + } +} + +#[test] +fn test_avl_tree1() { + use crate::data_structures::Set; + + // Perform tests on an AVL tree. Use as values, every third number between + // 5000 and 5999 inclusive. 
This is to ensure that there's no confusion + // between element values and internal tree indices (although I think the + // typechecker guarantees that anyway). + // + // Also carry along a Set<u32>, which keeps track of which values should be + // in the tree at the current point. + const UNIV_MIN: u32 = 5000; + const UNIV_MAX: u32 = 5999; + const UNIV_SIZE: u32 = UNIV_MAX - UNIV_MIN + 1; + + let mut tree = AVLTree::<u32>::new(0); + let mut should_be_in_tree = Set::<u32>::empty(); + + // Add numbers to the tree, checking as we go. + for i in UNIV_MIN..UNIV_MAX { + // Idiotic but simple + if i % 3 != 0 { + continue; + } + let was_added = tree.insert::<fn(u32, u32) -> Option<Ordering>>(i, None); + should_be_in_tree.insert(i); + assert!(was_added == true); + avl_tree_test_utils::check_tree(&tree, &should_be_in_tree, UNIV_MIN, UNIV_MAX); + } + + // Then remove the middle half of the tree, also checking. + for i in UNIV_MIN + UNIV_SIZE / 4..UNIV_MIN + 3 * (UNIV_SIZE / 4) { + // Note that here, we're asking to delete a bunch of numbers that aren't + // in the tree. It should remain valid throughout. + let was_removed = tree.delete::<fn(u32, u32) -> Option<Ordering>>(i, None); + let should_have_been_removed = should_be_in_tree.contains(i); + assert!(was_removed == should_have_been_removed); + should_be_in_tree.delete(i); + avl_tree_test_utils::check_tree(&tree, &should_be_in_tree, UNIV_MIN, UNIV_MAX); + } + + // Now add some numbers which are already in the tree. + for i in UNIV_MIN..UNIV_MIN + UNIV_SIZE / 4 { + if i % 3 != 0 { + continue; + } + let was_added = tree.insert::<fn(u32, u32) -> Option<Ordering>>(i, None); + let should_have_been_added = !should_be_in_tree.contains(i); + assert!(was_added == should_have_been_added); + should_be_in_tree.insert(i); + avl_tree_test_utils::check_tree(&tree, &should_be_in_tree, UNIV_MIN, UNIV_MAX); + } + + // Then remove all numbers from the tree, in reverse order. + for ir in UNIV_MIN..UNIV_MAX { + let i = UNIV_MIN + (UNIV_MAX - ir); + let was_removed = tree.delete::<fn(u32, u32) -> Option<Ordering>>(i, None); + let should_have_been_removed = should_be_in_tree.contains(i); + assert!(was_removed == should_have_been_removed); + should_be_in_tree.delete(i); + avl_tree_test_utils::check_tree(&tree, &should_be_in_tree, UNIV_MIN, UNIV_MAX); + } + + // Now the tree should be empty. + assert!(should_be_in_tree.is_empty()); + assert!(tree.count() == 0); + + // Now delete some more stuff. Tree should still be empty :-) + for i in UNIV_MIN + 10..UNIV_MIN + 100 { + assert!(should_be_in_tree.is_empty()); + assert!(tree.count() == 0); + tree.delete::<fn(u32, u32) -> Option<Ordering>>(i, None); + avl_tree_test_utils::check_tree(&tree, &should_be_in_tree, UNIV_MIN, UNIV_MAX); + } + + // The tree root should be NULL. + assert!(tree.root == AVL_NULL); + assert!(tree.freelist != AVL_NULL); + + // Check the freelist: all entries are of the expected form. + for e in &tree.pool { + assert!(e.tag == AVLTag::Free); + assert!(e.left == AVL_NULL || (e.left as usize) < tree.pool.len()); + assert!(e.right == AVL_NULL); + assert!(e.item == 0); + } + + // Check the freelist: it's non-circular, and contains the expected number + // of elements. 
+ let mut n_in_freelist = 0; + let mut cursor: u32 = tree.freelist; + while cursor != AVL_NULL { + assert!((cursor as usize) < tree.pool.len()); + n_in_freelist += 1; + assert!(n_in_freelist < 100000 /*arbitrary*/); // else it has a cycle + cursor = tree.pool[cursor as usize].left; + } + // If we get here, the freelist at least doesn't have a cycle. + + // All elements in the pool are on the freelist. + assert!(n_in_freelist == tree.pool.len()); +} + +#[test] +fn test_avl_tree2() { + use std::cmp::Ordering; + + // Do some simple testing using a custom comparator, which inverts the order + // of items in the tree, so as to check custom comparators work right. + let mut tree = AVLTree::<u32>::new(0); + + let nums = [31, 41, 59, 27, 14, 35, 62, 25, 18, 28, 45, 90, 61]; + + fn reverse_cmp(x: u32, y: u32) -> Option<Ordering> { + y.partial_cmp(&x) + } + + // Insert + for n in &nums { + let insert_happened = tree.insert(*n, Some(&reverse_cmp)); + assert!(insert_happened == true); + } + + // Check membership + for n in 0..100 { + let is_in = tree.contains(n, Some(&reverse_cmp)); + let should_be_in = nums.iter().any(|m| n == *m); + assert!(is_in == should_be_in); + } + + // Delete + for n in 0..100 { + let remove_happened = tree.delete(n, Some(&reverse_cmp)); + let remove_should_have_happened = nums.iter().any(|m| n == *m); + assert!(remove_happened == remove_should_have_happened); + } + + // Final check + assert!(tree.root == AVL_NULL); + assert!(tree.count() == 0); +} + +#[test] +fn test_avl_tree_iter() { + let mut storage = Vec::new(); + let tree = AVLTree::<u32>::new(0); + assert!(tree.iter(&mut storage).next().is_none()); + + const FROM: u32 = 0; + const TO: u32 = 10000; + + let mut tree = AVLTree::<u32>::new(0); + for i in FROM..TO { + tree.insert(i, Some(&|a: u32, b: u32| a.partial_cmp(&b))); + } + + let as_vec = tree.to_vec(); + for (i, val) in tree.iter(&mut storage).enumerate() { + assert_eq!(as_vec[i], val, "not equal for i={}", i); + } +} diff --git a/third_party/rust/regalloc/src/bt_coalescing_analysis.rs b/third_party/rust/regalloc/src/bt_coalescing_analysis.rs new file mode 100644 index 0000000000..0b81de70dc --- /dev/null +++ b/third_party/rust/regalloc/src/bt_coalescing_analysis.rs @@ -0,0 +1,672 @@ +//! Analysis in support of copy coalescing for the backtracking allocator. +//! +//! This detects and collects information about all copy coalescing +//! opportunities in the incoming function. It does not use that information +//! at all -- that is for the main allocation loop and the spill slot allocator +//! to do. +//! +//! Coalescing analysis creates 4 pieces of information: +//! +//! * a map from `VirtualRangeIx` to a set of `Hint`s (see below) which state a +//! preference for which register that range would prefer to be allocated to. +//! +//! * equivalence class groupings for the virtual ranges. Two virtual ranges +//! will be assigned the same equivalence class if there is a move instruction +//! that transfers a value from one range to the other. The equivalence +//! classes created are the transitive closure of this pairwise relation. +//! +//! * a simple mapping from instruction index to bool, indicating those +//! instructions that are moves between virtual registers, and that have been +//! used to construct the equivalence classes above. +//! +//! * a mapping from virtual registers to virtual ranges. This is really +//! produced as a side-effect of computing the above three elements, but is +//! useful in its own right and so is also returned. 
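// --- Editorial sketch, not part of the vendored source ---
// The "equivalence class" grouping described in the module comment above is
// the transitive closure of the "connected by a move" relation. A tiny
// standalone union-find over plain u32 indices shows the idea; the crate's
// own UnionFind / UnionFindEquivClasses types are assumed to behave
// analogously (this sketch is not their implementation).
fn uf_find(parent: &mut Vec<u32>, x: u32) -> u32 {
    let p = parent[x as usize];
    if p == x {
        x
    } else {
        let root = uf_find(parent, p);
        parent[x as usize] = root; // path compression
        root
    }
}

fn uf_union(parent: &mut Vec<u32>, a: u32, b: u32) {
    let ra = uf_find(parent, a);
    let rb = uf_find(parent, b);
    if ra != rb {
        parent[ra as usize] = rb;
    }
}

fn equiv_class_sketch() {
    // Three virtual ranges v0, v1, v2 and two moves: v1 <- v0 and v2 <- v1.
    let mut parent: Vec<u32> = (0..3).collect();
    uf_union(&mut parent, 0, 1);
    uf_union(&mut parent, 1, 2);
    // All three now share one representative: a single class {v0, v1, v2}.
    assert_eq!(uf_find(&mut parent, 0), uf_find(&mut parent, 2));
}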
+ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +use log::{debug, info, log_enabled, Level}; +use smallvec::{smallvec, SmallVec}; + +use crate::data_structures::{ + InstIx, InstPoint, Map, MoveInfo, MoveInfoElem, RangeFrag, RangeFragIx, RealRange, RealRangeIx, + RealReg, RealRegUniverse, RegToRangesMaps, SpillCost, TypedIxVec, VirtualRange, VirtualRangeIx, + VirtualReg, +}; +use crate::union_find::{ToFromU32, UnionFind, UnionFindEquivClasses}; +use crate::Function; + +//============================================================================= +// +// *** Some important comments about the interaction between this coalescing +// *** analysis, the main allocation loop and the spill slot allocator. +// +// The main allocation loop tries to assign the same register to all the +// VirtualRanges in an equivalence class. Similarly, the spill slot allocator +// tries to allocate the same spill slot to all the VirtualRanges in an +// equivalence class. In most cases they are successful, and so the moves +// between those VirtualRanges will later disappear. However, the complete +// story is not quite so simple. +// +// It is only safe to assign the VirtualRanges in the same equivalence class +// to a single register or spill slot if those VirtualRanges are +// non-overlapping. That is, if their overall collection of RangeFrags is +// disjoint. If two such VirtualRanges overlapped, then they could be +// carrying different values, and so they would need separate registers or +// spill slots. +// +// Most of the time, these equivalence classes are indeed internally +// non-overlapping. But that's just luck -- that's how the input VCode mostly +// is. The coalescing analysis *doesn't* properly check for overlaps within an +// equivalence class, so it can be the case that the members of an equivalence +// class overlap. The users of that information -- the main allocation loop +// and the spill slot allocator -- currently check for, and handle, such +// situations. So the generated allocation is correct. +// +// It does, however, cause imprecision and unnecessary spilling, and, in the +// main allocation loop, slightly increased evictions. +// +// The "proper" fix for all this would be to fix the coalescing analysis so as +// only to build non-internally-overlapping VirtualRange equivalence classes. +// However, that sounds expensive. Instead there is a half-hearted effort +// made to avoid creating equivalence classes whose elements (VirtualRanges) +// overlap. This is done by doing an overlap check on two VirtualRanges +// connected by a move, and not merging their equivalence classes if they +// overlap. That helps, but it doesn't completely avoid the problem because +// there might be overlaps between other members (VirtualRanges) of the +// about-to-be-merged equivalence classes. + +//============================================================================= +// Coalescing analysis: Hints +// +// A coalescing hint for a virtual live range. The u32 is an arbitrary +// "weight" value which indicates a relative strength-of-preference for the +// hint. It exists because a VLR can have arbitrarily many copy +// instructions at its "boundary", and hence arbitrarily many hints. Of +// course the allocator core can honour at most one of them, so it needs a +// way to choose between them. In this implementation, the u32s are simply +// the estimated execution count of the associated copy instruction. 
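// --- Editorial sketch, not part of the vendored source ---
// With the weight being an estimated execution count, choosing among several
// hints amounts to ranking them by weight, strongest first; the allocator
// core then honours at most the front one. The (name, weight) pairs below
// are invented stand-ins, used only to keep the sketch self-contained.
fn hint_ranking_sketch() {
    let mut hints = vec![("Exactly r8", 3u32), ("SameAs v17", 250), ("Exactly r3", 40)];
    hints.sort_by(|a, b| b.1.cmp(&a.1)); // descending weight
    assert_eq!(hints[0].0, "SameAs v17");
}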
+#[derive(Clone)] +pub enum Hint { + // I would like to have the same real register as some other virtual range. + SameAs(VirtualRangeIx, u32), + // I would like to have exactly this real register. + Exactly(RealReg, u32), +} +fn show_hint(h: &Hint, univ: &RealRegUniverse) -> String { + match h { + Hint::SameAs(vlrix, weight) => format!("(SameAs {:?}, weight={})", vlrix, weight), + Hint::Exactly(rreg, weight) => format!( + "(Exactly {}, weight={})", + rreg.to_reg().show_with_rru(&univ), + weight + ), + } +} +impl Hint { + #[inline(always)] + fn get_weight(&self) -> u32 { + match self { + Hint::SameAs(_vlrix, weight) => *weight, + Hint::Exactly(_rreg, weight) => *weight, + } + } +} + +// We need this in order to construct a UnionFind<VirtualRangeIx>. +impl ToFromU32 for VirtualRangeIx { + fn to_u32(x: VirtualRangeIx) -> u32 { + x.get() + } + fn from_u32(x: u32) -> VirtualRangeIx { + VirtualRangeIx::new(x) + } +} + +//============================================================================= +// Coalescing analysis: top level function + +// This performs coalescing analysis and returns info as a 3-tuple. Note that +// it also may change the spill costs for some of the VLRs in `vlr_env` to +// better reflect the spill cost situation in the presence of coalescing. +#[inline(never)] +pub fn do_coalescing_analysis<F: Function>( + func: &F, + univ: &RealRegUniverse, + rlr_env: &TypedIxVec<RealRangeIx, RealRange>, + vlr_env: &mut TypedIxVec<VirtualRangeIx, VirtualRange>, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + reg_to_ranges_maps: &RegToRangesMaps, + move_info: &MoveInfo, +) -> ( + TypedIxVec<VirtualRangeIx, SmallVec<[Hint; 8]>>, + UnionFindEquivClasses<VirtualRangeIx>, + TypedIxVec<InstIx, bool>, +) { + info!(""); + info!("do_coalescing_analysis: begin"); + + // This function contains significant additional complexity due to the requirement to handle + // pathological cases in reasonable time without unduly burdening the common cases. + // + // ======================================================================================== + // + // The core questions that the coalescing analysis asks is: + // + // For an instruction I and a reg V: + // + // * does I start a live range fragment for V? In other words, is it a "first def of V" ? + // + // * and dually: does I end a live range fragment for V? IOW, is it a "last use of V" ? + // + // V may be a real or virtual register -- we must handle both. I is invariably a move + // instruction. We could ask such questions about other kinds of insns, but we don't care + // about those. + // + // The reason we care about this is as follows. If we can find some move insn I, which is + // the last use of some reg V1 and the first def of some other reg V2, then V1 and V2 can at + // least in principle be allocated to the same real register. + // + // Note that the "last" and "first" aspect is critical for correctness. Consider this: + // + // V1 = ... + // I V2 = V1 + // * V2 = V2 - 99 + // # V3 = V1 + 47 + // + // Here, I might be a first def of V2, but it's certainly not the last use of V1, and so if + // we allocate V1 and V2 to the same real register, the insn marked * will trash the value + // of V1 while it's still needed at #, and we'll create wrong code. For the same reason, we + // can only coalesce out a move if the destination is a first def. + // + // The use of names V* in the above example is slightly misleading. As mentioned, the above + // criteria apply to both real and virtual registers. 
The only limitation is that, + // obviously, we can't coalesce out a move if both registers involved are real. But if only + // one is real then we have at least the possibility to do that. + // + // Now to the question of compiler performance. The simplest way to establish whether (for + // example) I is a first def of V is to visit all of V's `RangeFrag`s, to see if any of them + // start at `I.d`. That can be done by iterating over all of the live ranges that belong to + // V, and through each `RangeFrag` in each live range. Hence it's a linear search through + // V's `RangeFrag`s. + // + // For the vast majority of cases, this works well because most regs -- and especially, most + // virtual regs, in code derived from an SSA precursor -- have short live ranges, and + // usually only one, and so there are very few `RangeFrag`s to visit. However, there are + // cases where a register has many `RangeFrag`s -- we've seen inputs where that number + // exceeds 100,000 -- in which case a linear search is disastrously slow. + // + // To fix this, there is a Plan B scheme for establishing the same facts. It relies on the + // observation that the `RangeFrag`s for each register are mutually non-overlapping. Hence + // their start points are all unique, so we can park them all in a vector, sort it, and + // binary search it. And the same for the end points. This is the purpose of structs + // `ManyFragsInfoR` and `ManyFragsInfoV` below. + // + // Although this plan keeps us out of performance black holes in pathological cases, it is + // expensive in a constant-factors sense: it requires dynamic memory allocation for these + // vectors, and it requires sorting them. Hence we try to avoid it as much as possible, and + // route almost all work via the simple linear-search scheme. + // + // The linear-vs-binary-search choice is made for each register individually. Incoming + // parameter `reg_to_ranges_maps` contains fields `r/vregs_with_many_frags`, and it is only + // for those that sorted vectors are prepared. Those vectors are tracked by the maps + // `r/v_many_map` below. `reg_to_ranges_maps` also contains field `many_frags_thresh` which + // tells us what the size threshold actually was, and this is used to opportunistically + // pre-size the vectors. It's not required for correctness. + // + // All this complexity is bought together in the four closures `doesVRegHaveLastUseAt`, + // `doesVRegHaveFirstDefAt`, `doesRRegHaveLastUseAt` and `doesRRegHaveFirstDefAt`. In each + // case, they first try to resolve the query by binary search, which usually fails, in which + // case they fall back to a linear search, which will always give a correct result. In + // debug builds, if the binary search does produce an answer, it is crosschecked against the + // linear search result. + // + // The duplication in the four closures is undesirable but hard to avoid. The real- and + // virtual-reg cases have different types. Similarly, the first/last cases are slightly + // different. If there were a way to guarantee that rustc would inline closures, then it + // might be worth trying to common them up, on the basis that rustc can inline and + // specialise, leading back to what we currently have here. However, in the absence of such + // a facility, I didn't want to risk it, given that these closures are performance-critical. 
+ // + // Finally, note that the coalescing analysis proper involves more than just the above + // described searches, and one sees the code for the rest of it following the search + // closures below. However, the rest of it isn't performance critical, and is not described + // in this comment. + // + // ======================================================================================== + + // So, first: for the registers which `reg_to_ranges_maps` tells us have "many" fragments, + // prepare the binary-search vectors. This is done first for the real regs and then below + // for virtual regs. + + struct ManyFragsInfoR { + sorted_firsts: Vec<(InstPoint, RealRangeIx)>, + sorted_lasts: Vec<(InstPoint, RealRangeIx)>, + } + let r_many_card = reg_to_ranges_maps.rregs_with_many_frags.len(); + let mut r_many_map = Map::<u32 /*RealReg index*/, ManyFragsInfoR>::default(); + r_many_map.reserve(r_many_card); + + for rreg_no in ®_to_ranges_maps.rregs_with_many_frags { + // `2 * reg_to_ranges_maps.many_frags_thresh` is clearly a heuristic hack, but we do + // know for sure that each vector will contain at least + // `reg_to_ranges_maps.many_frags_thresh` and very likely more. And that threshold is + // already quite high, so pre-sizing the vectors at this point avoids quite a number of + // resize-reallocations later. + let mut many_frags_info = ManyFragsInfoR { + sorted_firsts: Vec::with_capacity(2 * reg_to_ranges_maps.many_frags_thresh), + sorted_lasts: Vec::with_capacity(2 * reg_to_ranges_maps.many_frags_thresh), + }; + let rlrixs = ®_to_ranges_maps.rreg_to_rlrs_map[*rreg_no as usize]; + for rlrix in rlrixs { + for fix in &rlr_env[*rlrix].sorted_frags.frag_ixs { + let frag = &frag_env[*fix]; + many_frags_info.sorted_firsts.push((frag.first, *rlrix)); + many_frags_info.sorted_lasts.push((frag.last, *rlrix)); + } + } + many_frags_info + .sorted_firsts + .sort_unstable_by_key(|&(point, _)| point); + many_frags_info + .sorted_lasts + .sort_unstable_by_key(|&(point, _)| point); + debug_assert!(many_frags_info.sorted_firsts.len() == many_frags_info.sorted_lasts.len()); + // Because the RangeFrags for any reg (virtual or real) are non-overlapping, it follows + // that both the sorted first points and sorted last points contain no duplicates. (In + // fact the implied condition (no duplicates) is weaker than the premise + // (non-overlapping), but this is nevertheless correct.) + for i in 1..(many_frags_info.sorted_firsts.len()) { + debug_assert!( + many_frags_info.sorted_firsts[i - 1].0 < many_frags_info.sorted_firsts[i].0 + ); + } + for i in 1..(many_frags_info.sorted_lasts.len()) { + debug_assert!( + many_frags_info.sorted_lasts[i - 1].0 < many_frags_info.sorted_lasts[i].0 + ); + } + r_many_map.insert(*rreg_no, many_frags_info); + } + + // And the same for virtual regs. 
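// --- Editorial sketch, not part of the vendored source ---
// The "Plan B" fast path described in the long comment above, reduced to its
// essence: because one register's fragments never overlap, their start points
// are unique, so a vector of (start point, range index) pairs sorted by start
// point can be binary-searched; a miss simply means "fall back to the linear
// scan". The u32/usize stand-ins below replace InstPoint and the range index
// types purely for illustration.
fn first_def_at_sketch(sorted_firsts: &[(u32, usize)], point: u32) -> Option<usize> {
    match sorted_firsts.binary_search_by_key(&point, |&(p, _)| p) {
        Ok(ix) => Some(sorted_firsts[ix].1),
        Err(_) => None, // not found here: the caller would use the linear search
    }
}

fn plan_b_sketch() {
    // (start point, owning range index), sorted by start point.
    let sorted_firsts = [(4u32, 0usize), (9, 2), (17, 1)];
    assert_eq!(first_def_at_sketch(&sorted_firsts, 9), Some(2));
    assert_eq!(first_def_at_sketch(&sorted_firsts, 10), None);
}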
+ struct ManyFragsInfoV { + sorted_firsts: Vec<(InstPoint, VirtualRangeIx)>, + sorted_lasts: Vec<(InstPoint, VirtualRangeIx)>, + } + let v_many_card = reg_to_ranges_maps.vregs_with_many_frags.len(); + let mut v_many_map = Map::<u32 /*VirtualReg index*/, ManyFragsInfoV>::default(); + v_many_map.reserve(v_many_card); + + for vreg_no in ®_to_ranges_maps.vregs_with_many_frags { + let mut many_frags_info = ManyFragsInfoV { + sorted_firsts: Vec::with_capacity(2 * reg_to_ranges_maps.many_frags_thresh), + sorted_lasts: Vec::with_capacity(2 * reg_to_ranges_maps.many_frags_thresh), + }; + let vlrixs = ®_to_ranges_maps.vreg_to_vlrs_map[*vreg_no as usize]; + for vlrix in vlrixs { + for frag in &vlr_env[*vlrix].sorted_frags.frags { + many_frags_info.sorted_firsts.push((frag.first, *vlrix)); + many_frags_info.sorted_lasts.push((frag.last, *vlrix)); + } + } + many_frags_info + .sorted_firsts + .sort_unstable_by_key(|&(point, _)| point); + many_frags_info + .sorted_lasts + .sort_unstable_by_key(|&(point, _)| point); + debug_assert!(many_frags_info.sorted_firsts.len() == many_frags_info.sorted_lasts.len()); + for i in 1..(many_frags_info.sorted_firsts.len()) { + debug_assert!( + many_frags_info.sorted_firsts[i - 1].0 < many_frags_info.sorted_firsts[i].0 + ); + } + for i in 1..(many_frags_info.sorted_lasts.len()) { + debug_assert!( + many_frags_info.sorted_lasts[i - 1].0 < many_frags_info.sorted_lasts[i].0 + ); + } + v_many_map.insert(*vreg_no, many_frags_info); + } + + // There now follows the abovementioned four (well, actually, eight) closures, which are + // used to find out whether a real or virtual reg has a last use or first def at some + // instruction. This is the central activity of the coalescing analysis -- finding move + // instructions that are the last def for the src reg and the first def for the dst reg. + + // ---------------- Range checks for VirtualRegs: last use ---------------- + // Checks whether `vreg` has a last use at `iix`.u. + + let doesVRegHaveLastUseAt_LINEAR = |vreg: VirtualReg, iix: InstIx| -> Option<VirtualRangeIx> { + let point_to_find = InstPoint::new_use(iix); + let vreg_no = vreg.get_index(); + let vlrixs = ®_to_ranges_maps.vreg_to_vlrs_map[vreg_no]; + for vlrix in vlrixs { + for frag in &vlr_env[*vlrix].sorted_frags.frags { + if frag.last == point_to_find { + return Some(*vlrix); + } + } + } + None + }; + let doesVRegHaveLastUseAt = |vreg: VirtualReg, iix: InstIx| -> Option<VirtualRangeIx> { + let point_to_find = InstPoint::new_use(iix); + let vreg_no = vreg.get_index(); + let mut binary_search_result = None; + if let Some(ref mfi) = v_many_map.get(&(vreg_no as u32)) { + match mfi + .sorted_lasts + .binary_search_by_key(&point_to_find, |(point, _)| *point) + { + Ok(found_at_ix) => binary_search_result = Some(mfi.sorted_lasts[found_at_ix].1), + Err(_) => {} + } + } + match binary_search_result { + None => doesVRegHaveLastUseAt_LINEAR(vreg, iix), + Some(_) => { + debug_assert!(binary_search_result == doesVRegHaveLastUseAt_LINEAR(vreg, iix)); + binary_search_result + } + } + }; + + // ---------------- Range checks for VirtualRegs: first def ---------------- + // Checks whether `vreg` has a first def at `iix`.d. 
+ + let doesVRegHaveFirstDefAt_LINEAR = |vreg: VirtualReg, iix: InstIx| -> Option<VirtualRangeIx> { + let point_to_find = InstPoint::new_def(iix); + let vreg_no = vreg.get_index(); + let vlrixs = ®_to_ranges_maps.vreg_to_vlrs_map[vreg_no]; + for vlrix in vlrixs { + for frag in &vlr_env[*vlrix].sorted_frags.frags { + if frag.first == point_to_find { + return Some(*vlrix); + } + } + } + None + }; + let doesVRegHaveFirstDefAt = |vreg: VirtualReg, iix: InstIx| -> Option<VirtualRangeIx> { + let point_to_find = InstPoint::new_def(iix); + let vreg_no = vreg.get_index(); + let mut binary_search_result = None; + if let Some(ref mfi) = v_many_map.get(&(vreg_no as u32)) { + match mfi + .sorted_firsts + .binary_search_by_key(&point_to_find, |(point, _)| *point) + { + Ok(found_at_ix) => binary_search_result = Some(mfi.sorted_firsts[found_at_ix].1), + Err(_) => {} + } + } + match binary_search_result { + None => doesVRegHaveFirstDefAt_LINEAR(vreg, iix), + Some(_) => { + debug_assert!(binary_search_result == doesVRegHaveFirstDefAt_LINEAR(vreg, iix)); + binary_search_result + } + } + }; + + // ---------------- Range checks for RealRegs: last use ---------------- + // Checks whether `rreg` has a last use at `iix`.u. + + let doesRRegHaveLastUseAt_LINEAR = |rreg: RealReg, iix: InstIx| -> Option<RealRangeIx> { + let point_to_find = InstPoint::new_use(iix); + let rreg_no = rreg.get_index(); + let rlrixs = ®_to_ranges_maps.rreg_to_rlrs_map[rreg_no]; + for rlrix in rlrixs { + let frags = &rlr_env[*rlrix].sorted_frags; + for fix in &frags.frag_ixs { + let frag = &frag_env[*fix]; + if frag.last == point_to_find { + return Some(*rlrix); + } + } + } + None + }; + let doesRRegHaveLastUseAt = |rreg: RealReg, iix: InstIx| -> Option<RealRangeIx> { + let point_to_find = InstPoint::new_use(iix); + let rreg_no = rreg.get_index(); + let mut binary_search_result = None; + if let Some(ref mfi) = r_many_map.get(&(rreg_no as u32)) { + match mfi + .sorted_lasts + .binary_search_by_key(&point_to_find, |(point, _)| *point) + { + Ok(found_at_ix) => binary_search_result = Some(mfi.sorted_lasts[found_at_ix].1), + Err(_) => {} + } + } + match binary_search_result { + None => doesRRegHaveLastUseAt_LINEAR(rreg, iix), + Some(_) => { + debug_assert!(binary_search_result == doesRRegHaveLastUseAt_LINEAR(rreg, iix)); + binary_search_result + } + } + }; + + // ---------------- Range checks for RealRegs: first def ---------------- + // Checks whether `rreg` has a first def at `iix`.d. 
+ + let doesRRegHaveFirstDefAt_LINEAR = |rreg: RealReg, iix: InstIx| -> Option<RealRangeIx> { + let point_to_find = InstPoint::new_def(iix); + let rreg_no = rreg.get_index(); + let rlrixs = ®_to_ranges_maps.rreg_to_rlrs_map[rreg_no]; + for rlrix in rlrixs { + let frags = &rlr_env[*rlrix].sorted_frags; + for fix in &frags.frag_ixs { + let frag = &frag_env[*fix]; + if frag.first == point_to_find { + return Some(*rlrix); + } + } + } + None + }; + let doesRRegHaveFirstDefAt = |rreg: RealReg, iix: InstIx| -> Option<RealRangeIx> { + let point_to_find = InstPoint::new_def(iix); + let rreg_no = rreg.get_index(); + let mut binary_search_result = None; + if let Some(ref mfi) = r_many_map.get(&(rreg_no as u32)) { + match mfi + .sorted_firsts + .binary_search_by_key(&point_to_find, |(point, _)| *point) + { + Ok(found_at_ix) => binary_search_result = Some(mfi.sorted_firsts[found_at_ix].1), + Err(_) => {} + } + } + match binary_search_result { + None => doesRRegHaveFirstDefAt_LINEAR(rreg, iix), + Some(_) => { + debug_assert!(binary_search_result == doesRRegHaveFirstDefAt_LINEAR(rreg, iix)); + binary_search_result + } + } + }; + + // Finally we come to the core logic of the coalescing analysis. It uses the complex + // hybrid-search mechanism described extensively above. The comments above however don't + // describe any of the logic after this point. + + // RETURNED TO CALLER + // Hints for each VirtualRange. Note that the SmallVecs could contain duplicates, I + // suppose, for example if there are two identical copy insns at different points on the + // "boundary" for some VLR. I don't think it matters though since we're going to rank the + // hints by strength and then choose at most one. + let mut hints = TypedIxVec::<VirtualRangeIx, SmallVec<[Hint; 8]>>::new(); + hints.resize(vlr_env.len(), smallvec![]); + + // RETURNED TO CALLER + // A vector that simply records which insns are v-to-v boundary moves, as established by the + // analysis below. This info is collected here because (1) the caller (BT) needs to have it + // and (2) this is the first point at which we can efficiently compute it. + let mut is_vv_boundary_move = TypedIxVec::<InstIx, bool>::new(); + is_vv_boundary_move.resize(func.insns().len() as u32, false); + + // RETURNED TO CALLER (after finalisation) + // The virtual-to-virtual equivalence classes we're collecting. + let mut vlrEquivClassesUF = UnionFind::<VirtualRangeIx>::new(vlr_env.len() as usize); + + // Not returned to caller; for use only in this function. + // A list of `VirtualRange`s for which the `total_cost` (hence also their + // `spill_cost`) should be adjusted downwards by the supplied `u32`. We + // can't do this directly in the loop below due to borrowing constraints, + // hence we collect the required info in this vector and do it in a second + // loop. + let mut decVLRcosts = Vec::<(VirtualRangeIx, VirtualRangeIx, u32)>::new(); + + for MoveInfoElem { + dst, + src, + iix, + est_freq, + .. + } in &move_info.moves + { + debug!( + "connected by moves: {:?} {:?} <- {:?} (est_freq {})", + iix, dst, src, est_freq + ); + match (dst.is_virtual(), src.is_virtual()) { + (true, true) => { + // Check for a V <- V hint. 
+ let srcV = src.to_virtual_reg(); + let dstV = dst.to_virtual_reg(); + let mb_vlrixSrc = doesVRegHaveLastUseAt(srcV, *iix); + let mb_vlrixDst = doesVRegHaveFirstDefAt(dstV, *iix); + if mb_vlrixSrc.is_some() && mb_vlrixDst.is_some() { + let vlrixSrc = mb_vlrixSrc.unwrap(); + let vlrixDst = mb_vlrixDst.unwrap(); + // Per block comment at top of file, make a half-hearted + // attempt to avoid creating equivalence classes with + // internal overlaps. Note this can't be completely + // effective as presently implemented. + if !vlr_env[vlrixSrc].overlaps(&vlr_env[vlrixDst]) { + // Add hints for both VLRs, since we don't know which one will + // assign first. Indeed, a VLR may be assigned and un-assigned + // arbitrarily many times. + hints[vlrixSrc].push(Hint::SameAs(vlrixDst, *est_freq)); + hints[vlrixDst].push(Hint::SameAs(vlrixSrc, *est_freq)); + vlrEquivClassesUF.union(vlrixDst, vlrixSrc); + is_vv_boundary_move[*iix] = true; + // Reduce the total cost, and hence the spill cost, of + // both `vlrixSrc` and `vlrixDst`. This is so as to reduce to + // zero, the cost of a VLR whose only instructions are its + // v-v boundary copies. + debug!("reduce cost of {:?} and {:?}", vlrixSrc, vlrixDst); + decVLRcosts.push((vlrixSrc, vlrixDst, 1 * est_freq)); + } + } + } + (true, false) => { + // Check for a V <- R hint. + let srcR = src.to_real_reg(); + let dstV = dst.to_virtual_reg(); + let mb_rlrSrc = doesRRegHaveLastUseAt(srcR, *iix); + let mb_vlrDst = doesVRegHaveFirstDefAt(dstV, *iix); + if mb_rlrSrc.is_some() && mb_vlrDst.is_some() { + let vlrDst = mb_vlrDst.unwrap(); + hints[vlrDst].push(Hint::Exactly(srcR, *est_freq)); + } + } + (false, true) => { + // Check for a R <- V hint. + let srcV = src.to_virtual_reg(); + let dstR = dst.to_real_reg(); + let mb_vlrSrc = doesVRegHaveLastUseAt(srcV, *iix); + let mb_rlrDst = doesRRegHaveFirstDefAt(dstR, *iix); + if mb_vlrSrc.is_some() && mb_rlrDst.is_some() { + let vlrSrc = mb_vlrSrc.unwrap(); + hints[vlrSrc].push(Hint::Exactly(dstR, *est_freq)); + } + } + (false, false) => { + // This is a real-to-real move. There's nothing we can do. Ignore it. + } + } + } + + // Now decrease the `total_cost` and `spill_cost` fields of selected + // `VirtualRange`s, as detected by the previous loop. Don't decrease the + // `spill_cost` literally to zero; doing that causes various assertion + // failures and boundary problems later on, in the `CommitmentMap`s. In + // such a case, make the `spill_cost` be tiny but nonzero. + fn decrease_vlr_total_cost_by(vlr: &mut VirtualRange, decrease_total_cost_by: u32) { + // Adjust `total_cost`. + if vlr.total_cost < decrease_total_cost_by { + vlr.total_cost = 0; + } else { + vlr.total_cost -= decrease_total_cost_by; + } + // And recompute `spill_cost` accordingly. + if vlr.total_cost == 0 { + vlr.spill_cost = SpillCost::finite(1.0e-6); + } else { + assert!(vlr.size > 0); + vlr.spill_cost = SpillCost::finite(vlr.total_cost as f32 / vlr.size as f32); + } + } + + for (vlrix1, vlrix2, decrease_total_cost_by) in decVLRcosts { + decrease_vlr_total_cost_by(&mut vlr_env[vlrix1], decrease_total_cost_by); + decrease_vlr_total_cost_by(&mut vlr_env[vlrix2], decrease_total_cost_by); + } + + // For the convenience of the allocator core, sort the hints for each VLR so + // as to move the most preferred to the front. 
+ for hints_for_one_vlr in hints.iter_mut() { + hints_for_one_vlr.sort_by(|h1, h2| h2.get_weight().partial_cmp(&h1.get_weight()).unwrap()); + } + + let vlrEquivClasses: UnionFindEquivClasses<VirtualRangeIx> = + vlrEquivClassesUF.get_equiv_classes(); + + if log_enabled!(Level::Debug) { + debug!("Revised VLRs:"); + let mut n = 0; + for vlr in vlr_env.iter() { + debug!("{:<4?} {:?}", VirtualRangeIx::new(n), vlr); + n += 1; + } + + debug!("Coalescing hints:"); + n = 0; + for hints_for_one_vlr in hints.iter() { + let mut s = "".to_string(); + for hint in hints_for_one_vlr { + s = s + &show_hint(hint, &univ) + &" ".to_string(); + } + debug!(" hintsfor {:<4?} = {}", VirtualRangeIx::new(n), s); + n += 1; + } + + for n in 0..vlr_env.len() { + let vlrix = VirtualRangeIx::new(n); + let mut tmpvec = vec![]; + for elem in vlrEquivClasses.equiv_class_elems_iter(vlrix) { + tmpvec.reverse(); + tmpvec.push(elem); + } + debug!(" eclassof {:?} = {:?}", vlrix, tmpvec); + } + + for (b, i) in is_vv_boundary_move.iter().zip(0..) { + if *b { + debug!(" vv_boundary_move at {:?}", InstIx::new(i)); + } + } + } + + info!("do_coalescing_analysis: end"); + info!(""); + + (hints, vlrEquivClasses, is_vv_boundary_move) +} diff --git a/third_party/rust/regalloc/src/bt_commitment_map.rs b/third_party/rust/regalloc/src/bt_commitment_map.rs new file mode 100644 index 0000000000..03c989321c --- /dev/null +++ b/third_party/rust/regalloc/src/bt_commitment_map.rs @@ -0,0 +1,170 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! Backtracking allocator: per-real-register commitment maps + +use std::cmp::Ordering; +use std::fmt; + +use crate::avl_tree::{AVLTree, AVL_NULL}; +use crate::data_structures::{ + cmp_range_frags, InstPoint, RangeFrag, RangeFragIx, RangeId, SortedRangeFragIxs, + SortedRangeFrags, TypedIxVec, +}; + +//============================================================================= +// Per-real-register commitment maps +// + +// Something that pairs a fragment index with the identity of the virtual or real range to which +// this fragment conceptually "belongs", at least for the purposes of this commitment map. If +// the `lr_id` field denotes a real range, the associated fragment belongs to a real-reg live +// range and is therefore non-evictable. The identity of the range is necessary because: +// +// * for VirtualRanges, (1) we may need to evict the mapping, so we will need to get hold of the +// VirtualRange, so that we have all fragments of the VirtualRange to hand, and (2) if the +// client requires stackmaps, we need to look at the VirtualRange to see if it is reftyped. +// +// * for RealRanges, only (2) applies; (1) is irrelevant since RealRange assignments are +// non-evictable. +// +// (A fragment merely denotes a sequence of instruction (points), but within the context of a +// commitment map for a real register, obviously any particular fragment can't be part of two +// different virtual live ranges.) +// +// Note that we don't intend to actually use the PartialOrd methods for RangeFragAndRangeId. +// However, they need to exist since we want to construct an AVLTree<RangeFragAndRangeId>, and +// that requires PartialOrd for its element type. For working with such trees we will supply +// our own comparison function; hence PartialOrd here serves only to placate the typechecker. +// It should never actually be used. 
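The comment above describes a pattern that is easy to spell out in isolation: when an ordered container is always driven by an explicitly supplied comparator, the element type's own PartialEq/PartialOrd impls exist only to satisfy trait bounds and may deliberately refuse to run. A minimal sketch of the same idea using std's sort_by in place of the AVL tree; this is an editor's illustration with made-up names, not crate code.

use std::cmp::Ordering;

struct Keyed {
    key: u32,
    payload: u32,
}

// The container is always given an explicit comparator, so these impls exist
// only to satisfy trait bounds and must never actually be called.
impl PartialEq for Keyed {
    fn eq(&self, _other: &Self) -> bool {
        unreachable!("compare via the explicit comparator instead")
    }
}
impl PartialOrd for Keyed {
    fn partial_cmp(&self, _other: &Self) -> Option<Ordering> {
        unreachable!("compare via the explicit comparator instead")
    }
}

fn main() {
    let mut v = vec![
        Keyed { key: 9, payload: 1 },
        Keyed { key: 3, payload: 2 },
    ];
    // Sorting with an explicit comparator never touches the panicking impls.
    v.sort_by(|a, b| a.key.cmp(&b.key));
    assert_eq!(v[0].key, 3);
    assert_eq!(v[1].payload, 1);
}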
+#[derive(Clone)] +pub struct RangeFragAndRangeId { + pub frag: RangeFrag, + pub id: RangeId, +} +impl RangeFragAndRangeId { + fn new(frag: RangeFrag, id: RangeId) -> Self { + Self { frag, id } + } +} +impl PartialEq for RangeFragAndRangeId { + fn eq(&self, _other: &Self) -> bool { + // See comments above. + panic!("impl PartialEq for RangeFragAndRangeId: should never be used"); + } +} +impl PartialOrd for RangeFragAndRangeId { + fn partial_cmp(&self, _other: &Self) -> Option<Ordering> { + // See comments above. + panic!("impl PartialOrd for RangeFragAndRangeId: should never be used"); + } +} +impl fmt::Debug for RangeFragAndRangeId { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "(FnV {:?} {:?})", self.frag, self.id) + } +} + +//============================================================================= +// Per-real-register commitment maps +// + +// This indicates the current set of fragments to which some real register is +// currently "committed". The fragments *must* be non-overlapping. Hence +// they form a total order, and so we may validly build an AVL tree of them. + +pub struct CommitmentMap { + pub tree: AVLTree<RangeFragAndRangeId>, +} +impl fmt::Debug for CommitmentMap { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let as_vec = self.tree.to_vec(); + as_vec.fmt(fmt) + } +} + +impl CommitmentMap { + pub fn new() -> Self { + // The AVL tree constructor needs a default value for the elements. It + // will never be used. The RangeId index value will show as + // obviously bogus if we ever try to "dereference" any part of it. + let dflt = RangeFragAndRangeId::new(RangeFrag::invalid_value(), RangeId::invalid_value()); + Self { + tree: AVLTree::<RangeFragAndRangeId>::new(dflt), + } + } + + pub fn add(&mut self, to_add_frags: &SortedRangeFrags, to_add_lr_id: RangeId) { + for frag in &to_add_frags.frags { + let to_add = RangeFragAndRangeId::new(frag.clone(), to_add_lr_id); + let added = self.tree.insert( + to_add, + Some(&|pair1: RangeFragAndRangeId, pair2: RangeFragAndRangeId| { + cmp_range_frags(&pair1.frag, &pair2.frag) + }), + ); + // If this fails, it means the fragment overlaps one that has already + // been committed to. That's a serious error. + assert!(added); + } + } + + pub fn add_indirect( + &mut self, + to_add_frags: &SortedRangeFragIxs, + to_add_lr_id: RangeId, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + ) { + for fix in &to_add_frags.frag_ixs { + let to_add = RangeFragAndRangeId::new(frag_env[*fix].clone(), to_add_lr_id); + let added = self.tree.insert( + to_add, + Some(&|pair1: RangeFragAndRangeId, pair2: RangeFragAndRangeId| { + cmp_range_frags(&pair1.frag, &pair2.frag) + }), + ); + // If this fails, it means the fragment overlaps one that has already + // been committed to. That's a serious error. + assert!(added); + } + } + + pub fn del(&mut self, to_del_frags: &SortedRangeFrags) { + for frag in &to_del_frags.frags { + // re RangeId::invalid_value(): we don't care what the RangeId is, since we're + // deleting by RangeFrags alone. + let to_del = RangeFragAndRangeId::new(frag.clone(), RangeId::invalid_value()); + let deleted = self.tree.delete( + to_del, + Some(&|pair1: RangeFragAndRangeId, pair2: RangeFragAndRangeId| { + cmp_range_frags(&pair1.frag, &pair2.frag) + }), + ); + // If this fails, it means the fragment wasn't already committed to. + // That's also a serious error. + assert!(deleted); + } + } + + // Find the RangeId for the RangeFrag that overlaps `pt`, if one exists. 
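Because the committed fragments never overlap, a point query only ever has one candidate: the fragment with the greatest start point that is less than or equal to the query point. The lookup that follows walks the AVL tree directly; the same idea, sketched with a std BTreeMap and plain integers, is shown below (editor's illustration with hypothetical names, not crate code).

use std::collections::BTreeMap;

// Disjoint closed intervals keyed by their start point; the value carries the
// interval's end point and a stand-in for the owning range's id.
fn lookup(map: &BTreeMap<u32, (u32, u32)>, pt: u32) -> Option<u32> {
    // The only candidate is the interval with the greatest start <= pt;
    // check whether it actually reaches pt.
    let (_, &(last, id)) = map.range(..=pt).next_back()?;
    if pt <= last {
        Some(id)
    } else {
        None
    }
}

fn main() {
    let mut m = BTreeMap::new();
    m.insert(10u32, (14u32, 0u32)); // [10, 14] owned by id 0
    m.insert(20, (29, 1));          // [20, 29] owned by id 1
    assert_eq!(lookup(&m, 12), Some(0));
    assert_eq!(lookup(&m, 17), None);
    assert_eq!(lookup(&m, 29), Some(1));
}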
+ // This is conceptually equivalent to LogicalSpillSlot::get_refness_at_inst_point. + pub fn lookup_inst_point(&self, pt: InstPoint) -> Option<RangeId> { + let mut root = self.tree.root; + while root != AVL_NULL { + let root_node = &self.tree.pool[root as usize]; + let root_item = &root_node.item; + if pt < root_item.frag.first { + // `pt` is to the left of the `root`. So there's no + // overlap with `root`. Continue by inspecting the left subtree. + root = root_node.left; + } else if root_item.frag.last < pt { + // Ditto for the right subtree. + root = root_node.right; + } else { + // `pt` overlaps the `root`, so we have what we want. + return Some(root_item.id); + } + } + None + } +} diff --git a/third_party/rust/regalloc/src/bt_main.rs b/third_party/rust/regalloc/src/bt_main.rs new file mode 100644 index 0000000000..9c33348667 --- /dev/null +++ b/third_party/rust/regalloc/src/bt_main.rs @@ -0,0 +1,1844 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! Core implementation of the backtracking allocator. + +use log::{debug, info, log_enabled, Level}; +use smallvec::SmallVec; +use std::default; +use std::fmt; + +use crate::analysis_data_flow::{add_raw_reg_vecs_for_insn, does_inst_use_def_or_mod_reg}; +use crate::analysis_main::{run_analysis, AnalysisInfo}; +use crate::avl_tree::{AVLTree, AVL_NULL}; +use crate::bt_coalescing_analysis::{do_coalescing_analysis, Hint}; +use crate::bt_commitment_map::{CommitmentMap, RangeFragAndRangeId}; +use crate::bt_spillslot_allocator::SpillSlotAllocator; +use crate::bt_vlr_priority_queue::VirtualRangePrioQ; +use crate::data_structures::{ + BlockIx, InstIx, InstPoint, Map, Point, RangeFrag, RangeFragIx, RangeId, RealRange, + RealRangeIx, RealReg, RealRegUniverse, Reg, RegClass, RegVecBounds, RegVecs, RegVecsAndBounds, + Set, SortedRangeFrags, SpillCost, SpillSlot, TypedIxVec, VirtualRange, VirtualRangeIx, + VirtualReg, Writable, +}; +use crate::inst_stream::{ + edit_inst_stream, ExtPoint, InstExtPoint, InstToInsert, InstToInsertAndExtPoint, +}; +use crate::sparse_set::SparseSetU; +use crate::union_find::UnionFindEquivClasses; +use crate::{AlgorithmWithDefaults, Function, RegAllocError, RegAllocResult, StackmapRequestInfo}; + +#[derive(Clone)] +pub struct BacktrackingOptions { + /// Should the register allocator generate block annotations? + pub request_block_annotations: bool, +} + +impl default::Default for BacktrackingOptions { + fn default() -> Self { + Self { + request_block_annotations: false, + } + } +} + +impl fmt::Debug for BacktrackingOptions { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "backtracking (block annotations: {})", + self.request_block_annotations + ) + } +} + +//============================================================================= +// The per-real-register state +// +// Relevant methods are expected to be parameterised by the same VirtualRange +// env as used in calls to `VirtualRangePrioQ`. + +struct PerRealReg { + // The current committed fragments for this RealReg. + committed: CommitmentMap, + + // The set of VirtualRanges which have been assigned to this RealReg. The + // union of their frags will be equal to `committed` only if this RealReg + // has no RealRanges. If this RealReg does have RealRanges the + // aforementioned union will be exactly the subset of `committed` not used + // by the RealRanges. 
+ vlrixs_assigned: Set<VirtualRangeIx>, +} +impl PerRealReg { + fn new() -> Self { + Self { + committed: CommitmentMap::new(), + vlrixs_assigned: Set::<VirtualRangeIx>::empty(), + } + } + + #[inline(never)] + fn add_RealRange( + &mut self, + to_add_rlrix: RealRangeIx, + rlr_env: &TypedIxVec<RealRangeIx, RealRange>, + frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, + ) { + // Commit this register to `to_add`, irrevocably. Don't add it to `vlrixs_assigned` + // since we will never want to later evict the assignment. (Also, from a types point of + // view that would be impossible.) + let to_add_rlr = &rlr_env[to_add_rlrix]; + self.committed.add_indirect( + &to_add_rlr.sorted_frags, + RangeId::new_real(to_add_rlrix), + frag_env, + ); + } + + #[inline(never)] + fn add_VirtualRange( + &mut self, + to_add_vlrix: VirtualRangeIx, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + ) { + let to_add_vlr = &vlr_env[to_add_vlrix]; + self.committed + .add(&to_add_vlr.sorted_frags, RangeId::new_virtual(to_add_vlrix)); + assert!(!self.vlrixs_assigned.contains(to_add_vlrix)); + self.vlrixs_assigned.insert(to_add_vlrix); + } + + #[inline(never)] + fn del_VirtualRange( + &mut self, + to_del_vlrix: VirtualRangeIx, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + ) { + // Remove it from `vlrixs_assigned` + // FIXME 2020June18: we could do this more efficiently by inspecting + // the return value from `delete`. + if self.vlrixs_assigned.contains(to_del_vlrix) { + self.vlrixs_assigned.delete(to_del_vlrix); + } else { + panic!("PerRealReg: del_VirtualRange on VR not in vlrixs_assigned"); + } + // Remove it from `committed` + let to_del_vlr = &vlr_env[to_del_vlrix]; + self.committed.del(&to_del_vlr.sorted_frags); + } +} + +// HELPER FUNCTION +// For a given `RangeFrag`, traverse the commitment tree rooted at `root`, +// adding to `running_set` the set of VLRIxs that the frag intersects, and +// adding their spill costs to `running_cost`. Return false if, for one of +// the 4 reasons documented below, the traversal has been abandoned, and true +// if the search completed successfully. +fn search_commitment_tree<IsAllowedToEvict>( + // The running state, threaded through the tree traversal. These + // accumulate ranges and costs as we traverse the tree. These are mutable + // because our caller (`find_evict_set`) will want to try and allocate + // multiple `RangeFrag`s in this tree, not just a single one, and so it + // needs a way to accumulate the total evict-cost and evict-set for all + // the `RangeFrag`s it iterates over. + running_set: &mut SparseSetU<[VirtualRangeIx; 4]>, + running_cost: &mut SpillCost, + // The tree to search. + tree: &AVLTree<RangeFragAndRangeId>, + // The RangeFrag we want to accommodate. + pair_frag: &RangeFrag, + spill_cost_budget: &SpillCost, + allowed_to_evict: &IsAllowedToEvict, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, +) -> bool +where + IsAllowedToEvict: Fn(VirtualRangeIx) -> bool, +{ + let mut stack = SmallVec::<[u32; 32]>::new(); + assert!(tree.root != AVL_NULL); + stack.push(tree.root); + + while let Some(curr) = stack.pop() { + let curr_node = &tree.pool[curr as usize]; + let curr_node_item = &curr_node.item; + let curr_frag = &curr_node_item.frag; + + // Figure out whether `pair_frag` overlaps the root of the current + // subtree. + let overlaps_curr = pair_frag.last >= curr_frag.first && pair_frag.first <= curr_frag.last; + + // Let's first consider the current node. If we need it but it's not + // evictable, we might as well stop now. 
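Two details of the traversal below are easy to get wrong, so here they are in isolation: the closed-interval overlap test, and the rule for deciding which subtrees can still contain an overlapping fragment. This is an editor's sketch with plain integer pairs standing in for RangeFrag endpoints, not crate code.

// Closed intervals [first, last]: they overlap exactly when neither lies
// entirely to one side of the other.
fn overlaps(a: (u32, u32), b: (u32, u32)) -> bool {
    a.1 >= b.0 && a.0 <= b.1
}

// In a tree ordered by (non-overlapping) interval position, the left subtree
// can only hold an overlap if the probe starts before the current node, and
// the right subtree only if the probe ends after it.
fn must_check(probe: (u32, u32), curr: (u32, u32)) -> (bool, bool) {
    (probe.0 < curr.0, probe.1 > curr.1)
}

fn main() {
    assert!(overlaps((3, 7), (7, 9)));  // a shared endpoint counts as overlap
    assert!(!overlaps((3, 6), (7, 9))); // strictly disjoint
    assert!(overlaps((1, 9), (4, 5)));  // containment is overlap too
    assert_eq!(must_check((3, 7), (5, 6)), (true, true));
    assert_eq!(must_check((5, 6), (3, 9)), (false, false));
}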
+ if overlaps_curr { + // This frag is committed to a real range, not a virtual one, and hence is not + // evictable. + if curr_node_item.id.is_real() { + return false; + } + // Maybe this one is a spill range, in which case, it can't be + // evicted. + let vlrix_to_evict = curr_node_item.id.to_virtual(); + let vlr_to_evict = &vlr_env[vlrix_to_evict]; + if vlr_to_evict.spill_cost.is_infinite() { + return false; + } + // Check that this range alone doesn't exceed our total spill + // cost. NB: given the check XXX below, this isn't actually + // necessary; however it means that we avoid doing two + // SparseSet::contains operations before exiting. This saves + // around 0.3% instruction count for large inputs. + if !vlr_to_evict.spill_cost.is_less_than(spill_cost_budget) { + return false; + } + // Maybe our caller doesn't want us to evict this one. + if !allowed_to_evict(vlrix_to_evict) { + return false; + } + // Ok! We can evict the current node. Update the running state + // accordingly. Note that we may be presented with the same VLRIx + // to evict multiple times, so we must be careful to add the cost + // of it only once. + if !running_set.contains(vlrix_to_evict) { + let mut tmp_cost = *running_cost; + tmp_cost.add(&vlr_to_evict.spill_cost); + // See above XXX + if !tmp_cost.is_less_than(spill_cost_budget) { + return false; + } + *running_cost = tmp_cost; + running_set.insert(vlrix_to_evict); + } + } + + // Now figure out if we need to visit the subtrees, and if so push the + // relevant nodes. Whether we visit the left or right subtree first + // is unimportant, at least from a correctness perspective. + let must_check_left = pair_frag.first < curr_frag.first; + if must_check_left { + let left_of_curr = tree.pool[curr as usize].left; + if left_of_curr != AVL_NULL { + stack.push(left_of_curr); + } + } + + let must_check_right = pair_frag.last > curr_frag.last; + if must_check_right { + let right_of_curr = tree.pool[curr as usize].right; + if right_of_curr != AVL_NULL { + stack.push(right_of_curr); + } + } + } + + // If we get here, it means that `pair_frag` can be accommodated if we + // evict all the frags it overlaps in `tree`. + // + // Stay sane .. + assert!(running_cost.is_finite()); + assert!(running_cost.is_less_than(spill_cost_budget)); + true +} + +impl PerRealReg { + // Find the set of VirtualRangeIxs that would need to be evicted in order to + // allocate `would_like_to_add` to this register. Virtual ranges mentioned + // in `do_not_evict` must not be considered as candidates for eviction. + // Also returns the total associated spill cost. That spill cost cannot be + // infinite. + // + // This can fail (return None) for four different reasons: + // + // - `would_like_to_add` interferes with a real-register-live-range + // commitment, so the register would be unavailable even if we evicted + // *all* virtual ranges assigned to it. + // + // - `would_like_to_add` interferes with a virtual range which is a spill + // range (has infinite cost). We cannot evict those without risking + // non-termination of the overall allocation algorithm. + // + // - `would_like_to_add` interferes with a virtual range listed in + // `do_not_evict`. Our caller uses this mechanism when trying to do + // coalesing, to avoid the nonsensicality of evicting some part of a + // virtual live range group in order to allocate a member of the same + // group. + // + // - The total spill cost of the candidate set exceeds the spill cost of + // `would_like_to_add`. 
This means that spilling them would be a net loss + // per our cost model. Note that `would_like_to_add` may have an infinite + // spill cost, in which case it will "win" over all other + // non-infinite-cost eviction candidates. This is by design (so as to + // guarantee that we can always allocate spill/reload bridges). + #[inline(never)] + fn find_evict_set<IsAllowedToEvict>( + &self, + would_like_to_add: VirtualRangeIx, + allowed_to_evict: &IsAllowedToEvict, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + ) -> Option<(SparseSetU<[VirtualRangeIx; 4]>, SpillCost)> + where + IsAllowedToEvict: Fn(VirtualRangeIx) -> bool, + { + // Firstly, if the commitment tree is for this reg is empty, we can + // declare success immediately. + if self.committed.tree.root == AVL_NULL { + let evict_set = SparseSetU::<[VirtualRangeIx; 4]>::empty(); + let evict_cost = SpillCost::zero(); + return Some((evict_set, evict_cost)); + } + + // The tree isn't empty, so we will have to do this the hard way: iterate + // over all fragments in `would_like_to_add` and check them against the + // tree. + + // Useful constants for the main loop + let would_like_to_add_vlr = &vlr_env[would_like_to_add]; + let evict_cost_budget = would_like_to_add_vlr.spill_cost; + // Note that `evict_cost_budget` can be infinite because + // `would_like_to_add` might be a spill/reload range. + + // The overall evict set and cost so far. These are updated as we iterate + // over the fragments that make up `would_like_to_add`. + let mut running_set = SparseSetU::<[VirtualRangeIx; 4]>::empty(); + let mut running_cost = SpillCost::zero(); + + // "wlta" = would like to add + for wlta_frag in &would_like_to_add_vlr.sorted_frags.frags { + let wlta_frag_ok = search_commitment_tree( + &mut running_set, + &mut running_cost, + &self.committed.tree, + &wlta_frag, + &evict_cost_budget, + allowed_to_evict, + vlr_env, + ); + if !wlta_frag_ok { + // This fragment won't fit, for one of the four reasons listed + // above. So give up now. + return None; + } + // And move on to the next fragment. 
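Stripped of the tree traversal, the eviction search above boils down to accumulating a de-duplicated candidate set and its total spill cost, abandoning the attempt as soon as a candidate is unevictable or the running cost reaches the budget. A simplified sketch of that accumulation follows, with indices, f32 costs and a single boolean standing in for VirtualRangeIx, SpillCost and the real-range/spill-range/do-not-evict checks; it is an editor's illustration, not crate code.

use std::collections::HashSet;

// Each candidate is (range index, finite spill cost, evictable?); `budget` is
// the spill cost of the range we would like to place. Returns None when
// eviction is impossible or would not be a net win.
fn build_evict_set(
    candidates: &[(usize, f32, bool)],
    budget: f32,
) -> Option<(HashSet<usize>, f32)> {
    let mut set = HashSet::new();
    let mut cost = 0.0f32;
    for &(ix, c, evictable) in candidates {
        if !evictable {
            return None; // real-range, spill-range, or caller-forbidden overlap
        }
        // The same range may be met several times; count its cost only once.
        if set.insert(ix) {
            cost += c;
            if cost >= budget {
                return None; // evicting would cost at least as much as we gain
            }
        }
    }
    Some((set, cost))
}

fn main() {
    let cands = [(0usize, 1.0f32, true), (1, 2.5, true), (0, 1.0, true)];
    assert_eq!(build_evict_set(&cands, 10.0).map(|(_, c)| c), Some(3.5));
    assert!(build_evict_set(&cands, 3.0).is_none());
}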
+ } + + // If we got here, it means that `would_like_to_add` can be accommodated \o/ + assert!(running_cost.is_finite()); + assert!(running_cost.is_less_than(&evict_cost_budget)); + Some((running_set, running_cost)) + } + + #[allow(dead_code)] + #[inline(never)] + fn show1_with_envs(&self, _frag_env: &TypedIxVec<RangeFragIx, RangeFrag>) -> String { + //"in_use: ".to_string() + &self.committed.show_with_frag_env(&frag_env) + "(show1_with_envs:FIXME)".to_string() + } + #[allow(dead_code)] + #[inline(never)] + fn show2_with_envs(&self, _frag_env: &TypedIxVec<RangeFragIx, RangeFrag>) -> String { + //"assigned: ".to_string() + &format!("{:?}", &self.vlrixs_assigned) + "(show2_with_envs:FIXME)".to_string() + } +} + +//============================================================================= +// Printing the allocator's top level state + +#[inline(never)] +fn print_RA_state( + who: &str, + _universe: &RealRegUniverse, + // State components + prioQ: &VirtualRangePrioQ, + _perRealReg: &Vec<PerRealReg>, + edit_list_move: &Vec<EditListItem>, + edit_list_other: &Vec<EditListItem>, + // The context (environment) + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + _frag_env: &TypedIxVec<RangeFragIx, RangeFrag>, +) { + debug!("<<<<====---- RA state at '{}' ----====", who); + //for ix in 0..perRealReg.len() { + // if !&perRealReg[ix].committed.pairs.is_empty() { + // debug!( + // "{:<5} {}", + // universe.regs[ix].1, + // &perRealReg[ix].show1_with_envs(&frag_env) + // ); + // debug!(" {}", &perRealReg[ix].show2_with_envs(&frag_env)); + // debug!(""); + // } + //} + if !prioQ.is_empty() { + for s in prioQ.show_with_envs(vlr_env) { + debug!("{}", s); + } + } + for eli in edit_list_move { + debug!("ELI MOVE: {:?}", eli); + } + for eli in edit_list_other { + debug!("ELI other: {:?}", eli); + } + debug!(">>>>"); +} + +//============================================================================= +// Reftype/stackmap support + +// This creates the artefacts for a safepoint/stackmap at some insn `iix`: the set of reftyped +// spill slots, the spills to be placed at `iix.r` (yes, you read that right) and the reloads to +// be placed at `iix.s`. +// +// This consults: +// +// * the commitment maps, to figure out which real registers are live and reftyped at `iix.u`. +// +// * the spillslot allocator, to figure out which spill slots are live and reftyped at `iix.u`. +// +// This may fail, meaning the request is in some way nonsensical; failure is propagated upwards. + +fn get_stackmap_artefacts_at( + spill_slot_allocator: &mut SpillSlotAllocator, + univ: &RealRegUniverse, + reftype_class: RegClass, + reg_vecs_and_bounds: &RegVecsAndBounds, + per_real_reg: &Vec<PerRealReg>, + rlr_env: &TypedIxVec<RealRangeIx, RealRange>, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + iix: InstIx, +) -> Result<(Vec<InstToInsert>, Vec<InstToInsert>, Vec<SpillSlot>), RegAllocError> { + // From a code generation perspective, what we need to compute is: + // + // * Sbefore: real regs that are live at `iix.u`, that are reftypes + // + // * Safter: Sbefore - real regs written by `iix` + // + // Then: + // + // * for r in Sbefore . add "spill r" at `iix.r` *after* all the reloads that are already + // there + // + // * for r in Safter . add "reload r" at `iix.s` *before* all the spills that are already + // there + // + // Once those spills have been "recorded" by the `spill_slot_allocator`, we can then ask it + // to tell us all the reftyped spill slots at `iix.u`, and that's our stackmap! 
This routine + // only computes the stackmap and the vectors of spills and reloads. It doesn't deal with + // interleaving them into the final code sequence. + // + // Note that this scheme isn't as runtime-inefficient as it sounds, at least in the + // SpiderMonkey use case and where `iix` is a call insn. That's because SM's calling + // convention has no callee saved registers. Hence "real regs written by `iix`" will be + // "all real regs" and so Safter will be empty. And Sbefore is in any case pretty small. + // + // (/me thinks ..) hmm, if Safter is empty, then what is the point of dumping Sbefore on the + // stack before the GC? For r in Sbefore, either r is the only reference to some object, in + // which case there's no point in presenting that ref to the GC since r is dead after call, + // or r isn't the only ref to the object, in which case some other ref to it must exist + // elsewhere in the stack, and that will keep the object alive. Maybe this needs a rethink. + // Maybe the spills before the call should be only for the set Safter? + + let pt = InstPoint::new_use(iix); + + // Compute Sbefore. + + // FIXME change this to SparseSet + let mut s_before = Set::<RealReg>::empty(); + + let rci = univ.allocable_by_class[reftype_class.rc_to_usize()]; + if rci.is_none() { + return Err(RegAllocError::Other( + "stackmap request: no regs in specified reftype class".to_string(), + )); + } + let rci = rci.unwrap(); + + debug!("computing stackmap info at {:?}", pt); + + for rreg_no in rci.first..rci.last + 1 { + // Get the RangeId, if any, assigned for `rreg_no` at `iix.u`. From that we can figure + // out if it is reftyped. + let mb_range_id = per_real_reg[rreg_no].committed.lookup_inst_point(pt); + if let Some(range_id) = mb_range_id { + // `rreg_no` is live at `iix.u`. + let is_ref = if range_id.is_real() { + debug!( + " real reg {:?} is real-range {:?}", + rreg_no, + rlr_env[range_id.to_real()] + ); + rlr_env[range_id.to_real()].is_ref + } else { + debug!( + " real reg {:?} is virtual-range {:?}", + rreg_no, + vlr_env[range_id.to_virtual()] + ); + vlr_env[range_id.to_virtual()].is_ref + }; + if is_ref { + // Finally .. we know that `rreg_no` is reftyped and live at `iix.u`. + let rreg = univ.regs[rreg_no].0; + s_before.insert(rreg); + } + } + } + + debug!("Sbefore = {:?}", s_before); + + // Compute Safter. + + let mut s_after = s_before.clone(); + let bounds = ®_vecs_and_bounds.bounds[iix]; + if bounds.mods_len != 0 { + // Only the GC is allowed to modify reftyped regs at this insn! + return Err(RegAllocError::Other( + "stackmap request: safepoint insn modifies a reftyped reg".to_string(), + )); + } + + for i in bounds.defs_start..bounds.defs_start + bounds.defs_len as u32 { + let r_defd = reg_vecs_and_bounds.vecs.defs[i as usize]; + if r_defd.is_real() && r_defd.get_class() == reftype_class { + s_after.delete(r_defd.to_real_reg()); + } + } + + debug!("Safter = {:?}", s_before); + + // Create the spill insns, as defined by Sbefore. This has the side effect of recording the + // spill in `spill_slot_allocator`, so we can later ask it to tell us all the reftyped spill + // slots. 
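In set terms the computation described above is small: Sbefore is the set of reftyped real regs live at iix.u, Safter is Sbefore minus the regs the instruction writes, everything in Sbefore is spilled at iix.r, and everything in Safter is reloaded at iix.s. A sketch with integer register numbers and std sets (editor's illustration, not crate code):

use std::collections::BTreeSet;

// `live_reftyped` models the reftyped real regs found live at iix.u via the
// commitment maps; `defs` models the real regs the instruction writes.
fn stackmap_sets(
    live_reftyped: &BTreeSet<u32>,
    defs: &BTreeSet<u32>,
) -> (BTreeSet<u32>, BTreeSet<u32>) {
    let s_before = live_reftyped.clone();
    let s_after: BTreeSet<u32> = s_before.difference(defs).cloned().collect();
    // Spill everything in s_before at iix.r; reload everything in s_after at iix.s.
    (s_before, s_after)
}

fn main() {
    let live: BTreeSet<u32> = vec![1, 3, 5].into_iter().collect();
    let defs: BTreeSet<u32> = vec![3].into_iter().collect();
    let (before, after) = stackmap_sets(&live, &defs);
    assert_eq!(before.len(), 3);
    assert!(after.contains(&1) && after.contains(&5) && !after.contains(&3));
}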
+ + let frag = RangeFrag::new(InstPoint::new_reload(iix), InstPoint::new_spill(iix)); + + let mut spill_insns = Vec::<InstToInsert>::new(); + let mut where_reg_got_spilled_to = Map::<RealReg, SpillSlot>::default(); + + for from_reg in s_before.iter() { + let to_slot = spill_slot_allocator.alloc_reftyped_spillslot_for_frag(frag.clone()); + let spill = InstToInsert::Spill { + to_slot, + from_reg: *from_reg, + for_vreg: None, // spill isn't associated with any virtual reg + }; + spill_insns.push(spill); + // We also need to remember where we stashed it, so we can reload it, if it is in Safter. + if s_after.contains(*from_reg) { + where_reg_got_spilled_to.insert(*from_reg, to_slot); + } + } + + // Create the reload insns, as defined by Safter. Except, we might as well use the map we + // just made, since its domain is the same as Safter. + + let mut reload_insns = Vec::<InstToInsert>::new(); + + for (to_reg, from_slot) in where_reg_got_spilled_to.iter() { + let reload = InstToInsert::Reload { + to_reg: Writable::from_reg(*to_reg), + from_slot: *from_slot, + for_vreg: None, // reload isn't associated with any virtual reg + }; + reload_insns.push(reload); + } + + // And finally .. round up all the reftyped spill slots. That includes both "normal" spill + // slots that happen to hold reftyped values, as well as the "extras" we created here, to + // hold values of reftyped regs that are live over this instruction. + + let reftyped_spillslots = spill_slot_allocator.get_reftyped_spillslots_at_inst_point(pt); + + debug!("reftyped_spillslots = {:?}", reftyped_spillslots); + + // And we're done! + + Ok((spill_insns, reload_insns, reftyped_spillslots)) +} + +//============================================================================= +// Allocator top level + +/* (const) For each virtual live range + - its sorted RangeFrags + - its total size + - its spill cost + - (eventually) its assigned real register + New VirtualRanges are created as we go, but existing ones are unchanged, + apart from being tagged with its assigned real register. + + (mut) For each real register + - the sorted RangeFrags assigned to it (todo: rm the M) + - the virtual LR indices assigned to it. This is so we can eject + a VirtualRange from the commitment, as needed + + (mut) the set of VirtualRanges not yet allocated, prioritised by total size + + (mut) the edit list that is produced +*/ +/* +fn show_commit_tab(commit_tab: &Vec::<SortedRangeFragIxs>, + who: &str, + context: &TypedIxVec::<RangeFragIx, RangeFrag>) -> String { + let mut res = "Commit Table at '".to_string() + + &who.to_string() + &"'\n".to_string(); + let mut rregNo = 0; + for smf in commit_tab.iter() { + res += &" ".to_string(); + res += &mkRealReg(rregNo).show(); + res += &" ".to_string(); + res += &smf.show_with_fenv(&context); + res += &"\n".to_string(); + rregNo += 1; + } + res +} +*/ + +// VirtualRanges created by spilling all pertain to a single InstIx. But +// within that InstIx, there are three kinds of "bridges": +#[derive(Clone, Copy, PartialEq)] +pub(crate) enum BridgeKind { + RtoU, // A bridge for a USE. This connects the reload to the use. + RtoS, // a bridge for a MOD. This connects the reload, the use/def + // and the spill, since the value must first be reloade, then + // operated on, and finally re-spilled. + DtoS, // A bridge for a DEF. This connects the def to the spill. 
+} + +impl fmt::Debug for BridgeKind { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + BridgeKind::RtoU => write!(fmt, "R->U"), + BridgeKind::RtoS => write!(fmt, "R->S"), + BridgeKind::DtoS => write!(fmt, "D->S"), + } + } +} + +#[derive(Clone, Copy)] +struct EditListItem { + // This holds enough information to create a spill or reload instruction, + // or both, and also specifies where in the instruction stream it/they + // should be added. Note that if the edit list as a whole specifies + // multiple items for the same location, then it is assumed that the order + // in which they execute isn't important. + // + // Some of the relevant info can be found via the VirtualRangeIx link: + // (1) the real reg involved + // (2) the place where the insn should go, since the VirtualRange should + // only have one RangeFrag, and we can deduce the correct location + // from that. + // Despite (2) we also carry here the InstIx of the affected instruction + // (there should be only one) since computing it via (2) is expensive. + // This however gives a redundancy in representation against (2). Beware! + slot: SpillSlot, + vlrix: VirtualRangeIx, + kind: BridgeKind, + iix: InstIx, +} + +impl fmt::Debug for EditListItem { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "(ELI: at {:?} for {:?} add {:?}, slot={:?})", + self.iix, self.vlrix, self.kind, self.slot + ) + } +} + +// Allocator top level. This function returns a result struct that contains +// the final sequence of instructions, possibly with fills/spills/moves +// spliced in and redundant moves elided, and with all virtual registers +// replaced with real registers. Allocation can fail if there are insufficient +// registers to even generate spill/reload code, or if the function appears to +// have any undefined VirtualReg/RealReg uses. + +#[inline(never)] +pub fn alloc_main<F: Function>( + func: &mut F, + reg_universe: &RealRegUniverse, + stackmap_request: Option<&StackmapRequestInfo>, + use_checker: bool, + opts: &BacktrackingOptions, +) -> Result<RegAllocResult<F>, RegAllocError> { + // -------- Initial arrangements for stackmaps -------- + let empty_vec_vregs = vec![]; + let empty_vec_iixs = vec![]; + let (client_wants_stackmaps, reftype_class, reftyped_vregs, safepoint_insns) = + match stackmap_request { + Some(&StackmapRequestInfo { + reftype_class, + ref reftyped_vregs, + ref safepoint_insns, + }) => (true, reftype_class, reftyped_vregs, safepoint_insns), + None => (false, RegClass::INVALID, &empty_vec_vregs, &empty_vec_iixs), + }; + + // -------- Perform initial liveness analysis -------- + // Note that the analysis phase can fail; hence we propagate any error. 
+ let AnalysisInfo { + reg_vecs_and_bounds, + real_ranges: rlr_env, + virtual_ranges: mut vlr_env, + range_frags: frag_env, + range_metrics: frag_metrics_env, + estimated_frequencies: est_freqs, + inst_to_block_map, + reg_to_ranges_maps: mb_reg_to_ranges_maps, + move_info: mb_move_info, + } = run_analysis( + func, + reg_universe, + AlgorithmWithDefaults::Backtracking, + client_wants_stackmaps, + reftype_class, + reftyped_vregs, + ) + .map_err(|err| RegAllocError::Analysis(err))?; + + assert!(reg_vecs_and_bounds.is_sanitized()); + assert!(frag_env.len() == frag_metrics_env.len()); + assert!(mb_reg_to_ranges_maps.is_some()); // ensured by `run_analysis` + assert!(mb_move_info.is_some()); // ensured by `run_analysis` + let reg_to_ranges_maps = mb_reg_to_ranges_maps.unwrap(); + let move_info = mb_move_info.unwrap(); + + // Also perform analysis that finds all coalescing opportunities. + let coalescing_info = do_coalescing_analysis( + func, + ®_universe, + &rlr_env, + &mut vlr_env, + &frag_env, + ®_to_ranges_maps, + &move_info, + ); + let mut hints: TypedIxVec<VirtualRangeIx, SmallVec<[Hint; 8]>> = coalescing_info.0; + let vlrEquivClasses: UnionFindEquivClasses<VirtualRangeIx> = coalescing_info.1; + let is_vv_boundary_move: TypedIxVec<InstIx, bool> = coalescing_info.2; + assert!(hints.len() == vlr_env.len()); + + // -------- Alloc main -------- + + // Create initial state + info!("alloc_main: begin"); + info!( + "alloc_main: in: {} insns in {} blocks", + func.insns().len(), + func.blocks().len() + ); + let num_vlrs_initial = vlr_env.len(); + info!( + "alloc_main: in: {} VLRs, {} RLRs", + num_vlrs_initial, + rlr_env.len() + ); + + // This is fully populated by the ::new call. + let mut prioQ = VirtualRangePrioQ::new(&vlr_env); + + // Whereas this is empty. We have to populate it "by hand", by + // effectively cloning the allocatable part (prefix) of the universe. + let mut per_real_reg = Vec::<PerRealReg>::new(); + for _ in 0..reg_universe.allocable { + // Doing this instead of simply .resize avoids needing Clone for + // PerRealReg + per_real_reg.push(PerRealReg::new()); + } + for (rlrix_no, rlr) in rlr_env.iter().enumerate() { + let rlrix = RealRangeIx::new(rlrix_no as u32); + let rregIndex = rlr.rreg.get_index(); + // Ignore RealRanges for RealRegs that are not part of the allocatable + // set. As far as the allocator is concerned, such RealRegs simply + // don't exist. + if rregIndex >= reg_universe.allocable { + continue; + } + per_real_reg[rregIndex].add_RealRange(rlrix, &rlr_env, &frag_env); + } + + let mut edit_list_move = Vec::<EditListItem>::new(); + let mut edit_list_other = Vec::<EditListItem>::new(); + if log_enabled!(Level::Debug) { + debug!(""); + print_RA_state( + "Initial", + ®_universe, + &prioQ, + &per_real_reg, + &edit_list_move, + &edit_list_other, + &vlr_env, + &frag_env, + ); + } + + // This is also part of the running state. `vlr_slot_env` tells us the + // assigned spill slot for each VirtualRange, if any. + // `spill_slot_allocator` decides on the assignments and writes them into + // `vlr_slot_env`. + let mut vlr_slot_env = TypedIxVec::<VirtualRangeIx, Option<SpillSlot>>::new(); + vlr_slot_env.resize(num_vlrs_initial, None); + let mut spill_slot_allocator = SpillSlotAllocator::new(); + + // Main allocation loop. 
Each time round, pull out the longest + // unallocated VirtualRange, and do one of three things: + // + // * allocate it to a RealReg, possibly by ejecting some existing + // allocation, but only one with a lower spill cost than this one, or + // + // * spill it. This causes the VirtualRange to disappear. It is replaced + // by a set of very short VirtualRanges to carry the spill and reload + // values. Or, + // + // * split it. This causes it to disappear but be replaced by two + // VirtualRanges which together constitute the original. + debug!(""); + debug!("-- MAIN ALLOCATION LOOP (DI means 'direct', CO means 'coalesced'):"); + + info!("alloc_main: main allocation loop: begin"); + + // ======== BEGIN Main allocation loop ======== + let mut num_vlrs_processed = 0; // stats only + let mut num_vlrs_spilled = 0; // stats only + let mut num_vlrs_evicted = 0; // stats only + + 'main_allocation_loop: loop { + debug!("-- still TODO {}", prioQ.len()); + if false { + if log_enabled!(Level::Debug) { + debug!(""); + print_RA_state( + "Loop Top", + ®_universe, + &prioQ, + &per_real_reg, + &edit_list_move, + &edit_list_other, + &vlr_env, + &frag_env, + ); + debug!(""); + } + } + + let mb_curr_vlrix = prioQ.get_longest_VirtualRange(); + if mb_curr_vlrix.is_none() { + break 'main_allocation_loop; + } + + num_vlrs_processed += 1; + let curr_vlrix = mb_curr_vlrix.unwrap(); + let curr_vlr = &vlr_env[curr_vlrix]; + + debug!("-- considering {:?}: {:?}", curr_vlrix, curr_vlr); + + assert!(curr_vlr.vreg.to_reg().is_virtual()); + assert!(curr_vlr.rreg.is_none()); + let curr_vlr_regclass = curr_vlr.vreg.get_class(); + let curr_vlr_rc = curr_vlr_regclass.rc_to_usize(); + + // ====== BEGIN Try to do coalescing ====== + // + // First, look through the hints for `curr_vlr`, collecting up candidate + // real regs, in decreasing order of preference, in `hinted_regs`. Note + // that we don't have to consider the weights here, because the coalescing + // analysis phase has already sorted the hints for the VLR so as to + // present the most favoured (weighty) first, so we merely need to retain + // that ordering when copying into `hinted_regs`. + assert!(hints.len() == vlr_env.len()); + let mut hinted_regs = SmallVec::<[RealReg; 8]>::new(); + + // === BEGIN collect all hints for `curr_vlr` === + // `hints` has one entry per VLR, but only for VLRs which existed + // initially (viz, not for any created by spilling/splitting/whatever). + // Similarly, `vlrEquivClasses` can only map VLRs that existed initially, + // and will panic otherwise. Hence the following check: + if curr_vlrix.get() < hints.len() { + for hint in &hints[curr_vlrix] { + // BEGIN for each hint + let mb_cand = match hint { + Hint::SameAs(other_vlrix, _weight) => { + // It wants the same reg as some other VLR, but we can only honour + // that if the other VLR actually *has* a reg at this point. Its + // `rreg` field will tell us exactly that. + vlr_env[*other_vlrix].rreg + } + Hint::Exactly(rreg, _weight) => Some(*rreg), + }; + // So now `mb_cand` might have a preferred real reg. If so, add it to + // the list of cands. De-dup as we go, since that is way cheaper than + // effectively doing the same via repeated lookups in the + // CommitmentMaps. 
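The hint-processing step that follows reduces to: resolve each hint to a concrete register if possible (an Exactly hint names one directly, while a SameAs hint only helps if the other range already has an assignment), then collect the results de-duplicated and in preference order. The sketch below uses stripped-down stand-ins for the crate's Hint type (weights omitted); it is an editor's illustration, not crate code.

// A hint either names a concrete register or defers to whatever another
// (possibly already-assigned) range was given.
enum Hint {
    Exactly(u32),
    SameAs(usize),
}

// `assigned[i]` is the register already given to range i, if any. Hints are
// assumed pre-sorted by decreasing weight, so output order is preference order.
fn hinted_regs(hints: &[Hint], assigned: &[Option<u32>]) -> Vec<u32> {
    let mut out = Vec::new();
    for h in hints {
        let cand = match h {
            Hint::Exactly(r) => Some(*r),
            Hint::SameAs(ix) => assigned[*ix],
        };
        if let Some(r) = cand {
            if !out.iter().any(|c| *c == r) {
                out.push(r); // de-dup while keeping preference order
            }
        }
    }
    out
}

fn main() {
    let assigned = vec![Some(4u32), None, Some(7)];
    let hints = vec![Hint::SameAs(2), Hint::Exactly(4), Hint::SameAs(1), Hint::SameAs(0)];
    assert_eq!(hinted_regs(&hints, &assigned), vec![7, 4]);
}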
+ if let Some(rreg) = mb_cand { + if !hinted_regs.iter().any(|r| *r == rreg) { + hinted_regs.push(rreg); + } + } + // END for each hint + } + + // At this point, we have in `hinted_regs`, the hint candidates that + // arise from copies between `curr_vlr` and its immediate neighbouring + // VLRs or RLRs, in order of declining preference. And that is a good + // start. + // + // However, it may be the case that there is some other VLR which + // is in the same equivalence class as `curr_vlr`, but is not a + // direct neighbour, and which has already been assigned a + // register. We really ought to take those into account too, as + // the least-preferred candidates. Hence we need to iterate over + // the equivalence class and "round up the secondary candidates." + // + // Note that the equivalence class might contain VirtualRanges + // that are mutually overlapping. That is handled correctly, + // since we always consult the relevant CommitmentMaps (in the + // PerRealRegs) to detect interference. To more fully understand + // this, see the big block comment at the top of + // bt_coalescing_analysis.rs. + let n_primary_cands = hinted_regs.len(); + + // Work the equivalence class set for `curr_vlrix` to pick up any + // rreg hints. Equivalence class info exists only for "initial" VLRs. + if curr_vlrix.get() < num_vlrs_initial { + // `curr_vlrix` is an "initial" VLR. + for vlrix in vlrEquivClasses.equiv_class_elems_iter(curr_vlrix) { + if vlrix != curr_vlrix { + if let Some(rreg) = vlr_env[vlrix].rreg { + // Add `rreg` as a cand, if we don't already have it. + if !hinted_regs.iter().any(|r| *r == rreg) { + hinted_regs.push(rreg); + } + } + } + } + + // Sort the secondary cands, so as to try and impose more consistency + // across the group. I don't know if this is worthwhile, but it seems + // sensible. + hinted_regs[n_primary_cands..].sort_by(|rreg1, rreg2| { + rreg1.get_index().partial_cmp(&rreg2.get_index()).unwrap() + }); + } + + if log_enabled!(Level::Debug) { + if !hinted_regs.is_empty() { + let mut candStr = "pri {".to_string(); + for (rreg, n) in hinted_regs.iter().zip(0..) { + if n == n_primary_cands { + candStr = candStr + &" } sec {".to_string(); + } + candStr = + candStr + &" ".to_string() + ®_universe.regs[rreg.get_index()].1; + } + candStr = candStr + &" }"; + debug!("-- CO candidates {}", candStr); + } + } + } + // === END collect all hints for `curr_vlr` === + + // === BEGIN try to use the hints for `curr_vlr` === + // Now work through the list of preferences, to see if we can honour any + // of them. + for rreg in &hinted_regs { + let rregNo = rreg.get_index(); + + // Find the set of ranges which we'd have to evict in order to honour + // this hint. In the best case the set will be empty. In the worst + // case, we will get None either because (1) it would require evicting a + // spill range, which is disallowed so as to guarantee termination of + // the algorithm, or (2) because it would require evicting a real reg + // live range, which we can't do. + // + // We take care not to evict any range which is in the same equivalence + // class as `curr_vlr` since those are (by definition) connected to + // `curr_vlr` via V-V copies, and so evicting any of them would be + // counterproductive from the point of view of removing copies. 
+ + let mb_evict_info: Option<(SparseSetU<[VirtualRangeIx; 4]>, SpillCost)> = + per_real_reg[rregNo].find_evict_set( + curr_vlrix, + &|vlrix_to_evict| { + // What this means is: don't evict `vlrix_to_evict` if + // it is in the same equivalence class as `curr_vlrix` + // (the VLR which we're trying to allocate) AND we + // actually know the equivalence classes for both + // (hence the `Some`). Spill/reload ("non-original") + // VLRS don't have entries in `vlrEquivClasses`. + vlrEquivClasses.in_same_equivalence_class(vlrix_to_evict, curr_vlrix) + != Some(true) + }, + &vlr_env, + ); + if let Some((vlrixs_to_evict, total_evict_cost)) = mb_evict_info { + // Stay sane #1 + assert!(curr_vlr.rreg.is_none()); + // Stay sane #2 + assert!(vlrixs_to_evict.is_empty() == total_evict_cost.is_zero()); + // Can't evict if any in the set are spill ranges + assert!(total_evict_cost.is_finite()); + // Ensure forward progress + assert!(total_evict_cost.is_less_than(&curr_vlr.spill_cost)); + // Evict all evictees in the set + for vlrix_to_evict in vlrixs_to_evict.iter() { + // Ensure we're not evicting anything in `curr_vlrix`'s eclass. + // This should be guaranteed us by find_evict_set. + assert!( + vlrEquivClasses.in_same_equivalence_class(*vlrix_to_evict, curr_vlrix) + != Some(true) + ); + // Evict .. + debug!( + "-- CO evict {:?}: {:?}", + *vlrix_to_evict, &vlr_env[*vlrix_to_evict] + ); + per_real_reg[rregNo].del_VirtualRange(*vlrix_to_evict, &vlr_env); + prioQ.add_VirtualRange(&vlr_env, *vlrix_to_evict); + // Directly modify bits of vlr_env. This means we have to abandon + // the immutable borrow for curr_vlr, but that's OK -- we won't need + // it again (in this loop iteration). + debug_assert!(vlr_env[*vlrix_to_evict].rreg.is_some()); + vlr_env[*vlrix_to_evict].rreg = None; + num_vlrs_evicted += 1; + } + // .. and reassign. + debug!("-- CO alloc to {}", reg_universe.regs[rregNo].1); + per_real_reg[rregNo].add_VirtualRange(curr_vlrix, &vlr_env); + vlr_env[curr_vlrix].rreg = Some(*rreg); + // We're done! + continue 'main_allocation_loop; + } + } /* for rreg in hinted_regs */ + // === END try to use the hints for `curr_vlr` === + + // ====== END Try to do coalescing ====== + + // We get here if we failed to find a viable assignment by the process of + // looking at the coalescing hints. + // + // So: do almost exactly as we did in the "try for coalescing" stage + // above. Except, instead of trying each coalescing candidate + // individually, iterate over all the registers in the class, to find the + // one (if any) that has the lowest total evict cost. If we find one that + // has zero cost -- that is, we can make the assignment without evicting + // anything -- then stop the search at that point, since searching further + // is pointless. + + let (first_in_rc, last_in_rc) = match ®_universe.allocable_by_class[curr_vlr_rc] { + &None => { + return Err(RegAllocError::OutOfRegisters(curr_vlr_regclass)); + } + &Some(ref info) => (info.first, info.last), + }; + + let mut best_so_far: Option<( + /*rreg index*/ usize, + SparseSetU<[VirtualRangeIx; 4]>, + SpillCost, + )> = None; + + 'search_through_cand_rregs_loop: for rregNo in first_in_rc..last_in_rc + 1 { + //debug!("-- Cand {} ...", + // reg_universe.regs[rregNo].1); + + let mb_evict_info: Option<(SparseSetU<[VirtualRangeIx; 4]>, SpillCost)> = + per_real_reg[rregNo].find_evict_set( + curr_vlrix, + // We pass a closure that ignores its arg and returns `true`. + // Meaning, "we are not specifying any particular + // can't-be-evicted VLRs in this call." 
+ &|_vlrix_to_evict| true, + &vlr_env, + ); + // + //match mb_evict_info { + // None => debug!("-- Cand {}: Unavail", + // reg_universe.regs[rregNo].1), + // Some((ref evict_set, ref evict_cost)) => + // debug!("-- Cand {}: Avail, evict cost {:?}, set {:?}", + // reg_universe.regs[rregNo].1, evict_cost, evict_set) + //} + // + if let Some((cand_vlrixs_to_evict, cand_total_evict_cost)) = mb_evict_info { + // Stay sane .. + assert!(cand_vlrixs_to_evict.is_empty() == cand_total_evict_cost.is_zero()); + // We can't evict if any in the set are spill ranges, and + // find_evict_set should not offer us that possibility. + assert!(cand_total_evict_cost.is_finite()); + // Ensure forward progress + assert!(cand_total_evict_cost.is_less_than(&curr_vlr.spill_cost)); + // Update the "best so far". First, if the evict set is empty, then + // the candidate is by definition better than all others, and we will + // quit searching just below. + let mut cand_is_better = cand_vlrixs_to_evict.is_empty(); + if !cand_is_better { + if let Some((_, _, best_spill_cost)) = best_so_far { + // If we've already got a candidate, this one is better if it has + // a lower total spill cost. + if cand_total_evict_cost.is_less_than(&best_spill_cost) { + cand_is_better = true; + } + } else { + // We don't have any candidate so far, so what we just got is + // better (than nothing). + cand_is_better = true; + } + } + // Remember the candidate if required. + let cand_vlrixs_to_evict_is_empty = cand_vlrixs_to_evict.is_empty(); + if cand_is_better { + best_so_far = Some((rregNo, cand_vlrixs_to_evict, cand_total_evict_cost)); + } + // If we've found a no-evictions-necessary candidate, quit searching + // immediately. We won't find anything better. + if cand_vlrixs_to_evict_is_empty { + break 'search_through_cand_rregs_loop; + } + } + } // for rregNo in first_in_rc..last_in_rc + 1 { + + // Examine the results of the search. Did we find any usable candidate? + if let Some((rregNo, vlrixs_to_evict, total_spill_cost)) = best_so_far { + // We are still Totally Paranoid (tm) + // Stay sane #1 + debug_assert!(curr_vlr.rreg.is_none()); + // Can't spill a spill range + assert!(total_spill_cost.is_finite()); + // Ensure forward progress + assert!(total_spill_cost.is_less_than(&curr_vlr.spill_cost)); + // Now the same evict-reassign section as with the coalescing logic above. + // Evict all evictees in the set + for vlrix_to_evict in vlrixs_to_evict.iter() { + // Evict .. + debug!( + "-- DI evict {:?}: {:?}", + *vlrix_to_evict, &vlr_env[*vlrix_to_evict] + ); + per_real_reg[rregNo].del_VirtualRange(*vlrix_to_evict, &vlr_env); + prioQ.add_VirtualRange(&vlr_env, *vlrix_to_evict); + debug_assert!(vlr_env[*vlrix_to_evict].rreg.is_some()); + vlr_env[*vlrix_to_evict].rreg = None; + num_vlrs_evicted += 1; + } + // .. and reassign. + debug!("-- DI alloc to {}", reg_universe.regs[rregNo].1); + per_real_reg[rregNo].add_VirtualRange(curr_vlrix, &vlr_env); + let rreg = reg_universe.regs[rregNo].0; + vlr_env[curr_vlrix].rreg = Some(rreg); + // We're done! + continue 'main_allocation_loop; + } + + // Still no luck. We can't find a register to put it in, so we'll + // have to spill it, since splitting it isn't yet implemented. + debug!("-- spill"); + + // If the live range already pertains to a spill or restore, then + // it's Game Over. The constraints (availability of RealRegs vs + // requirement for them) are impossible to fulfill, and so we cannot + // generate any valid allocation for this function. 
+ if curr_vlr.spill_cost.is_infinite() { + return Err(RegAllocError::OutOfRegisters(curr_vlr_regclass)); + } + + // Generate a new spill slot number, S + /* Spilling vreg V with virtual live range VirtualRange to slot S: + for each frag F in VirtualRange { + for each insn I in F.first.iix .. F.last.iix { + if I does not mention V + continue + if I mentions V in a Read role { + // invar: I cannot mention V in a Mod role + add new VirtualRange I.reload -> I.use, + vreg V, spillcost Inf + add eli @ I.reload V SpillSlot + } + if I mentions V in a Mod role { + // invar: I cannot mention V in a Read or Write Role + add new VirtualRange I.reload -> I.spill, + Vreg V, spillcost Inf + add eli @ I.reload V SpillSlot + add eli @ I.spill SpillSlot V + } + if I mentions V in a Write role { + // invar: I cannot mention V in a Mod role + add new VirtualRange I.def -> I.spill, + vreg V, spillcost Inf + add eli @ I.spill V SpillSlot + } + } + } + */ + + // We will be spilling vreg `curr_vlr.reg` with VirtualRange `curr_vlr` to + // a spill slot that the spill slot allocator will choose for us, just a + // bit further down this function. + + // This holds enough info to create reload or spill (or both) + // instructions around an instruction that references a VirtualReg + // that has been spilled. + struct SpillAndOrReloadInfo { + bix: BlockIx, // that `iix` is in + iix: InstIx, // this is the Inst we are spilling/reloading for + kind: BridgeKind, // says whether to create a spill or reload or both + } + + // Most spills won't require anywhere near 32 entries, so this avoids + // almost all heap allocation. + let mut sri_vec = SmallVec::<[SpillAndOrReloadInfo; 32]>::new(); + + let curr_vlr_vreg = curr_vlr.vreg; + let curr_vlr_reg = curr_vlr_vreg.to_reg(); + let curr_vlr_is_ref = curr_vlr.is_ref; + + for frag in &curr_vlr.sorted_frags.frags { + for iix in frag.first.iix().dotdot(frag.last.iix().plus(1)) { + let (iix_uses_curr_vlr_reg, iix_defs_curr_vlr_reg, iix_mods_curr_vlr_reg) = + does_inst_use_def_or_mod_reg(®_vecs_and_bounds, iix, curr_vlr_reg); + // If this insn doesn't mention the vreg we're spilling for, + // move on. + if !iix_defs_curr_vlr_reg && !iix_mods_curr_vlr_reg && !iix_uses_curr_vlr_reg { + continue; + } + // USES: Do we need to create a reload-to-use bridge + // (VirtualRange) ? + if iix_uses_curr_vlr_reg && frag.contains(&InstPoint::new_use(iix)) { + debug_assert!(!iix_mods_curr_vlr_reg); + // Stash enough info that we can create a new VirtualRange + // and a new edit list entry for the reload. + let bix = inst_to_block_map.map(iix); + let sri = SpillAndOrReloadInfo { + bix, + iix, + kind: BridgeKind::RtoU, + }; + sri_vec.push(sri); + } + // MODS: Do we need to create a reload-to-spill bridge? This + // can only happen for instructions which modify a register. + // Note this has to be a single VirtualRange, since if it were + // two (one for the reload, one for the spill) they could + // later end up being assigned to different RealRegs, which is + // obviously nonsensical. + if iix_mods_curr_vlr_reg + && frag.contains(&InstPoint::new_use(iix)) + && frag.contains(&InstPoint::new_def(iix)) + { + debug_assert!(!iix_uses_curr_vlr_reg); + debug_assert!(!iix_defs_curr_vlr_reg); + let bix = inst_to_block_map.map(iix); + let sri = SpillAndOrReloadInfo { + bix, + iix, + kind: BridgeKind::RtoS, + }; + sri_vec.push(sri); + } + // DEFS: Do we need to create a def-to-spill bridge? 
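Before the DEF case below, it may help to see the three cases of this loop summarised in isolation: a use needs a reload-to-use bridge, a modify needs a single reload-to-spill bridge, and a def needs a def-to-spill bridge. (The real code additionally checks that the relevant use/def point lies inside the fragment.) An editor's sketch, not crate code:

#[derive(Debug, PartialEq)]
enum Bridge {
    RtoU, // reload -> use
    RtoS, // reload -> spill, for read-modify-write operands
    DtoS, // def -> spill
}

// For one instruction that mentions the spilled vreg, decide which bridge
// ranges (and hence which reloads/spills) are needed.
fn bridges_for(uses: bool, mods: bool, defs: bool) -> Vec<Bridge> {
    let mut out = Vec::new();
    if mods {
        // A modified operand needs the value both before and after the insn.
        out.push(Bridge::RtoS);
    } else {
        if uses {
            out.push(Bridge::RtoU);
        }
        if defs {
            out.push(Bridge::DtoS);
        }
    }
    out
}

fn main() {
    assert_eq!(bridges_for(true, false, false), vec![Bridge::RtoU]);
    assert_eq!(bridges_for(false, true, false), vec![Bridge::RtoS]);
    assert_eq!(bridges_for(true, false, true), vec![Bridge::RtoU, Bridge::DtoS]);
}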
+ if iix_defs_curr_vlr_reg && frag.contains(&InstPoint::new_def(iix)) { + debug_assert!(!iix_mods_curr_vlr_reg); + let bix = inst_to_block_map.map(iix); + let sri = SpillAndOrReloadInfo { + bix, + iix, + kind: BridgeKind::DtoS, + }; + sri_vec.push(sri); + } + } + } + + // Now that we no longer need to access `frag_env` or `vlr_env` for + // the remainder of this iteration of the main allocation loop, we can + // actually generate the required spill/reload artefacts. + + // First off, poke the spill slot allocator to get an intelligent choice + // of slot. Note that this will fail for "non-initial" VirtualRanges; but + // the only non-initial ones will have been created by spilling anyway, + // any we definitely shouldn't be trying to spill them again. Hence we + // can assert. + assert!(vlr_slot_env.len() == num_vlrs_initial); + assert!(curr_vlrix < VirtualRangeIx::new(num_vlrs_initial)); + if vlr_slot_env[curr_vlrix].is_none() { + // It hasn't been decided yet. Cause it to be so by asking for an + // allocation for the entire eclass that `curr_vlrix` belongs to. + spill_slot_allocator.alloc_spill_slots( + &mut vlr_slot_env, + func, + &vlr_env, + &vlrEquivClasses, + curr_vlrix, + ); + assert!(vlr_slot_env[curr_vlrix].is_some()); + } + let spill_slot_to_use = vlr_slot_env[curr_vlrix].unwrap(); + + // If we're spilling a reffy VLR, we'll need to tell the spillslot allocator that. The + // VLR will already have been allocated to some spill slot, and relevant RangeFrags in + // the slot should have already been reserved for it, by the above call to + // `alloc_spill_slots` (although possibly relating to a prior VLR in the same + // equivalence class, and not this one). However, those RangeFrags will have all been + // marked non-reffy, because we don't know, in general, at spillslot-allocation-time, + // whether a VLR will actually be spilled, and we don't want the resulting stack maps to + // mention stack entries which are dead at the point of the safepoint insn. Hence the + // need to update those RangeFrags pertaining to just this VLR -- now that we *know* + // it's going to be spilled. + if curr_vlr.is_ref { + spill_slot_allocator + .notify_spillage_of_reftyped_vlr(spill_slot_to_use, &curr_vlr.sorted_frags); + } + + for sri in sri_vec { + let (new_vlr_first_pt, new_vlr_last_pt) = match sri.kind { + BridgeKind::RtoU => (Point::Reload, Point::Use), + BridgeKind::RtoS => (Point::Reload, Point::Spill), + BridgeKind::DtoS => (Point::Def, Point::Spill), + }; + let new_vlr_frag = RangeFrag { + first: InstPoint::new(sri.iix, new_vlr_first_pt), + last: InstPoint::new(sri.iix, new_vlr_last_pt), + }; + debug!("-- new RangeFrag {:?}", &new_vlr_frag); + let new_vlr_sfrags = SortedRangeFrags::unit(new_vlr_frag); + let new_vlr = VirtualRange { + vreg: curr_vlr_vreg, + rreg: None, + sorted_frags: new_vlr_sfrags, + is_ref: curr_vlr_is_ref, // "inherit" refness + size: 1, + // Effectively infinite. We'll never look at this again anyway. + total_cost: 0xFFFF_FFFFu32, + spill_cost: SpillCost::infinite(), + }; + let new_vlrix = VirtualRangeIx::new(vlr_env.len() as u32); + debug!( + "-- new VirtRange {:?} := {:?}", + new_vlrix, &new_vlr + ); + vlr_env.push(new_vlr); + prioQ.add_VirtualRange(&vlr_env, new_vlrix); + + // BEGIN (optimisation only) see if we can create any coalescing hints + // for this new VLR. + let mut new_vlr_hint = SmallVec::<[Hint; 8]>::new(); + if is_vv_boundary_move[sri.iix] { + // Collect the src and dst regs for the move. 
It must be a + // move because `is_vv_boundary_move` claims that to be true. + let im = func.is_move(&func.get_insn(sri.iix)); + assert!(im.is_some()); + let (wdst_reg, src_reg): (Writable<Reg>, Reg) = im.unwrap(); + let dst_reg: Reg = wdst_reg.to_reg(); + assert!(src_reg.is_virtual() && dst_reg.is_virtual()); + let dst_vreg: VirtualReg = dst_reg.to_virtual_reg(); + let src_vreg: VirtualReg = src_reg.to_virtual_reg(); + let bridge_eef = est_freqs[sri.bix]; + match sri.kind { + BridgeKind::RtoU => { + // Reload-to-Use bridge. Hint that we want to be + // allocated to the same reg as the destination of the + // move. That means we have to find the VLR that owns + // the destination vreg. + for vlrix in ®_to_ranges_maps.vreg_to_vlrs_map[dst_vreg.get_index()] { + if vlr_env[*vlrix].vreg == dst_vreg { + new_vlr_hint.push(Hint::SameAs(*vlrix, bridge_eef)); + break; + } + } + } + BridgeKind::DtoS => { + // Def-to-Spill bridge. Hint that we want to be + // allocated to the same reg as the source of the + // move. + for vlrix in ®_to_ranges_maps.vreg_to_vlrs_map[src_vreg.get_index()] { + if vlr_env[*vlrix].vreg == src_vreg { + new_vlr_hint.push(Hint::SameAs(*vlrix, bridge_eef)); + break; + } + } + } + BridgeKind::RtoS => { + // A Reload-to-Spill bridge. This can't happen. It + // implies that the instruction modifies (both reads + // and writes) one of its operands, but the client's + // `is_move` function claims this instruction is a + // plain "move" (reads source, writes dest). These + // claims are mutually contradictory. + panic!("RtoS bridge for v-v boundary move"); + } + } + } + hints.push(new_vlr_hint); + // END see if we can create any coalescing hints for this new VLR. + + // Finally, create a new EditListItem. This holds enough + // information that a suitable spill or reload instruction can + // later be created. + let new_eli = EditListItem { + slot: spill_slot_to_use, + vlrix: new_vlrix, + kind: sri.kind, + iix: sri.iix, + }; + if is_vv_boundary_move[sri.iix] { + debug!("-- new ELI MOVE {:?}", &new_eli); + edit_list_move.push(new_eli); + } else { + debug!("-- new ELI other {:?}", &new_eli); + edit_list_other.push(new_eli); + } + } + + num_vlrs_spilled += 1; + // And implicitly "continue 'main_allocation_loop" + } + // ======== END Main allocation loop ======== + + info!("alloc_main: main allocation loop: end"); + + if log_enabled!(Level::Debug) { + debug!(""); + print_RA_state( + "Final", + ®_universe, + &prioQ, + &per_real_reg, + &edit_list_move, + &edit_list_other, + &vlr_env, + &frag_env, + ); + } + + // ======== BEGIN Do spill slot coalescing ======== + + debug!(""); + info!("alloc_main: create spills_n_reloads for MOVE insns"); + + // Sort `edit_list_move` by the insn with which each item is associated. + edit_list_move.sort_unstable_by(|eli1, eli2| eli1.iix.cmp(&eli2.iix)); + + // Now go through `edit_list_move` and find pairs which constitute a + // spillslot-to-the-same-spillslot move. What we have in `edit_list_move` is + // heavily constrained, as follows: + // + // * each entry should reference an InstIx which the coalescing analysis + // identified as a virtual-to-virtual copy which exists at the boundary + // between two VirtualRanges. The "boundary" aspect is important; we + // can't coalesce out moves in which the source vreg is not the "last use" + // or for which the destination vreg is not the "first def". (The same is + // true for coalescing of unspilled moves). 
+ // + // * the each entry must reference a VirtualRange which has only a single + // RangeFrag, and that frag must exist entirely "within" the referenced + // instruction. Specifically, it may only be a R->U frag (bridge) or a + // D->S frag. + // + // * For a referenced instruction, there may be at most two entries in this + // list: one that references the R->U frag and one that references the + // D->S frag. Furthermore, the two entries must carry values of the same + // RegClass; if that isn't true, the client's `is_move` function is + // defective. + // + // For any such pair identified, if both frags mention the same spill slot, + // we skip generating both the reload and the spill instruction. We also + // note that the instruction itself is to be deleted (converted to a + // zero-len nop). In a sense we have "cancelled out" a reload/spill pair. + // Entries that can't be cancelled out are handled the same way as for + // entries in `edit_list_other`, by simply copying them there. + // + // Since `edit_list_move` is sorted by insn ix, we can scan linearly over + // it, looking for adjacent pairs. We'll have to accept them in either + // order though (first R->U then D->S, or the other way round). There's no + // fixed ordering since there is no particular ordering in the way + // VirtualRanges are allocated. + + // As a result of spill slot coalescing, we'll need to delete the move + // instructions leading to a mergable spill slot move. The insn stream + // editor can't delete instructions, so instead it'll replace them with zero + // len nops obtained from the client. `iixs_to_nop_out` records the insns + // that this has to happen to. It is in increasing order of InstIx (because + // `edit_list_sorted` is), and the indices in it refer to the original + // virtual-registerised code. + let mut iixs_to_nop_out = Vec::<InstIx>::new(); + let mut ghost_moves = vec![]; + + let n_edit_list_move = edit_list_move.len(); + let mut n_edit_list_move_processed = 0; // for assertions only + let mut i_min = 0; + loop { + if i_min >= n_edit_list_move { + break; + } + // Find the bounds of the current group. + debug!("editlist entry (MOVE): min: {:?}", &edit_list_move[i_min]); + let i_min_iix = edit_list_move[i_min].iix; + let mut i_max = i_min; + while i_max + 1 < n_edit_list_move && edit_list_move[i_max + 1].iix == i_min_iix { + i_max += 1; + debug!("editlist entry (MOVE): max: {:?}", &edit_list_move[i_max]); + } + // Current group is from i_min to i_max inclusive. At most 2 entries are + // allowed per group. + assert!(i_max - i_min <= 1); + // Check for a mergeable pair. 
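// A standalone, illustrative sketch (not part of the upstream source) of the
// cancellation test that the scan below applies to each two-entry group: a
// reload-to-use bridge and a def-to-spill bridge at the same instruction cancel
// out exactly when both were assigned the same spill slot, in which case the
// move instruction itself can become a nop. `Kind`, `Entry` and `cancels_out`
// are hypothetical stand-ins for the edit-list types used here.

#[derive(Clone, Copy, PartialEq)]
enum Kind {
    RtoU, // reload -> use
    DtoS, // def -> spill
}

#[derive(Clone, Copy)]
struct Entry {
    slot: u32,
    kind: Kind,
}

// Does this pair of edit-list entries (both for one move insn) describe a
// spillslot-to-the-same-spillslot copy that can be deleted?
fn cancels_out(a: Entry, b: Entry) -> bool {
    let kinds_ok = (a.kind == Kind::RtoU && b.kind == Kind::DtoS)
        || (a.kind == Kind::DtoS && b.kind == Kind::RtoU);
    kinds_ok && a.slot == b.slot
}

fn main() {
    let r = Entry { slot: 3, kind: Kind::RtoU };
    let d_same = Entry { slot: 3, kind: Kind::DtoS };
    let d_other = Entry { slot: 7, kind: Kind::DtoS };
    assert!(cancels_out(r, d_same));   // same slot: the move becomes a nop
    assert!(!cancels_out(r, d_other)); // different slots: keep reload and spill
}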
+ if i_max - i_min == 1 { + assert!(is_vv_boundary_move[i_min_iix]); + let vlrix1 = edit_list_move[i_min].vlrix; + let vlrix2 = edit_list_move[i_max].vlrix; + assert!(vlrix1 != vlrix2); + let vlr1 = &vlr_env[vlrix1]; + let vlr2 = &vlr_env[vlrix2]; + let frags1 = &vlr1.sorted_frags; + let frags2 = &vlr2.sorted_frags; + assert!(frags1.frags.len() == 1); + assert!(frags2.frags.len() == 1); + let frag1 = &frags1.frags[0]; + let frag2 = &frags2.frags[0]; + assert!(frag1.first.iix() == i_min_iix); + assert!(frag1.last.iix() == i_min_iix); + assert!(frag2.first.iix() == i_min_iix); + assert!(frag2.last.iix() == i_min_iix); + // frag1 must be R->U and frag2 must be D->S, or vice versa + match ( + frag1.first.pt(), + frag1.last.pt(), + frag2.first.pt(), + frag2.last.pt(), + ) { + (Point::Reload, Point::Use, Point::Def, Point::Spill) + | (Point::Def, Point::Spill, Point::Reload, Point::Use) => { + let slot1 = edit_list_move[i_min].slot; + let slot2 = edit_list_move[i_max].slot; + if slot1 == slot2 { + // Yay. We've found a coalescable pair. We can just ignore the + // two entries and move on. Except we have to mark the insn + // itself for deletion. + debug!("editlist entry (MOVE): delete {:?}", i_min_iix); + iixs_to_nop_out.push(i_min_iix); + i_min = i_max + 1; + n_edit_list_move_processed += 2; + if use_checker { + let (from_reg, to_reg) = if frag1.last.pt() == Point::Use { + (vlr1.vreg.to_reg(), vlr2.vreg.to_reg()) + } else { + (vlr2.vreg.to_reg(), vlr1.vreg.to_reg()) + }; + ghost_moves.push(InstToInsertAndExtPoint::new( + InstToInsert::ChangeSpillSlotOwnership { + inst_ix: i_min_iix, + slot: slot1, + from_reg, + to_reg, + }, + InstExtPoint::new(i_min_iix, ExtPoint::Reload), + )); + } + continue; + } + } + (_, _, _, _) => { + panic!("spill slot coalescing, edit_list_move: unexpected frags"); + } + } + } + // If we get here for whatever reason, this group is uninteresting. Copy + // in to `edit_list_other` so that it gets processed in the normal way. + for i in i_min..=i_max { + edit_list_other.push(edit_list_move[i]); + n_edit_list_move_processed += 1; + } + i_min = i_max + 1; + } + assert!(n_edit_list_move_processed == n_edit_list_move); + + // ======== END Do spill slot coalescing ======== + + // ======== BEGIN Create all other spills and reloads ======== + + debug!(""); + info!("alloc_main: create spills_n_reloads for other insns"); + + // Reload and spill instructions are missing. To generate them, go through + // the "edit list", which contains info on both how to generate the + // instructions, and where to insert them. 
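// A standalone, illustrative sketch (not part of the upstream source) of what the
// edit-list expansion below produces per entry: a reload-to-use bridge yields one
// reload, a reload-to-spill bridge yields a reload plus a spill, and a def-to-spill
// bridge yields one spill. `Generated`, `Bridge` and `expand` are hypothetical
// stand-ins for the real InstToInsert machinery.

#[derive(Debug, PartialEq)]
enum Generated {
    Reload { to_reg: u32, from_slot: u32 },
    Spill { to_slot: u32, from_reg: u32 },
}

enum Bridge {
    RtoU,
    RtoS,
    DtoS,
}

fn expand(kind: Bridge, reg: u32, slot: u32) -> Vec<Generated> {
    match kind {
        Bridge::RtoU => vec![Generated::Reload { to_reg: reg, from_slot: slot }],
        Bridge::RtoS => vec![
            Generated::Reload { to_reg: reg, from_slot: slot },
            Generated::Spill { to_slot: slot, from_reg: reg },
        ],
        Bridge::DtoS => vec![Generated::Spill { to_slot: slot, from_reg: reg }],
    }
}

fn main() {
    assert_eq!(
        expand(Bridge::RtoS, 5, 2),
        vec![
            Generated::Reload { to_reg: 5, from_slot: 2 },
            Generated::Spill { to_slot: 2, from_reg: 5 },
        ]
    );
    assert_eq!(expand(Bridge::RtoU, 5, 2).len(), 1);
    assert_eq!(expand(Bridge::DtoS, 5, 2).len(), 1);
}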
+ let mut spills_n_reloads = Vec::<InstToInsertAndExtPoint>::new(); + let mut num_spills = 0; // stats only + let mut num_reloads = 0; // stats only + for eli in &edit_list_other { + debug!("editlist entry (other): {:?}", eli); + let vlr = &vlr_env[eli.vlrix]; + let vlr_sfrags = &vlr.sorted_frags; + assert!(vlr_sfrags.frags.len() == 1); + let vlr_frag = &vlr_sfrags.frags[0]; + let rreg = vlr.rreg.expect("Gen of spill/reload: reg not assigned?!"); + let vreg = vlr.vreg; + match eli.kind { + BridgeKind::RtoU => { + debug_assert!(vlr_frag.first.pt().is_reload()); + debug_assert!(vlr_frag.last.pt().is_use()); + debug_assert!(vlr_frag.first.iix() == vlr_frag.last.iix()); + let insnR = InstToInsert::Reload { + to_reg: Writable::from_reg(rreg), + from_slot: eli.slot, + for_vreg: Some(vreg), + }; + let whereToR = InstExtPoint::from_inst_point(vlr_frag.first); + spills_n_reloads.push(InstToInsertAndExtPoint::new(insnR, whereToR)); + num_reloads += 1; + } + BridgeKind::RtoS => { + debug_assert!(vlr_frag.first.pt().is_reload()); + debug_assert!(vlr_frag.last.pt().is_spill()); + debug_assert!(vlr_frag.first.iix() == vlr_frag.last.iix()); + let insnR = InstToInsert::Reload { + to_reg: Writable::from_reg(rreg), + from_slot: eli.slot, + for_vreg: Some(vreg), + }; + let whereToR = InstExtPoint::from_inst_point(vlr_frag.first); + let insnS = InstToInsert::Spill { + to_slot: eli.slot, + from_reg: rreg, + for_vreg: Some(vreg), + }; + let whereToS = InstExtPoint::from_inst_point(vlr_frag.last); + spills_n_reloads.push(InstToInsertAndExtPoint::new(insnR, whereToR)); + spills_n_reloads.push(InstToInsertAndExtPoint::new(insnS, whereToS)); + num_reloads += 1; + num_spills += 1; + } + BridgeKind::DtoS => { + debug_assert!(vlr_frag.first.pt().is_def()); + debug_assert!(vlr_frag.last.pt().is_spill()); + debug_assert!(vlr_frag.first.iix() == vlr_frag.last.iix()); + let insnS = InstToInsert::Spill { + to_slot: eli.slot, + from_reg: rreg, + for_vreg: Some(vreg), + }; + let whereToS = InstExtPoint::from_inst_point(vlr_frag.last); + spills_n_reloads.push(InstToInsertAndExtPoint::new(insnS, whereToS)); + num_spills += 1; + } + } + } + + // Append all ghost moves. + if use_checker { + spills_n_reloads.extend(ghost_moves.into_iter()); + spills_n_reloads.sort_by_key(|inst_and_point| inst_and_point.iep.clone()); + } + + // ======== END Create all other spills and reloads ======== + + // ======== BEGIN Create final instruction stream ======== + + // Gather up a vector of (RangeFrag, VirtualReg, RealReg) resulting from + // the previous phase. This fundamentally is the result of the allocation + // and tells us how the instruction stream must be edited. Note it does + // not take account of spill or reload instructions. Dealing with those + // is relatively simple and happens later. + + info!("alloc_main: create frag_map"); + + let mut frag_map = Vec::<(RangeFrag, VirtualReg, RealReg)>::new(); + // For each real register under our control .. + for i in 0..reg_universe.allocable { + let rreg = reg_universe.regs[i].0; + // .. look at all the VirtualRanges assigned to it. And for each such + // VirtualRange .. + for vlrix_assigned in per_real_reg[i].vlrixs_assigned.iter() { + let VirtualRange { + vreg, sorted_frags, .. + } = &vlr_env[*vlrix_assigned]; + // All the RangeFrags in `vlr_assigned` require `vlr_assigned.reg` + // to be mapped to the real reg `i` + // .. collect up all its constituent RangeFrags. 
+ for frag in &sorted_frags.frags { + frag_map.push((frag.clone(), *vreg, rreg)); + } + } + } + + // There is one of these for every entry in `safepoint_insns`. + let mut stackmaps = Vec::<Vec<SpillSlot>>::new(); + + if !safepoint_insns.is_empty() { + info!("alloc_main: create safepoints and stackmaps"); + for safepoint_iix in safepoint_insns { + // Create the stackmap artefacts for `safepoint_iix`. Save the stackmap (the + // reftyped spillslots); we'll have to return it to the client as part of the + // overall allocation result. The extra spill and reload instructions can simply + // be added to `spills_n_reloads` though, and `edit_inst_stream` will correctly + // merge them in. + // + // Note: this modifies `spill_slot_allocator`, since at this point we have to + // allocate spill slots to hold reftyped real regs across the safepoint insn. + // + // Because the SB (spill-before) and RA (reload-after) `ExtPoint`s are "closer" to + // the "core" of an instruction than the R (reload) and S (spill) `ExtPoint`s, any + // "normal" reload or spill ranges that are reftyped will be handled correctly. + // From `get_stackmap_artefacts_at`s point of view, such spill/reload ranges are + // just like any other real-reg live range that it will have to spill around the + // safepoint. The fact that they are for spills or reloads doesn't make any + // difference. + // + // Note also: this call can fail; failure is propagated upwards. + // + // FIXME Passing these 3 small vectors around is inefficient. Use SmallVec or + // (better) owned-by-this-function vectors instead. + let (spills_before, reloads_after, reftyped_spillslots) = get_stackmap_artefacts_at( + &mut spill_slot_allocator, + ®_universe, + reftype_class, + ®_vecs_and_bounds, + &per_real_reg, + &rlr_env, + &vlr_env, + *safepoint_iix, + )?; + stackmaps.push(reftyped_spillslots); + for spill_before in spills_before { + spills_n_reloads.push(InstToInsertAndExtPoint::new( + spill_before, + InstExtPoint::new(*safepoint_iix, ExtPoint::SpillBefore), + )); + } + for reload_after in reloads_after { + spills_n_reloads.push(InstToInsertAndExtPoint::new( + reload_after, + InstExtPoint::new(*safepoint_iix, ExtPoint::ReloadAfter), + )); + } + } + } + + info!("alloc_main: edit_inst_stream"); + + let final_insns_and_targetmap_and_new_safepoints__or_err = edit_inst_stream( + func, + &safepoint_insns, + spills_n_reloads, + &iixs_to_nop_out, + frag_map, + ®_universe, + use_checker, + &stackmaps[..], + &reftyped_vregs[..], + ); + + // ======== END Create final instruction stream ======== + + // ======== BEGIN Create the RegAllocResult ======== + + match final_insns_and_targetmap_and_new_safepoints__or_err { + Ok((ref final_insns, ..)) => { + info!( + "alloc_main: out: VLRs: {} initially, {} processed", + num_vlrs_initial, num_vlrs_processed + ); + info!( + "alloc_main: out: VLRs: {} evicted, {} spilled", + num_vlrs_evicted, num_vlrs_spilled + ); + info!( + "alloc_main: out: insns: {} total, {} spills, {} reloads, {} nopzs", + final_insns.len(), + num_spills, + num_reloads, + iixs_to_nop_out.len() + ); + info!( + "alloc_main: out: spill slots: {} used", + spill_slot_allocator.num_slots_in_use() + ); + } + Err(_) => { + info!("alloc_main: allocation failed!"); + } + } + + let (final_insns, target_map, new_to_old_insn_map, new_safepoint_insns) = + match final_insns_and_targetmap_and_new_safepoints__or_err { + Err(e) => { + info!("alloc_main: fail"); + return Err(e); + } + Ok(quad) => { + info!("alloc_main: creating RegAllocResult"); + quad + } + }; + + // Compute 
clobbered registers with one final, quick pass. + // + // FIXME: derive this information directly from the allocation data + // structures used above. + // + // NB at this point, the `san_reg_uses` that was computed in the analysis + // phase is no longer valid, because we've added and removed instructions to + // the function relative to the one that `san_reg_uses` was computed from, + // so we have to re-visit all insns with `add_raw_reg_vecs_for_insn`. + // That's inefficient, but we don't care .. this should only be a temporary + // fix. + + let mut clobbered_registers: Set<RealReg> = Set::empty(); + + // We'll dump all the reg uses in here. We don't care about the bounds, so just + // pass a dummy one in the loop. + let mut reg_vecs = RegVecs::new(/*sanitized=*/ false); + let mut dummy_bounds = RegVecBounds::new(); + for insn in &final_insns { + if func.is_included_in_clobbers(insn) { + add_raw_reg_vecs_for_insn::<F>(insn, &mut reg_vecs, &mut dummy_bounds); + } + } + for reg in reg_vecs.defs.iter().chain(reg_vecs.mods.iter()) { + assert!(reg.is_real()); + clobbered_registers.insert(reg.to_real_reg()); + } + + // And now remove from the set, all those not available to the allocator. + // But not removing the reserved regs, since we might have modified those. + clobbered_registers.filter_map(|®| { + if reg.get_index() >= reg_universe.allocable { + None + } else { + Some(reg) + } + }); + + assert!(est_freqs.len() as usize == func.blocks().len()); + let mut block_annotations = None; + if opts.request_block_annotations { + let mut anns = TypedIxVec::<BlockIx, Vec<String>>::new(); + for (estFreq, i) in est_freqs.iter().zip(0..) { + let bix = BlockIx::new(i); + let ef_str = format!("RA: bix {:?}, estFreq {}", bix, estFreq); + anns.push(vec![ef_str]); + } + block_annotations = Some(anns); + } + + assert!(stackmaps.len() == safepoint_insns.len()); + assert!(new_safepoint_insns.len() == safepoint_insns.len()); + let ra_res = RegAllocResult { + insns: final_insns, + target_map, + orig_insn_map: new_to_old_insn_map, + clobbered_registers, + num_spill_slots: spill_slot_allocator.num_slots_in_use() as u32, + block_annotations, + stackmaps, + new_safepoint_insns, + }; + + info!("alloc_main: end"); + + // ======== END Create the RegAllocResult ======== + + Ok(ra_res) +} diff --git a/third_party/rust/regalloc/src/bt_spillslot_allocator.rs b/third_party/rust/regalloc/src/bt_spillslot_allocator.rs new file mode 100644 index 0000000000..a85f2c0354 --- /dev/null +++ b/third_party/rust/regalloc/src/bt_spillslot_allocator.rs @@ -0,0 +1,522 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! Allocation of spill slots for the backtracking allocator. + +use crate::avl_tree::{AVLTree, AVL_NULL}; +use crate::data_structures::{ + cmp_range_frags, InstPoint, RangeFrag, SortedRangeFrags, SpillSlot, TypedIxVec, VirtualRange, + VirtualRangeIx, +}; +use crate::union_find::UnionFindEquivClasses; +use crate::Function; + +//============================================================================= +// A spill slot allocator. This could be implemented more simply than it is. +// The reason for the extra complexity is to support copy-coalescing at the +// spill-slot level. That is, it tries make it possible to allocate all +// members of a VirtualRange group to the same spill slot, so that moves +// between two spilled members of the same group can be turned into no-ops. 
+// +// All of the `size` metrics in this bit are in terms of "logical spill slot +// units", per the interface's description for `get_spillslot_size`. + +// *** Important: to fully understand this allocator and how it interacts with +// coalescing analysis, you need to read the big block comment at the top of +// bt_coalescing_analysis.rs. + +//============================================================================= +// Logical spill slots + +// In the trees, we keep track of which frags are reftyped, so we can later create stackmaps by +// slicing all of the trees at some `InstPoint`. Unfortunately this requires storing 65 bits of +// data in each node -- 64 bits for the RangeFrag and 1 bit for the reftype. A TODO would be to +// steal one bit from the RangeFrag. For now though, we do the simple thing. + +#[derive(Clone, PartialEq, PartialOrd)] +struct RangeFragAndRefness { + frag: RangeFrag, + is_ref: bool, +} +impl RangeFragAndRefness { + fn new(frag: RangeFrag, is_ref: bool) -> Self { + Self { frag, is_ref } + } +} + +// We keep one of these for every "logical spill slot" in use. +enum LogicalSpillSlot { + // This slot is in use and can hold values of size `size` (only). Note that + // `InUse` may only appear in `SpillSlotAllocator::slots` positions that + // have indices that are 0 % `size`. Furthermore, after such an entry in + // `SpillSlotAllocator::slots`, the next `size` - 1 entries must be + // `Unavail`. This is a hard invariant, violation of which will cause + // overlapping spill slots and potential chaos. + InUse { + size: u32, + tree: AVLTree<RangeFragAndRefness>, + }, + // This slot is unavailable, as described above. It's unavailable because + // it holds some part of the values associated with the nearest lower + // numbered entry which isn't `Unavail`, and that entry must be an `InUse` + // entry. + Unavail, +} +impl LogicalSpillSlot { + fn is_Unavail(&self) -> bool { + match self { + LogicalSpillSlot::Unavail => true, + _ => false, + } + } + fn is_InUse(&self) -> bool { + !self.is_Unavail() + } + fn get_tree(&self) -> &AVLTree<RangeFragAndRefness> { + match self { + LogicalSpillSlot::InUse { ref tree, .. } => tree, + LogicalSpillSlot::Unavail => panic!("LogicalSpillSlot::get_tree"), + } + } + fn get_mut_tree(&mut self) -> &mut AVLTree<RangeFragAndRefness> { + match self { + LogicalSpillSlot::InUse { ref mut tree, .. } => tree, + LogicalSpillSlot::Unavail => panic!("LogicalSpillSlot::get_mut_tree"), + } + } + fn get_size(&self) -> u32 { + match self { + LogicalSpillSlot::InUse { size, .. } => *size, + LogicalSpillSlot::Unavail => panic!("LogicalSpillSlot::get_size"), + } + } + // If this spill slot is occupied at `pt`, return the refness of the value (VirtualRange) + // stored in it. This is conceptually equivalent to CommitmentMap::lookup_inst_point. + fn get_refness_at_inst_point(&self, pt: InstPoint) -> Option<bool> { + match self { + LogicalSpillSlot::InUse { size: 1, tree } => { + // Search the tree to see if a reffy commitment intersects `pt`. + let mut root = tree.root; + while root != AVL_NULL { + let root_node = &tree.pool[root as usize]; + let root_item = &root_node.item; + if pt < root_item.frag.first { + // `pt` is to the left of the `root`. So there's no + // overlap with `root`. Continue by inspecting the left subtree. + root = root_node.left; + } else if root_item.frag.last < pt { + // Ditto for the right subtree. + root = root_node.right; + } else { + // `pt` overlaps the `root`, so we have what we want. 
+ return Some(root_item.is_ref); + } + } + None + } + LogicalSpillSlot::InUse { .. } | LogicalSpillSlot::Unavail => { + // Slot isn't is use, or is in use but for values of some non-ref size + None + } + } + } +} + +// HELPER FUNCTION +// Find out whether it is possible to add `frag` to `tree`. +#[inline(always)] +fn ssal_is_add_frag_possible(tree: &AVLTree<RangeFragAndRefness>, frag: &RangeFrag) -> bool { + // BEGIN check `frag` for any overlap against `tree`. + let mut root = tree.root; + while root != AVL_NULL { + let root_node = &tree.pool[root as usize]; + let root_item = &root_node.item; + if frag.last < root_item.frag.first { + // `frag` is entirely to the left of the `root`. So there's no + // overlap with root. Continue by inspecting the left subtree. + root = root_node.left; + } else if root_item.frag.last < frag.first { + // Ditto for the right subtree. + root = root_node.right; + } else { + // `frag` overlaps the `root`. Give up. + return false; + } + } + // END check `frag` for any overlap against `tree`. + // `frag` doesn't overlap. + true +} + +// HELPER FUNCTION +// Find out whether it is possible to add all of `frags` to `tree`. Returns +// true if possible, false if not. This routine relies on the fact that +// SortedFrags is non-overlapping. However, this is a bit subtle. We know +// that both `tree` and `frags` individually are non-overlapping, but there's +// no guarantee that elements of `frags` don't overlap `tree`. Hence we have +// to do a custom walk of `tree` to check for overlap; we can't just use +// `AVLTree::contains`. +fn ssal_is_add_possible(tree: &AVLTree<RangeFragAndRefness>, frags: &SortedRangeFrags) -> bool { + // Figure out whether all the frags will go in. + for frag in &frags.frags { + if !ssal_is_add_frag_possible(&tree, frag) { + return false; + } + // `frag` doesn't overlap. Move on to the next one. + } + true +} + +// HELPER FUNCTION +// Try to add all of `frags` to `tree`. Return `true` if possible, `false` if not possible. If +// `false` is returned, `tree` is unchanged (this is important). This routine relies on the +// fact that SortedFrags is non-overlapping. They are initially all marked as non-reffy. That +// may later be changed by calls to `SpillSlotAllocator::notify_spillage_of_reftyped_vlr`. +fn ssal_add_if_possible(tree: &mut AVLTree<RangeFragAndRefness>, frags: &SortedRangeFrags) -> bool { + // Check if all the frags will go in. + if !ssal_is_add_possible(tree, frags) { + return false; + } + // They will. So now insert them. + for frag in &frags.frags { + let inserted = tree.insert( + RangeFragAndRefness::new(frag.clone(), /*is_ref=*/ false), + Some(&|item1: RangeFragAndRefness, item2: RangeFragAndRefness| { + cmp_range_frags(&item1.frag, &item2.frag) + }), + ); + // This can't fail + assert!(inserted); + } + true +} + +// HELPER FUNCTION +// Let `frags` be the RangeFrags for some VirtualRange, that have already been allocated in +// `tree`. Mark each such RangeFrag as reffy. +fn ssal_mark_frags_as_reftyped(tree: &mut AVLTree<RangeFragAndRefness>, frags: &SortedRangeFrags) { + for frag in &frags.frags { + // Be paranoid. (1) `frag` must already exist in `tree`. (2) it must not be marked as + // reffy. 
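// A standalone, illustrative sketch (not part of the upstream source) of the
// overlap test that `ssal_is_add_frag_possible` performs, expressed over a sorted
// Vec of disjoint, inclusive intervals rather than an AVL tree: a candidate can be
// added only if it overlaps nothing already present. `can_add` is a hypothetical
// helper.

// `existing` is sorted by start point and its intervals are pairwise disjoint.
fn can_add(existing: &[(u32, u32)], cand: (u32, u32)) -> bool {
    // Find the first interval whose start is after cand's start, then check the
    // interval just before it and the interval at that position for overlap.
    let idx = existing.partition_point(|&(lo, _)| lo <= cand.0);
    if idx > 0 {
        let (_, prev_hi) = existing[idx - 1];
        if prev_hi >= cand.0 {
            return false; // overlaps the interval starting at or before cand
        }
    }
    if idx < existing.len() {
        let (next_lo, _) = existing[idx];
        if next_lo <= cand.1 {
            return false; // overlaps the next interval
        }
    }
    true
}

fn main() {
    let occupied = [(0, 4), (10, 12)];
    assert!(can_add(&occupied, (5, 9)));
    assert!(!can_add(&occupied, (3, 6)));
    assert!(!can_add(&occupied, (11, 20)));
}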
+ let del_this = RangeFragAndRefness::new(frag.clone(), /*is_ref=*/ false); + let add_this = RangeFragAndRefness::new(frag.clone(), /*is_ref=*/ true); + let replaced_ok = tree.find_and_replace( + del_this, + add_this, + &|item1: RangeFragAndRefness, item2: RangeFragAndRefness| { + cmp_range_frags(&item1.frag, &item2.frag) + }, + ); + // This assertion effectively encompasses both (1) and (2) above. + assert!(replaced_ok); + } +} + +//============================================================================= +// SpillSlotAllocator: public interface + +pub struct SpillSlotAllocator { + slots: Vec<LogicalSpillSlot>, +} +impl SpillSlotAllocator { + pub fn new() -> Self { + Self { slots: vec![] } + } + + pub fn num_slots_in_use(&self) -> usize { + self.slots.len() + } + + // This adds a new, empty slot, for items of the given size, and returns + // its index. This isn't clever, in the sense that it fails to use some + // slots that it could use, but at least it's simple. Note, this is a + // private method. + fn add_new_slot(&mut self, req_size: u32) -> u32 { + assert!(req_size == 1 || req_size == 2 || req_size == 4 || req_size == 8); + // Satisfy alignment constraints. These entries will unfortunately be + // wasted (never used). + while self.slots.len() % (req_size as usize) != 0 { + self.slots.push(LogicalSpillSlot::Unavail); + } + // And now the new slot. The `dflt` value is needed by `AVLTree` to initialise storage + // slots for tree nodes, but we will never actually see those values. So it doesn't + // matter what they are. + let dflt = RangeFragAndRefness::new(RangeFrag::invalid_value(), false); + let tree = AVLTree::<RangeFragAndRefness>::new(dflt); + let res = self.slots.len() as u32; + self.slots.push(LogicalSpillSlot::InUse { + size: req_size, + tree, + }); + // And now "block out subsequent slots that `req_size` implies. + // viz: req_size == 1 -> block out 0 more + // viz: req_size == 2 -> block out 1 more + // viz: req_size == 4 -> block out 3 more + // viz: req_size == 8 -> block out 7 more + for _ in 1..req_size { + self.slots.push(LogicalSpillSlot::Unavail); + } + assert!(self.slots.len() % (req_size as usize) == 0); + + res + } + + // THE MAIN FUNCTION + // Allocate spill slots for all the VirtualRanges in `vlrix`s eclass, + // including `vlrix` itself. Since we are allocating spill slots for + // complete eclasses at once, none of the members of the class should + // currently have any allocation. This routine will try to allocate all + // class members the same slot, but it can only guarantee to do so if the + // class members are mutually non-overlapping. Hence it can't guarantee that + // in general. + pub fn alloc_spill_slots<F: Function>( + &mut self, + vlr_slot_env: &mut TypedIxVec<VirtualRangeIx, Option<SpillSlot>>, + func: &F, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + vlrEquivClasses: &UnionFindEquivClasses<VirtualRangeIx>, + vlrix: VirtualRangeIx, + ) { + let is_ref = vlr_env[vlrix].is_ref; + for cand_vlrix in vlrEquivClasses.equiv_class_elems_iter(vlrix) { + // "None of the VLRs in this equivalence class have an allocated spill slot." + // This should be true because we allocate spill slots for all of the members of an + // eclass at once. + assert!(vlr_slot_env[cand_vlrix].is_none()); + + // "All of the VLRs in this eclass have the same ref-ness as this VLR." + // Why this is true is a bit subtle. 
The equivalence classes are computed by + // `do_coalescing_analysis`, fundamentally by looking at all the move instructions + // and computing the transitive closure induced by them. The ref-ness annotations + // on each VLR are computed in `do_reftypes_analysis`, and they are also computed + // as a transitive closure on the same move instructions. Hence the results should + // be identical. + // + // With all that said, note that these equivalence classes are *not* guaranteed to + // be internally non-overlapping. This is explained in the big block comment at the + // top of bt_coalescing_analysis.rs. + assert!(vlr_env[cand_vlrix].is_ref == is_ref); + } + + // Do this in two passes. It's a bit cumbersome. + // + // In the first pass, find a spill slot which can take all of the + // candidates when we try them *individually*, but don't update the tree + // yet. We will always find such a slot, because if none of the existing + // slots can do it, we can always start a new one. + // + // Now, that doesn't guarantee that all the candidates can *together* + // can be assigned to the chosen slot. That's only possible when they + // are non-overlapping. Rather than laboriously try to determine + // that, simply proceed with the second pass, the assignment pass, as + // follows. For each candidate, try to allocate it to the slot chosen + // in the first pass. If it goes in without interference, fine. If + // not, that means it overlaps with some other member of the class -- + // in which case we must find some other slot for it. It's too bad. + // + // The result is: all members will get a valid spill slot. And if they + // were all non overlapping then we are guaranteed that they all get the + // same slot. Which is as good as we can hope for. + // + // In both passes, only the highest-numbered 8 slots are checked for + // availability. This is a heuristic hack which both reduces + // allocation time and reduces the eventual resulting spilling: + // + // - It avoids lots of pointless repeated checking of low-numbered + // spill slots, that long ago became full(ish) and are unlikely to be + // able to take any new VirtualRanges + // + // - More subtly, it interacts with the question of whether or not + // each VirtualRange equivalence class is internally overlapping. + // When no overlaps are present, the spill slot allocator guarantees + // to find a slot which is free for the entire equivalence class, + // which is the ideal solution. When there are overlaps present, the + // allocator is forced to allocate at least some of the VirtualRanges + // in the class to different slots. By restricting the number of + // slots it can choose to 8 (+ extras if it needs them), we reduce the + // tendency for the VirtualRanges to be assigned a large number of + // different slots, which in turn reduces the amount of spilling in + // the end. + + // We need to know what regclass, and hence what slot size, we're looking + // for. Just look at the representative; all VirtualRanges in the eclass + // must have the same regclass. (If they don't, the client's is_move + // function has been giving us wrong information.) + let vlrix_vreg = vlr_env[vlrix].vreg; + let req_size = func.get_spillslot_size(vlrix_vreg.get_class(), vlrix_vreg); + assert!(req_size == 1 || req_size == 2 || req_size == 4 || req_size == 8); + + // Sanity check: if the VLR is reftyped, then it must need a 1-word slot + // (anything else is nonsensical.) 
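// A standalone, illustrative sketch (not part of the upstream source) of the
// two-pass scheme described above, with a VirtualRange reduced to a single
// inclusive interval and a spill slot reduced to the Vec of intervals already
// placed in it. Pass 1 picks a slot in which every class member fits
// individually; pass 2 places the members one by one, falling back to a fresh
// slot when members of the class overlap each other. All names are hypothetical.

type Ival = (u32, u32);

fn overlaps(a: Ival, b: Ival) -> bool {
    a.0 <= b.1 && b.0 <= a.1
}

fn fits(slot: &[Ival], cand: Ival) -> bool {
    slot.iter().all(|&i| !overlaps(i, cand))
}

// Returns, for each class member, the index of the slot it was assigned to.
fn assign_class(slots: &mut Vec<Vec<Ival>>, class: &[Ival]) -> Vec<usize> {
    // Pass 1: find a slot that can take every member individually, or make a
    // fresh one if no existing slot can.
    let existing = slots.iter().position(|s| class.iter().all(|&c| fits(s, c)));
    let chosen = match existing {
        Some(ix) => ix,
        None => {
            slots.push(Vec::new());
            slots.len() - 1
        }
    };
    // Pass 2: place each member, spilling over to a new slot on intra-class overlap.
    let mut out = Vec::new();
    for &c in class {
        let target = if fits(&slots[chosen], c) {
            chosen
        } else {
            slots.push(Vec::new());
            slots.len() - 1
        };
        slots[target].push(c);
        out.push(target);
    }
    out
}

fn main() {
    let mut slots: Vec<Vec<Ival>> = vec![vec![(0, 10)]];
    // A mutually non-overlapping class lands entirely in one slot.
    assert_eq!(assign_class(&mut slots, &[(2, 3), (20, 25)]), vec![1, 1]);
    // An internally overlapping class is forced onto two different slots.
    assert_eq!(assign_class(&mut slots, &[(30, 40), (35, 45)]), vec![0, 2]);
}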
+ if is_ref { + assert!(req_size == 1); + } + + // Pass 1: find a slot which can take all VirtualRanges in `vlrix`s + // eclass when tested individually. + // + // Pass 1a: search existing slots + let search_start_slotno: u32 = { + // We will only search from `search_start_slotno` upwards. See + // block comment above for significance of the value `8`. + let window = 8; + if self.slots.len() >= window { + (self.slots.len() - window) as u32 + } else { + 0 + } + }; + let mut mb_chosen_slotno: Option<u32> = None; + // BEGIN search existing slots + for cand_slot_no in search_start_slotno..self.slots.len() as u32 { + let cand_slot = &self.slots[cand_slot_no as usize]; + if !cand_slot.is_InUse() { + continue; + } + if cand_slot.get_size() != req_size { + continue; + } + let tree = &cand_slot.get_tree(); + assert!(mb_chosen_slotno.is_none()); + + // BEGIN see if `cand_slot` can hold all eclass members individually + let mut all_cands_fit_individually = true; + for cand_vlrix in vlrEquivClasses.equiv_class_elems_iter(vlrix) { + let cand_vlr = &vlr_env[cand_vlrix]; + let this_cand_fits = ssal_is_add_possible(&tree, &cand_vlr.sorted_frags); + if !this_cand_fits { + all_cands_fit_individually = false; + break; + } + } + // END see if `cand_slot` can hold all eclass members individually + if !all_cands_fit_individually { + continue; + } + + // Ok. All eclass members will fit individually in `cand_slot_no`. + mb_chosen_slotno = Some(cand_slot_no); + break; + } + // END search existing slots + + // Pass 1b. If we didn't find a usable slot, allocate a new one. + let chosen_slotno: u32 = if mb_chosen_slotno.is_none() { + self.add_new_slot(req_size) + } else { + mb_chosen_slotno.unwrap() + }; + + // Pass 2. Try to allocate each eclass member individually to the chosen + // slot. If that fails, just allocate them anywhere. + let mut _all_in_chosen = true; + 'pass2_per_equiv_class: for cand_vlrix in vlrEquivClasses.equiv_class_elems_iter(vlrix) { + let cand_vlr = &vlr_env[cand_vlrix]; + let mut tree = self.slots[chosen_slotno as usize].get_mut_tree(); + let added = ssal_add_if_possible(&mut tree, &cand_vlr.sorted_frags); + if added { + vlr_slot_env[cand_vlrix] = Some(SpillSlot::new(chosen_slotno)); + continue 'pass2_per_equiv_class; + } + _all_in_chosen = false; + // It won't fit in `chosen_slotno`, so try somewhere (anywhere) else. + for alt_slotno in search_start_slotno..self.slots.len() as u32 { + let alt_slot = &self.slots[alt_slotno as usize]; + if !alt_slot.is_InUse() { + continue; + } + if alt_slot.get_size() != req_size { + continue; + } + if alt_slotno == chosen_slotno { + // We already know this won't work. + continue; + } + let mut tree = self.slots[alt_slotno as usize].get_mut_tree(); + let added = ssal_add_if_possible(&mut tree, &cand_vlr.sorted_frags); + if added { + vlr_slot_env[cand_vlrix] = Some(SpillSlot::new(alt_slotno)); + continue 'pass2_per_equiv_class; + } + } + // If we get here, it means it won't fit in any slot we currently have. + // So allocate a new one and use that. + let new_slotno = self.add_new_slot(req_size); + let mut tree = self.slots[new_slotno as usize].get_mut_tree(); + let added = ssal_add_if_possible(&mut tree, &cand_vlr.sorted_frags); + if added { + vlr_slot_env[cand_vlrix] = Some(SpillSlot::new(new_slotno)); + continue 'pass2_per_equiv_class; + } + // We failed to allocate it to any empty slot! This can't happen. 
+ panic!("SpillSlotAllocator: alloc_spill_slots: failed?!?!"); + /*NOTREACHED*/ + } /* 'pass2_per_equiv_class */ + } + + // STACKMAP SUPPORT + // Mark the `frags` for `slot_no` as being reftyped. They are expected to already exist in + // the relevant tree, and not currently be marked as reftyped. + pub fn notify_spillage_of_reftyped_vlr( + &mut self, + slot_no: SpillSlot, + frags: &SortedRangeFrags, + ) { + let slot_ix = slot_no.get_usize(); + assert!(slot_ix < self.slots.len()); + let slot = &mut self.slots[slot_ix]; + match slot { + LogicalSpillSlot::InUse { size, tree } if *size == 1 => { + ssal_mark_frags_as_reftyped(tree, frags) + } + _ => panic!("SpillSlotAllocator::notify_spillage_of_reftyped_vlr: invalid slot"), + } + } + + // STACKMAP SUPPORT + // Allocate a size-1 (word!) spill slot for `frag` and return it. The slot is marked + // reftyped so that a later call to `get_reftyped_spillslots_at_inst_point` will return it. + pub fn alloc_reftyped_spillslot_for_frag(&mut self, frag: RangeFrag) -> SpillSlot { + for i in 0..self.slots.len() { + match &mut self.slots[i] { + LogicalSpillSlot::InUse { size: 1, tree } => { + if ssal_is_add_frag_possible(&tree, &frag) { + // We're in luck. + let inserted = tree.insert( + RangeFragAndRefness::new(frag, /*is_ref=*/ true), + Some(&|item1: RangeFragAndRefness, item2: RangeFragAndRefness| { + cmp_range_frags(&item1.frag, &item2.frag) + }), + ); + // This can't fail -- we just checked for it! + assert!(inserted); + return SpillSlot::new(i as u32); + } + // Otherwise move on. + } + LogicalSpillSlot::InUse { .. } | LogicalSpillSlot::Unavail => { + // Slot isn't is use, or is in use but for values of some non-ref size. + // Move on. + } + } + } + // We tried all slots, but without success. Add a new one and try again. This time we + // must succeed. Calling recursively is a bit stupid in the sense that we then search + // again to find the slot we just allocated, but hey. + self.add_new_slot(1 /*word*/); + self.alloc_reftyped_spillslot_for_frag(frag) // \o/ tailcall \o/ + } + + // STACKMAP SUPPORT + // Examine all the spill slots at `pt` and return those that are reftyped. This is + // fundamentally what creates a stack map. + pub fn get_reftyped_spillslots_at_inst_point(&self, pt: InstPoint) -> Vec<SpillSlot> { + let mut res = Vec::<SpillSlot>::new(); + for (i, slot) in self.slots.iter().enumerate() { + if slot.get_refness_at_inst_point(pt) == Some(true) { + res.push(SpillSlot::new(i as u32)); + } + } + res + } +} diff --git a/third_party/rust/regalloc/src/bt_vlr_priority_queue.rs b/third_party/rust/regalloc/src/bt_vlr_priority_queue.rs new file mode 100644 index 0000000000..1be9502e49 --- /dev/null +++ b/third_party/rust/regalloc/src/bt_vlr_priority_queue.rs @@ -0,0 +1,172 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! Backtracking allocator: the as-yet-unallocated VirtualReg LR prio queue. + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::data_structures::{TypedIxVec, VirtualRange, VirtualRangeIx}; + +//============================================================================= +// The as-yet-unallocated VirtualReg LR prio queue, `VirtualRangePrioQ`. +// +// Relevant methods are parameterised by a VirtualRange env. + +// What we seek to do with `VirtualRangePrioQ` is to implement a priority +// queue of as-yet unallocated virtual live ranges. 
For each iteration of the +// main allocation loop, we pull out the highest-priority unallocated +// VirtualRange, and either allocate it (somehow), or spill it. +// +// The Rust standard type BinaryHeap gives us an efficient way to implement +// the priority queue. However, it requires that the queue items supply the +// necessary cost-comparisons by implementing `Ord` on that type. Hence we +// have to wrap up the items we fundamentally want in the queue, viz, +// `VirtualRangeIx`, into a new type `VirtualRangeIxAndSize` that also carries +// the relevant cost field, `size`. Then we implement `Ord` for +// `VirtualRangeIxAndSize` so as to only look at the `size` fields. +// +// There is a small twist, however. Most virtual ranges are small and so will +// have a small `size` field (less than 20, let's say). For such cases, +// `BinaryHeap` will presumably choose between contenders with the same `size` +// field in some arbitrary way. This has two disadvantages: +// +// * it makes the exact allocation order, and hence allocation results, +// dependent on `BinaryHeap`'s arbitrary-choice scheme. This seems +// undesirable, and given recent shenanigans resulting from `HashMap` being +// nondeterministic even in a single-threaded scenario, I don't entirely +// trust `BinaryHeap` even to be deterministic. +// +// * experimentation with the "qsort" test case shows that breaking ties by +// selecting the entry that has been in the queue the longest, rather than +// choosing arbitrarily, gives slightly better allocations (slightly less +// spilling) in spill-heavy situations (where there are few registers). +// When there is not much spilling, it makes no difference. +// +// For these reasons, `VirtualRangeIxAndSize` also contains a `tiebreaker` +// field. The `VirtualRangePrioQ` logic gives a different value of this for +// each `VirtualRangeIxAndSize` it creates. These numbers start off at 2^32-1 +// and decrease towards zero. They are used as a secondary comparison key in +// the case where the `size` fields are equal. The effect is that (1) +// tiebreaking is made completely deterministic, and (2) it breaks ties in +// favour of the oldest entry (since that will have the highest `tiebreaker` +// field). +// +// The tiebreaker field will wrap around when it hits zero, but that can only +// happen after processing 2^32-1 virtual live ranges. In such cases I would +// expect that the allocator would have run out of memory long before, so it's +// academic in practice. Even if it does wrap around there is no danger to +// the correctness of the allocations. + +// Wrap up a VirtualRangeIx and its size, so that we can implement Ord for it +// on the basis of the `size` and `tiebreaker` fields. +// +// NB! Do not derive {,Partial}{Eq,Ord} for this. It has its own custom +// implementations. 
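// A standalone, illustrative sketch (not part of the upstream source) of the
// size-plus-tiebreaker ordering described above: the heap pops the largest `size`
// first, and among equal sizes prefers the entry pushed earliest, because the
// tiebreaker counter starts at 2^32-1 and decreases on every push. `Item` is a
// hypothetical stand-in for `VirtualRangeIxAndSize`.

use std::cmp::Ordering;
use std::collections::BinaryHeap;

struct Item {
    name: &'static str,
    size: u16,
    tiebreaker: u32,
}

impl PartialEq for Item {
    fn eq(&self, other: &Self) -> bool {
        self.size == other.size && self.tiebreaker == other.tiebreaker
    }
}
impl Eq for Item {}

impl PartialOrd for Item {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Item {
    fn cmp(&self, other: &Self) -> Ordering {
        // Primary key: size. Secondary key: tiebreaker; older entries carry a
        // larger tiebreaker and therefore win ties.
        self.size
            .cmp(&other.size)
            .then(self.tiebreaker.cmp(&other.tiebreaker))
    }
}

fn main() {
    let mut ctr = u32::MAX;
    let mut heap = BinaryHeap::new();
    for (name, size) in [("a", 3u16), ("b", 7), ("c", 3)] {
        heap.push(Item { name, size, tiebreaker: ctr });
        ctr -= 1;
    }
    let popped: Vec<&str> = std::iter::from_fn(|| heap.pop().map(|i| i.name)).collect();
    // Largest size first; the tie between "a" and "c" goes to "a", pushed first.
    assert_eq!(popped, vec!["b", "a", "c"]);
}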
+struct VirtualRangeIxAndSize { + vlrix: VirtualRangeIx, + size: u16, + tiebreaker: u32, +} +impl VirtualRangeIxAndSize { + fn new(vlrix: VirtualRangeIx, size: u16, tiebreaker: u32) -> Self { + assert!(size > 0); + Self { + vlrix, + size, + tiebreaker, + } + } +} +impl PartialEq for VirtualRangeIxAndSize { + fn eq(&self, other: &Self) -> bool { + self.size == other.size && self.tiebreaker == other.tiebreaker + } +} +impl Eq for VirtualRangeIxAndSize {} +impl PartialOrd for VirtualRangeIxAndSize { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} +impl Ord for VirtualRangeIxAndSize { + fn cmp(&self, other: &Self) -> Ordering { + match self.size.cmp(&other.size) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self.tiebreaker.cmp(&other.tiebreaker), + } + } +} + +//============================================================================= +// VirtualRangePrioQ: public interface + +pub struct VirtualRangePrioQ { + // The set of as-yet unallocated VirtualRangeIxs. These are indexes into a + // VirtualRange env that is not stored here. The VirtualRangeIxs are tagged + // with the VirtualRange size and a tiebreaker field. + heap: BinaryHeap<VirtualRangeIxAndSize>, + tiebreaker_ctr: u32, +} +impl VirtualRangePrioQ { + pub fn new(vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>) -> Self { + let mut res = Self { + heap: BinaryHeap::new(), + tiebreaker_ctr: 0xFFFF_FFFFu32, + }; + for vlrix in VirtualRangeIx::new(0).dotdot(VirtualRangeIx::new(vlr_env.len())) { + let to_add = VirtualRangeIxAndSize::new(vlrix, vlr_env[vlrix].size, res.tiebreaker_ctr); + res.heap.push(to_add); + res.tiebreaker_ctr -= 1; + } + res + } + + #[inline(never)] + pub fn is_empty(&self) -> bool { + self.heap.is_empty() + } + + #[inline(never)] + pub fn len(&self) -> usize { + self.heap.len() + } + + // Add a VirtualRange index. + #[inline(never)] + pub fn add_VirtualRange( + &mut self, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + vlrix: VirtualRangeIx, + ) { + let to_add = VirtualRangeIxAndSize::new(vlrix, vlr_env[vlrix].size, self.tiebreaker_ctr); + self.tiebreaker_ctr -= 1; + self.heap.push(to_add); + } + + // Look in `unallocated` to locate the entry referencing the VirtualRange + // with the largest `size` value. Remove the ref from `unallocated` and + // return the VLRIx for said entry. + #[inline(never)] + pub fn get_longest_VirtualRange(&mut self) -> Option<VirtualRangeIx> { + match self.heap.pop() { + None => None, + Some(VirtualRangeIxAndSize { vlrix, .. }) => Some(vlrix), + } + } + + #[inline(never)] + pub fn show_with_envs( + &self, + vlr_env: &TypedIxVec<VirtualRangeIx, VirtualRange>, + ) -> Vec<String> { + let mut resV = vec![]; + for VirtualRangeIxAndSize { vlrix, .. } in self.heap.iter() { + let mut res = "TODO ".to_string(); + res += &format!("{:?} = {:?}", vlrix, &vlr_env[*vlrix]); + resV.push(res); + } + resV + } +} diff --git a/third_party/rust/regalloc/src/checker.rs b/third_party/rust/regalloc/src/checker.rs new file mode 100644 index 0000000000..eef2d1e6e2 --- /dev/null +++ b/third_party/rust/regalloc/src/checker.rs @@ -0,0 +1,717 @@ +//! Checker: verifies that spills/reloads/moves retain equivalent dataflow to original, vreg-based +//! code. +//! +//! The basic idea is that we track symbolic values as they flow through spills and reloads. +//! The symbolic values represent particular virtual or real registers in the original +//! function body presented to the register allocator. 
Any instruction in the original +//! function body (i.e., not added by the allocator) conceptually generates a symbolic +//! value "Rn" or "Vn" when storing to (or modifying) a real or virtual register. This +//! includes moves (from e.g. phi-node lowering): they also generate a new value. +//! +//! In other words, the dataflow analysis state at each program point is: +//! +//! - map `R` of: real reg -> lattice value (top > Rn/Vn symbols (unordered) > bottom) +//! - map `S` of: spill slot -> lattice value (same) +//! +//! And the transfer functions for each statement type are: +//! +//! - spill (inserted by RA): [ store spill_i, R_j ] +//! +//! S[spill_i] := R[R_j] +//! +//! - reload (inserted by RA): [ load R_i, spill_j ] +//! +//! R[R_i] := S[spill_j] +//! +//! - move (inserted by RA): [ R_i := R_j ] +//! +//! R[R_i] := R[R_j] +//! +//! - statement in pre-regalloc function [ V_i := op V_j, V_k, ... ] +//! with allocated form [ R_i := op R_j, R_k, ... ] +//! +//! R[R_i] := `V_i` +//! +//! In other words, a statement, even after allocation, generates a symbol +//! that corresponds to its original virtual-register def. +//! +//! (N.B.: moves in pre-regalloc function fall into this last case -- they +//! are "just another operation" and generate a new symbol) +//! +//! (Slight extension for multi-def ops, and ops with "modify" args: the op +//! generates symbol `V_i` into reg `R_i` allocated for that particular def/mod). +//! +//! The initial state is: for each real reg R_livein where R_livein is in the livein set, we set +//! R[R_livein] to `R_livein`. +//! +//! At control-flow join points, the symbols meet using a very simple lattice meet-function: +//! two different symbols in the same real-reg or spill-slot meet to "conflicted"; otherwise, +//! the symbol meets with itself to produce itself (reflexivity). +//! +//! To check correctness, we first find the dataflow fixpoint with the above lattice and +//! transfer/meet functions. Then, at each op, we examine the dataflow solution at the preceding +//! program point, and check that the real reg for each op arg (input/use) contains the symbol +//! corresponding to the original (usually virtual) register specified for this arg. + +#![allow(dead_code)] + +use crate::analysis_data_flow::get_san_reg_sets_for_insn; +use crate::data_structures::{ + BlockIx, InstIx, Map, RealReg, RealRegUniverse, Reg, RegSets, SpillSlot, VirtualReg, Writable, +}; +use crate::inst_stream::{ExtPoint, InstExtPoint, InstToInsertAndExtPoint}; +use crate::{Function, RegUsageMapper}; + +use rustc_hash::FxHashSet; +use std::collections::VecDeque; +use std::default::Default; +use std::hash::Hash; +use std::result::Result; + +use log::debug; + +/// A set of errors detected by the regalloc checker. +#[derive(Clone, Debug)] +pub struct CheckerErrors { + errors: Vec<CheckerError>, +} + +/// A single error detected by the regalloc checker. 
+#[derive(Clone, Debug)] +pub enum CheckerError { + MissingAllocationForReg { + reg: VirtualReg, + inst: InstIx, + }, + UnknownValueInReg { + real_reg: RealReg, + inst: InstIx, + }, + IncorrectValueInReg { + actual: Reg, + expected: Reg, + real_reg: RealReg, + inst: InstIx, + }, + UnknownValueInSlot { + slot: SpillSlot, + expected: Reg, + inst: InstIx, + }, + IncorrectValueInSlot { + slot: SpillSlot, + expected: Reg, + actual: Reg, + inst: InstIx, + }, + StackMapSpecifiesNonRefSlot { + inst: InstIx, + slot: SpillSlot, + }, + StackMapSpecifiesUndefinedSlot { + inst: InstIx, + slot: SpillSlot, + }, +} + +/// Abstract state for a storage slot (real register or spill slot). +/// +/// Forms a lattice with \top (`Unknown`), \bot (`Conflicted`), and a number of mutually unordered +/// value-points in between, one per real or virtual register. Any two different registers +/// meet to \bot. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CheckerValue { + /// "top" value: this storage slot has no known value. + Unknown, + /// "bottom" value: this storage slot has a conflicted value. + Conflicted, + /// Reg: this storage slot has a value that originated as a def into + /// the given register, either implicitly (RealRegs at beginning of + /// function) or explicitly (as an instruction's def). + /// + /// The boolean flag indicates whether the value is reference-typed. + Reg(Reg, bool), +} + +impl Default for CheckerValue { + fn default() -> CheckerValue { + CheckerValue::Unknown + } +} + +impl CheckerValue { + /// Meet function of the abstract-interpretation value lattice. + fn meet(&self, other: &CheckerValue) -> CheckerValue { + match (self, other) { + (&CheckerValue::Unknown, _) => *other, + (_, &CheckerValue::Unknown) => *self, + (&CheckerValue::Conflicted, _) => *self, + (_, &CheckerValue::Conflicted) => *other, + (&CheckerValue::Reg(r1, ref1), &CheckerValue::Reg(r2, ref2)) if r1 == r2 => { + CheckerValue::Reg(r1, ref1 || ref2) + } + _ => CheckerValue::Conflicted, + } + } +} + +/// State that steps through program points as we scan over the instruction stream. +#[derive(Clone, Debug, PartialEq, Eq)] +struct CheckerState { + /// For each RealReg, abstract state. + reg_values: Map<RealReg, CheckerValue>, + /// For each spill slot, abstract state. + spill_slots: Map<SpillSlot, CheckerValue>, +} + +impl Default for CheckerState { + fn default() -> CheckerState { + CheckerState { + reg_values: Map::default(), + spill_slots: Map::default(), + } + } +} + +fn merge_map<K: Copy + Clone + PartialEq + Eq + Hash>( + into: &mut Map<K, CheckerValue>, + from: &Map<K, CheckerValue>, +) { + for (k, v) in from { + let into_v = into.entry(*k).or_insert(Default::default()); + let merged = into_v.meet(v); + *into_v = merged; + } +} + +impl CheckerState { + /// Create a new checker state. + fn new() -> CheckerState { + Default::default() + } + + /// Produce an entry checker state with all real regs holding themselves, symbolically. + fn entry_state(ru: &RealRegUniverse) -> CheckerState { + let mut state = CheckerState::new(); + for &(rreg, _) in &ru.regs { + state + .reg_values + .insert(rreg, CheckerValue::Reg(rreg.to_reg(), false)); + } + state + } + + /// Merge this checker state with another at a CFG join-point. + fn meet_with(&mut self, other: &CheckerState) { + merge_map(&mut self.reg_values, &other.reg_values); + merge_map(&mut self.spill_slots, &other.spill_slots); + } + + /// Check an instruction against this state. 
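// A standalone, illustrative sketch (not part of the upstream source) of the meet
// function defined below, with the ref-ness flag omitted for brevity: Unknown is
// the identity, Conflicted absorbs everything, and two symbolic values agree only
// when they name the same original register. `Val` and `meet` are simplified
// stand-ins for `CheckerValue` and its `meet` method.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Val {
    Unknown,
    Conflicted,
    Reg(u32), // stands in for "value defined into original register N"
}

fn meet(a: Val, b: Val) -> Val {
    match (a, b) {
        (Val::Unknown, x) | (x, Val::Unknown) => x,
        (Val::Conflicted, _) | (_, Val::Conflicted) => Val::Conflicted,
        (Val::Reg(r1), Val::Reg(r2)) if r1 == r2 => Val::Reg(r1),
        _ => Val::Conflicted,
    }
}

fn main() {
    // At a join point, a slot that held V3 along both predecessors stays V3 ...
    assert_eq!(meet(Val::Reg(3), Val::Reg(3)), Val::Reg(3));
    // ... but disagreeing predecessors poison it.
    assert_eq!(meet(Val::Reg(3), Val::Reg(4)), Val::Conflicted);
    assert_eq!(meet(Val::Unknown, Val::Reg(3)), Val::Reg(3));
}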
+ fn check(&self, inst: &Inst) -> Result<(), CheckerError> { + match inst { + &Inst::Op { + inst_ix, + ref uses_orig, + ref uses, + .. + } => { + // For each use, check the mapped RealReg's symbolic value; it must + // be the original reg. + assert!(uses_orig.len() == uses.len()); + for (orig, mapped) in uses_orig.iter().cloned().zip(uses.iter().cloned()) { + let val = self + .reg_values + .get(&mapped) + .cloned() + .unwrap_or(Default::default()); + debug!( + "checker: inst {:?}: orig {:?}, mapped {:?}, checker state {:?}", + inst, orig, mapped, val + ); + match val { + CheckerValue::Unknown | CheckerValue::Conflicted => { + return Err(CheckerError::UnknownValueInReg { + real_reg: mapped, + inst: inst_ix, + }); + } + CheckerValue::Reg(r, _) if r != orig => { + return Err(CheckerError::IncorrectValueInReg { + actual: r, + expected: orig, + real_reg: mapped, + inst: inst_ix, + }); + } + _ => {} + } + } + } + &Inst::ChangeSpillSlotOwnership { + inst_ix, + slot, + from_reg, + .. + } => { + let val = self + .spill_slots + .get(&slot) + .cloned() + .unwrap_or(Default::default()); + debug!("checker: inst {:?}: slot value {:?}", inst, val); + match val { + CheckerValue::Unknown | CheckerValue::Conflicted => { + return Err(CheckerError::UnknownValueInSlot { + slot, + expected: from_reg, + inst: inst_ix, + }); + } + CheckerValue::Reg(r, _) if r != from_reg => { + return Err(CheckerError::IncorrectValueInSlot { + slot, + expected: from_reg, + actual: r, + inst: inst_ix, + }); + } + _ => {} + } + } + &Inst::Safepoint { inst_ix, ref slots } => { + self.check_stackmap(inst_ix, slots)?; + } + _ => {} + } + Ok(()) + } + + fn check_stackmap(&self, inst: InstIx, slots: &Vec<SpillSlot>) -> Result<(), CheckerError> { + // N.B.: it's OK for the stackmap to omit a slot that has a ref value in + // it; it might be dead. We simply update such a slot's value to + // 'undefined' in the transfer function. + for &slot in slots { + match self.spill_slots.get(&slot) { + Some(CheckerValue::Reg(_, false)) => { + return Err(CheckerError::StackMapSpecifiesNonRefSlot { inst, slot }); + } + Some(CheckerValue::Reg(_, true)) => { + // OK. + } + _ => { + return Err(CheckerError::StackMapSpecifiesUndefinedSlot { inst, slot }); + } + } + } + Ok(()) + } + + fn update_stackmap(&mut self, slots: &Vec<SpillSlot>) { + for (&slot, val) in &mut self.spill_slots { + if let &mut CheckerValue::Reg(_, true) = val { + let in_stackmap = slots.binary_search(&slot).is_ok(); + if !in_stackmap { + *val = CheckerValue::Unknown; + } + } + } + } + + /// Update according to instruction. + pub(crate) fn update(&mut self, inst: &Inst) { + match inst { + &Inst::Op { + ref defs_orig, + ref defs, + ref defs_reftyped, + .. + } => { + // For each def, set the symbolic value of the mapped RealReg to a + // symbol corresponding to the original def. + assert!(defs_orig.len() == defs.len()); + for i in 0..defs.len() { + let orig = defs_orig[i]; + let mapped = defs[i]; + let reftyped = defs_reftyped[i]; + self.reg_values + .insert(mapped, CheckerValue::Reg(orig, reftyped)); + } + } + &Inst::Move { into, from } => { + let val = self + .reg_values + .get(&from) + .cloned() + .unwrap_or(Default::default()); + self.reg_values.insert(into.to_reg(), val); + } + &Inst::ChangeSpillSlotOwnership { slot, to_reg, .. 
} => { + let reftyped = if let Some(val) = self.spill_slots.get(&slot) { + match val { + &CheckerValue::Reg(_, reftyped) => reftyped, + _ => false, + } + } else { + false + }; + self.spill_slots + .insert(slot, CheckerValue::Reg(to_reg, reftyped)); + } + &Inst::Spill { into, from } => { + let val = self + .reg_values + .get(&from) + .cloned() + .unwrap_or(Default::default()); + self.spill_slots.insert(into, val); + } + &Inst::Reload { into, from } => { + let val = self + .spill_slots + .get(&from) + .cloned() + .unwrap_or(Default::default()); + self.reg_values.insert(into.to_reg(), val); + } + &Inst::Safepoint { ref slots, .. } => { + self.update_stackmap(slots); + } + } + } +} + +/// An instruction representation in the checker's BB summary. +#[derive(Clone, Debug)] +pub(crate) enum Inst { + /// A register spill into memory. + Spill { into: SpillSlot, from: RealReg }, + /// A register reload from memory. + Reload { + into: Writable<RealReg>, + from: SpillSlot, + }, + /// A regalloc-inserted move (not a move in the original program!) + Move { + into: Writable<RealReg>, + from: RealReg, + }, + /// A spillslot ghost move (between vregs) resulting from an user-program + /// move whose source and destination regs are both vregs that are currently + /// spilled. + ChangeSpillSlotOwnership { + inst_ix: InstIx, + slot: SpillSlot, + from_reg: Reg, + to_reg: Reg, + }, + /// A regular instruction with fixed use and def slots. Contains both + /// the original registers (as given to the regalloc) and the allocated ones. + Op { + inst_ix: InstIx, + defs_orig: Vec<Reg>, + uses_orig: Vec<Reg>, + defs: Vec<RealReg>, + uses: Vec<RealReg>, + defs_reftyped: Vec<bool>, + }, + /// A safepoint, with a list of expected slots. + Safepoint { + inst_ix: InstIx, + slots: Vec<SpillSlot>, + }, +} + +#[derive(Debug)] +pub(crate) struct Checker { + bb_entry: BlockIx, + bb_in: Map<BlockIx, CheckerState>, + bb_succs: Map<BlockIx, Vec<BlockIx>>, + bb_insts: Map<BlockIx, Vec<Inst>>, + reftyped_vregs: FxHashSet<VirtualReg>, +} + +fn map_regs<F: Fn(VirtualReg) -> Option<RealReg>>( + inst: InstIx, + regs: &[Reg], + f: &F, +) -> Result<Vec<RealReg>, CheckerErrors> { + let mut errors = Vec::new(); + let real_regs = regs + .iter() + .map(|r| { + if r.is_virtual() { + f(r.to_virtual_reg()).unwrap_or_else(|| { + errors.push(CheckerError::MissingAllocationForReg { + reg: r.to_virtual_reg(), + inst, + }); + // Provide a dummy value for the register, it'll never be read. + Reg::new_real(r.get_class(), 0x0, 0).to_real_reg() + }) + } else { + r.to_real_reg() + } + }) + .collect(); + if errors.is_empty() { + Ok(real_regs) + } else { + Err(CheckerErrors { errors }) + } +} + +impl Checker { + /// Create a new checker for the given function, initializing CFG info immediately. + /// The client should call the `add_*()` methods to add abstract instructions to each + /// BB before invoking `run()` to check for errors. 
+ pub(crate) fn new<F: Function>( + f: &F, + ru: &RealRegUniverse, + reftyped_vregs: &[VirtualReg], + ) -> Checker { + let mut bb_in = Map::default(); + let mut bb_succs = Map::default(); + let mut bb_insts = Map::default(); + + for block in f.blocks() { + bb_in.insert(block, Default::default()); + bb_succs.insert(block, f.block_succs(block).to_vec()); + bb_insts.insert(block, vec![]); + } + + bb_in.insert(f.entry_block(), CheckerState::entry_state(ru)); + + let reftyped_vregs = reftyped_vregs.iter().cloned().collect::<FxHashSet<_>>(); + Checker { + bb_entry: f.entry_block(), + bb_in, + bb_succs, + bb_insts, + reftyped_vregs, + } + } + + /// Add an abstract instruction (spill, reload, or move) to a BB. + /// + /// Can also accept an `Inst::Op`, but `add_op()` is better-suited + /// for this. + pub(crate) fn add_inst(&mut self, block: BlockIx, inst: Inst) { + let insts = self.bb_insts.get_mut(&block).unwrap(); + insts.push(inst); + } + + /// Add a "normal" instruction that uses, modifies, and/or defines certain + /// registers. The `SanitizedInstRegUses` must be the pre-allocation state; + /// the `mapper` must be provided to give the virtual -> real mappings at + /// the program points immediately before and after this instruction. + pub(crate) fn add_op<RUM: RegUsageMapper>( + &mut self, + block: BlockIx, + inst_ix: InstIx, + regsets: &RegSets, + mapper: &RUM, + ) -> Result<(), CheckerErrors> { + debug!( + "add_op: block {} inst {} regsets {:?}", + block.get(), + inst_ix.get(), + regsets + ); + assert!(regsets.is_sanitized()); + let mut uses_set = regsets.uses.clone(); + let mut defs_set = regsets.defs.clone(); + uses_set.union(®sets.mods); + defs_set.union(®sets.mods); + if uses_set.is_empty() && defs_set.is_empty() { + return Ok(()); + } + + let uses_orig = uses_set.to_vec(); + let defs_orig = defs_set.to_vec(); + let uses = map_regs(inst_ix, &uses_orig[..], &|vreg| mapper.get_use(vreg))?; + let defs = map_regs(inst_ix, &defs_orig[..], &|vreg| mapper.get_def(vreg))?; + let defs_reftyped = defs_orig + .iter() + .map(|reg| reg.is_virtual() && self.reftyped_vregs.contains(®.to_virtual_reg())) + .collect(); + let insts = self.bb_insts.get_mut(&block).unwrap(); + let op = Inst::Op { + inst_ix, + uses_orig, + defs_orig, + uses, + defs, + defs_reftyped, + }; + debug!("add_op: adding {:?}", op); + insts.push(op); + Ok(()) + } + + /// Perform the dataflow analysis to compute checker state at each BB entry. + fn analyze(&mut self) { + let mut queue = VecDeque::new(); + queue.push_back(self.bb_entry); + + while !queue.is_empty() { + let block = queue.pop_front().unwrap(); + let mut state = self.bb_in.get(&block).cloned().unwrap(); + debug!("analyze: block {} has state {:?}", block.get(), state); + for inst in self.bb_insts.get(&block).unwrap() { + state.update(inst); + debug!("analyze: inst {:?} -> state {:?}", inst, state); + } + + for succ in self.bb_succs.get(&block).unwrap() { + let cur_succ_in = self.bb_in.get(succ).unwrap(); + let mut new_state = state.clone(); + new_state.meet_with(cur_succ_in); + let changed = &new_state != cur_succ_in; + if changed { + debug!( + "analyze: block {} state changed from {:?} to {:?}; pushing onto queue", + succ.get(), + cur_succ_in, + new_state + ); + self.bb_in.insert(*succ, new_state); + queue.push_back(*succ); + } + } + } + } + + /// Using BB-start state computed by `analyze()`, step the checker state + /// through each BB and check each instruction's register allocations + /// for errors. 
+ fn find_errors(&self) -> Result<(), CheckerErrors> { + let mut errors = vec![]; + for (block, input) in &self.bb_in { + let mut state = input.clone(); + for inst in self.bb_insts.get(block).unwrap() { + if let Err(e) = state.check(inst) { + debug!("Checker error: {:?}", e); + errors.push(e); + } + state.update(inst); + } + } + + if errors.is_empty() { + Ok(()) + } else { + Err(CheckerErrors { errors }) + } + } + + /// Find any errors, returning `Err(CheckerErrors)` with all errors found + /// or `Ok(())` otherwise. + pub(crate) fn run(mut self) -> Result<(), CheckerErrors> { + debug!("Checker: full body is:\n{:?}", self.bb_insts); + self.analyze(); + self.find_errors() + } +} + +/// A wrapper around `Checker` that assists its use with `InstToInsertAndExtPoint`s and +/// `Function` together. +pub(crate) struct CheckerContext { + checker: Checker, + checker_inst_map: Map<InstExtPoint, Vec<Inst>>, +} + +impl CheckerContext { + /// Create a new checker context for the given function, which is about to be edited with the + /// given instruction insertions. + pub(crate) fn new<F: Function>( + f: &F, + ru: &RealRegUniverse, + insts_to_add: &Vec<InstToInsertAndExtPoint>, + safepoint_insns: &[InstIx], + stackmaps: &[Vec<SpillSlot>], + reftyped_vregs: &[VirtualReg], + ) -> CheckerContext { + assert!(safepoint_insns.len() == stackmaps.len()); + let mut checker_inst_map: Map<InstExtPoint, Vec<Inst>> = Map::default(); + for &InstToInsertAndExtPoint { ref inst, ref iep } in insts_to_add { + let checker_insts = checker_inst_map + .entry(iep.clone()) + .or_insert_with(|| vec![]); + checker_insts.push(inst.to_checker_inst()); + } + for (iix, slots) in safepoint_insns.iter().zip(stackmaps.iter()) { + let iep = InstExtPoint::new(*iix, ExtPoint::Use); + let mut slots = slots.clone(); + slots.sort(); + checker_inst_map + .entry(iep) + .or_insert_with(|| vec![]) + .push(Inst::Safepoint { + inst_ix: *iix, + slots, + }); + } + let checker = Checker::new(f, ru, reftyped_vregs); + CheckerContext { + checker, + checker_inst_map, + } + } + + /// Update the checker with the given instruction and the given pre- and post-maps. Instructions + /// within a block must be visited in program order. + pub(crate) fn handle_insn<F: Function, RUM: RegUsageMapper>( + &mut self, + ru: &RealRegUniverse, + func: &F, + bix: BlockIx, + iix: InstIx, + mapper: &RUM, + ) -> Result<(), CheckerErrors> { + let empty = vec![]; + let mut skip_inst = false; + + debug!("CheckerContext::handle_insn: inst {:?}", iix,); + + for &pre_point in &[ExtPoint::Reload, ExtPoint::SpillBefore, ExtPoint::Use] { + let pre_point = InstExtPoint::new(iix, pre_point); + for checker_inst in self.checker_inst_map.get(&pre_point).unwrap_or(&empty) { + debug!("at inst {:?}: pre checker_inst: {:?}", iix, checker_inst); + self.checker.add_inst(bix, checker_inst.clone()); + if let Inst::ChangeSpillSlotOwnership { .. } = checker_inst { + // Unlike spills/reloads/moves inserted by the regalloc, ChangeSpillSlotOwnership + // pseudo-insts replace the instruction itself. 
+ skip_inst = true; + } + } + } + + if !skip_inst { + let regsets = get_san_reg_sets_for_insn::<F>(func.get_insn(iix), ru) + .expect("only existing real registers at this point"); + assert!(regsets.is_sanitized()); + + debug!( + "at inst {:?}: regsets {:?} mapper {:?}", + iix, regsets, mapper + ); + self.checker.add_op(bix, iix, ®sets, mapper)?; + } + + for &post_point in &[ExtPoint::ReloadAfter, ExtPoint::Spill] { + let post_point = InstExtPoint::new(iix, post_point); + for checker_inst in self.checker_inst_map.get(&post_point).unwrap_or(&empty) { + debug!("at inst {:?}: post checker_inst: {:?}", iix, checker_inst); + self.checker.add_inst(bix, checker_inst.clone()); + } + } + + Ok(()) + } + + /// Run the underlying checker, once all instructions have been added. + pub(crate) fn run(self) -> Result<(), CheckerErrors> { + self.checker.run() + } +} diff --git a/third_party/rust/regalloc/src/data_structures.rs b/third_party/rust/regalloc/src/data_structures.rs new file mode 100644 index 0000000000..e90672e95c --- /dev/null +++ b/third_party/rust/regalloc/src/data_structures.rs @@ -0,0 +1,2505 @@ +//! Data structures for the whole crate. + +use rustc_hash::FxHashMap; +use rustc_hash::FxHashSet; +use smallvec::SmallVec; + +use std::cmp::Ordering; +use std::collections::VecDeque; +use std::fmt; +use std::hash::Hash; +use std::marker::PhantomData; +use std::ops::Index; +use std::ops::IndexMut; +use std::slice::{Iter, IterMut}; + +use crate::{Function, RegUsageMapper}; + +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +//============================================================================= +// Queues + +pub type Queue<T> = VecDeque<T>; + +//============================================================================= +// Maps + +// NOTE: plain HashMap is nondeterministic, even in a single-threaded +// scenario, which can make debugging code that uses it really confusing. So +// we use FxHashMap instead, as it *is* deterministic, and, allegedly, faster +// too. +pub type Map<K, V> = FxHashMap<K, V>; + +//============================================================================= +// Sets of things + +// Same comment as above for FxHashMap. 
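+// A small usage sketch of `Set` (illustrative only, not part of the original
+// source); it relies only on the methods defined below:
+//
+//     let mut s = Set::<u32>::two(3, 1);
+//     s.insert(7);
+//     assert!(s.contains(3) && s.card() == 3);
+//     // `to_vec` sorts its output, so the result is deterministic:
+//     assert_eq!(s.to_vec(), vec![1, 3, 7]);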
+#[derive(Clone)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct Set<T: Eq + Hash> { + set: FxHashSet<T>, +} + +impl<T: Eq + Ord + Hash + Copy + fmt::Debug> Set<T> { + #[inline(never)] + pub fn empty() -> Self { + Self { + set: FxHashSet::<T>::default(), + } + } + + #[inline(never)] + pub fn unit(item: T) -> Self { + let mut s = Self::empty(); + s.insert(item); + s + } + + #[inline(never)] + pub fn two(item1: T, item2: T) -> Self { + let mut s = Self::empty(); + s.insert(item1); + s.insert(item2); + s + } + + #[inline(never)] + pub fn card(&self) -> usize { + self.set.len() + } + + #[inline(never)] + pub fn insert(&mut self, item: T) { + self.set.insert(item); + } + + #[inline(never)] + pub fn delete(&mut self, item: T) { + self.set.remove(&item); + } + + #[inline(never)] + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } + + #[inline(never)] + pub fn contains(&self, item: T) -> bool { + self.set.contains(&item) + } + + #[inline(never)] + pub fn intersect(&mut self, other: &Self) { + let mut res = FxHashSet::<T>::default(); + for item in self.set.iter() { + if other.set.contains(item) { + res.insert(*item); + } + } + self.set = res; + } + + #[inline(never)] + pub fn union(&mut self, other: &Self) { + for item in other.set.iter() { + self.set.insert(*item); + } + } + + #[inline(never)] + pub fn remove(&mut self, other: &Self) { + for item in other.set.iter() { + self.set.remove(item); + } + } + + #[inline(never)] + pub fn intersects(&self, other: &Self) -> bool { + !self.set.is_disjoint(&other.set) + } + + #[inline(never)] + pub fn is_subset_of(&self, other: &Self) -> bool { + self.set.is_subset(&other.set) + } + + #[inline(never)] + pub fn to_vec(&self) -> Vec<T> { + let mut res = Vec::<T>::new(); + for item in self.set.iter() { + res.push(*item) + } + // Don't delete this. It is important. + res.sort_unstable(); + res + } + + #[inline(never)] + pub fn from_vec(vec: Vec<T>) -> Self { + let mut res = Set::<T>::empty(); + for x in vec { + res.insert(x); + } + res + } + + #[inline(never)] + pub fn equals(&self, other: &Self) -> bool { + self.set == other.set + } + + #[inline(never)] + pub fn retain<F>(&mut self, f: F) + where + F: FnMut(&T) -> bool, + { + self.set.retain(f) + } + + #[inline(never)] + pub fn map<F, U>(&self, f: F) -> Set<U> + where + F: Fn(&T) -> U, + U: Eq + Ord + Hash + Copy + fmt::Debug, + { + Set { + set: self.set.iter().map(f).collect(), + } + } + + #[inline(never)] + pub fn filter_map<F, U>(&self, f: F) -> Set<U> + where + F: Fn(&T) -> Option<U>, + U: Eq + Ord + Hash + Copy + fmt::Debug, + { + Set { + set: self.set.iter().filter_map(f).collect(), + } + } + + pub fn clear(&mut self) { + self.set.clear(); + } +} + +impl<T: Eq + Ord + Hash + Copy + fmt::Debug> fmt::Debug for Set<T> { + #[inline(never)] + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + // Print the elements in some way which depends only on what is + // present in the set, and not on any other factor. In particular, + // <Debug for FxHashSet> has been observed to to print the elements + // of a two element set in both orders on different occasions. 
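+        // Sorting via `to_vec` below makes the output canonical: for example, a set
+        // holding {3, 1} always prints as "{1, 3}".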
+ let sorted_vec = self.to_vec(); + let mut s = "{".to_string(); + for i in 0..sorted_vec.len() { + if i > 0 { + s = s + &", ".to_string(); + } + s = s + &format!("{:?}", &sorted_vec[i]); + } + s = s + &"}".to_string(); + write!(fmt, "{}", s) + } +} + +pub struct SetIter<'a, T> { + set_iter: std::collections::hash_set::Iter<'a, T>, +} +impl<T: Eq + Hash> Set<T> { + pub fn iter(&self) -> SetIter<T> { + SetIter { + set_iter: self.set.iter(), + } + } +} +impl<'a, T> Iterator for SetIter<'a, T> { + type Item = &'a T; + fn next(&mut self) -> Option<Self::Item> { + self.set_iter.next() + } +} + +//============================================================================= +// Iteration boilerplate for entities. The only purpose of this is to support +// constructions of the form +// +// for ent in startEnt .dotdot( endPlus1Ent ) { +// } +// +// until such time as `trait Step` is available in stable Rust. At that point +// `fn dotdot` and all of the following can be removed, and the loops +// rewritten using the standard syntax: +// +// for ent in startEnt .. endPlus1Ent { +// } + +pub trait Zero { + fn zero() -> Self; +} + +pub trait PlusN { + fn plus_n(&self, n: usize) -> Self; +} + +#[derive(Clone, Copy)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct Range<T> { + first: T, + len: usize, +} + +impl<T: Copy + PartialOrd + PlusN> IntoIterator for Range<T> { + type Item = T; + type IntoIter = MyIterator<T>; + fn into_iter(self) -> Self::IntoIter { + MyIterator { + range: self, + next: self.first, + } + } +} + +impl<T: Copy + Eq + Ord + PlusN> Range<T> { + /// Create a new range object. + pub fn new(from: T, len: usize) -> Range<T> { + Range { first: from, len } + } + + pub fn start(&self) -> T { + self.first + } + + pub fn first(&self) -> T { + assert!(self.len() > 0); + self.start() + } + + pub fn last(&self) -> T { + assert!(self.len() > 0); + self.start().plus_n(self.len() - 1) + } + + pub fn last_plus1(&self) -> T { + self.start().plus_n(self.len()) + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn contains(&self, t: T) -> bool { + t >= self.first && t < self.first.plus_n(self.len) + } +} + +pub struct MyIterator<T> { + range: Range<T>, + next: T, +} +impl<T: Copy + PartialOrd + PlusN> Iterator for MyIterator<T> { + type Item = T; + fn next(&mut self) -> Option<Self::Item> { + if self.next >= self.range.first.plus_n(self.range.len) { + None + } else { + let res = Some(self.next); + self.next = self.next.plus_n(1); + res + } + } +} + +//============================================================================= +// Vectors where both the index and element types can be specified (and at +// most 2^32-1 elems can be stored. What if this overflows?) + +pub struct TypedIxVec<TyIx, Ty> { + vek: Vec<Ty>, + ty_ix: PhantomData<TyIx>, +} + +impl<TyIx, Ty> TypedIxVec<TyIx, Ty> +where + Ty: Clone, + TyIx: Copy + Eq + Ord + Zero + PlusN + Into<u32>, +{ + pub fn new() -> Self { + Self { + vek: Vec::new(), + ty_ix: PhantomData::<TyIx>, + } + } + pub fn from_vec(vek: Vec<Ty>) -> Self { + Self { + vek, + ty_ix: PhantomData::<TyIx>, + } + } + pub fn append(&mut self, other: &mut TypedIxVec<TyIx, Ty>) { + // FIXME what if this overflows? + self.vek.append(&mut other.vek); + } + pub fn iter(&self) -> Iter<Ty> { + self.vek.iter() + } + pub fn iter_mut(&mut self) -> IterMut<Ty> { + self.vek.iter_mut() + } + pub fn len(&self) -> u32 { + // FIXME what if this overflows? 
+ self.vek.len() as u32 + } + pub fn push(&mut self, item: Ty) { + // FIXME what if this overflows? + self.vek.push(item); + } + pub fn resize(&mut self, new_len: u32, value: Ty) { + self.vek.resize(new_len as usize, value); + } + pub fn reserve(&mut self, additional: usize) { + self.vek.reserve(additional); + } + pub fn elems(&self) -> &[Ty] { + &self.vek[..] + } + pub fn elems_mut(&mut self) -> &mut [Ty] { + &mut self.vek[..] + } + pub fn range(&self) -> Range<TyIx> { + Range::new(TyIx::zero(), self.len() as usize) + } + pub fn remove(&mut self, idx: TyIx) -> Ty { + self.vek.remove(idx.into() as usize) + } + pub fn sort_by<F: FnMut(&Ty, &Ty) -> Ordering>(&mut self, compare: F) { + self.vek.sort_by(compare) + } + pub fn sort_unstable_by<F: FnMut(&Ty, &Ty) -> Ordering>(&mut self, compare: F) { + self.vek.sort_unstable_by(compare) + } + pub fn clear(&mut self) { + self.vek.clear(); + } +} + +impl<TyIx, Ty> Index<TyIx> for TypedIxVec<TyIx, Ty> +where + TyIx: Into<u32>, +{ + type Output = Ty; + fn index(&self, ix: TyIx) -> &Ty { + &self.vek[ix.into() as usize] + } +} + +impl<TyIx, Ty> IndexMut<TyIx> for TypedIxVec<TyIx, Ty> +where + TyIx: Into<u32>, +{ + fn index_mut(&mut self, ix: TyIx) -> &mut Ty { + &mut self.vek[ix.into() as usize] + } +} + +impl<TyIx, Ty> Clone for TypedIxVec<TyIx, Ty> +where + Ty: Clone, +{ + // This is only needed for debug printing. + fn clone(&self) -> Self { + Self { + vek: self.vek.clone(), + ty_ix: PhantomData::<TyIx>, + } + } +} + +//============================================================================= + +macro_rules! generate_boilerplate { + ($TypeIx:ident, $Type:ident, $PrintingPrefix:expr) => { + #[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] + #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] + // Firstly, the indexing type (TypeIx) + pub enum $TypeIx { + $TypeIx(u32), + } + impl $TypeIx { + #[allow(dead_code)] + #[inline(always)] + pub fn new(n: u32) -> Self { + debug_assert!(n != u32::max_value()); + Self::$TypeIx(n) + } + #[allow(dead_code)] + #[inline(always)] + pub const fn max_value() -> Self { + Self::$TypeIx(u32::max_value() - 1) + } + #[allow(dead_code)] + #[inline(always)] + pub const fn min_value() -> Self { + Self::$TypeIx(u32::min_value()) + } + #[allow(dead_code)] + #[inline(always)] + pub const fn invalid_value() -> Self { + Self::$TypeIx(u32::max_value()) + } + #[allow(dead_code)] + #[inline(always)] + pub fn is_valid(self) -> bool { + self != Self::invalid_value() + } + #[allow(dead_code)] + #[inline(always)] + pub fn is_invalid(self) -> bool { + self == Self::invalid_value() + } + #[allow(dead_code)] + #[inline(always)] + pub fn get(self) -> u32 { + debug_assert!(self.is_valid()); + match self { + $TypeIx::$TypeIx(n) => n, + } + } + #[allow(dead_code)] + #[inline(always)] + pub fn plus(self, delta: u32) -> $TypeIx { + debug_assert!(self.is_valid()); + $TypeIx::$TypeIx(self.get() + delta) + } + #[allow(dead_code)] + #[inline(always)] + pub fn minus(self, delta: u32) -> $TypeIx { + debug_assert!(self.is_valid()); + $TypeIx::$TypeIx(self.get() - delta) + } + #[allow(dead_code)] + pub fn dotdot(&self, last_plus1: $TypeIx) -> Range<$TypeIx> { + debug_assert!(self.is_valid()); + let len = (last_plus1.get() - self.get()) as usize; + Range::new(*self, len) + } + } + impl fmt::Debug for $TypeIx { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + if self.is_invalid() { + write!(fmt, "{}<NONE>", $PrintingPrefix) + } else { + write!(fmt, "{}{}", $PrintingPrefix, &self.get()) + } + } + } + impl PlusN 
for $TypeIx { + #[inline(always)] + fn plus_n(&self, n: usize) -> Self { + debug_assert!(self.is_valid()); + self.plus(n as u32) + } + } + impl Into<u32> for $TypeIx { + #[inline(always)] + fn into(self) -> u32 { + debug_assert!(self.is_valid()); + self.get() + } + } + impl Zero for $TypeIx { + #[inline(always)] + fn zero() -> Self { + $TypeIx::new(0) + } + } + }; +} + +generate_boilerplate!(InstIx, Inst, "i"); + +generate_boilerplate!(BlockIx, Block, "b"); + +generate_boilerplate!(RangeFragIx, RangeFrag, "f"); + +generate_boilerplate!(VirtualRangeIx, VirtualRange, "vr"); + +generate_boilerplate!(RealRangeIx, RealRange, "rr"); + +impl<TyIx, Ty: fmt::Debug> fmt::Debug for TypedIxVec<TyIx, Ty> { + // This is something of a hack in the sense that it doesn't show the + // indices, but oh well .. + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "{:?}", self.vek) + } +} + +//============================================================================= +// Definitions of register classes, registers and stack slots, and printing +// thereof. Note that this register class definition is meant to be +// architecture-independent: it simply captures common integer/float/vector +// types that machines are likely to use. TODO: investigate whether we need a +// more flexible register-class definition mechanism. + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub enum RegClass { + I32 = 0, + F32 = 1, + I64 = 2, + F64 = 3, + V128 = 4, + INVALID = 5, +} + +/// The number of register classes that exist. +/// N.B.: must be <= 7 (fit into 3 bits) for 32-bit VReg/RReg packed format! +pub const NUM_REG_CLASSES: usize = 5; + +impl RegClass { + /// Convert a register class to a u32 index. + #[inline(always)] + pub fn rc_to_u32(self) -> u32 { + self as u32 + } + /// Convert a register class to a usize index. + #[inline(always)] + pub fn rc_to_usize(self) -> usize { + self as usize + } + /// Construct a register class from a u32. + #[inline(always)] + pub fn rc_from_u32(rc: u32) -> RegClass { + match rc { + 0 => RegClass::I32, + 1 => RegClass::F32, + 2 => RegClass::I64, + 3 => RegClass::F64, + 4 => RegClass::V128, + _ => panic!("RegClass::rc_from_u32"), + } + } + + pub fn short_name(self) -> &'static str { + match self { + RegClass::I32 => "I", + RegClass::I64 => "J", + RegClass::F32 => "F", + RegClass::F64 => "D", + RegClass::V128 => "V", + RegClass::INVALID => panic!("RegClass::short_name"), + } + } + + pub fn long_name(self) -> &'static str { + match self { + RegClass::I32 => "I32", + RegClass::I64 => "I32", + RegClass::F32 => "F32", + RegClass::F64 => "F32", + RegClass::V128 => "V128", + RegClass::INVALID => panic!("RegClass::long_name"), + } + } +} + +// Reg represents both real and virtual registers. For compactness and speed, +// these fields are packed into a single u32. The format is: +// +// Virtual Reg: 1 rc:3 index:28 +// Real Reg: 0 rc:3 uu:12 enc:8 index:8 +// +// `rc` is the register class. `uu` means "unused". `enc` is the hardware +// encoding for the reg. `index` is a zero based index which has the +// following meanings: +// +// * for a Virtual Reg, `index` is just the virtual register number. +// * for a Real Reg, `index` is the entry number in the associated +// `RealRegUniverse`. 
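+//
+// For example (an illustrative reading of the layout above, not text from the
+// original source): Reg::new_virtual(RegClass::I64, 42) packs as
+// 1 | rc=2 | index=42, while Reg::new_real(RegClass::I64, /*enc=*/7, /*index=*/3)
+// packs as 0 | rc=2 | uu=0 | enc=7 | index=3.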
+// +// This scheme gives us: +// +// * a compact (32-bit) representation for registers +// * fast equality tests for registers +// * ability to handle up to 2^28 (268.4 million) virtual regs per function +// * ability to handle up to 8 register classes +// * ability to handle targets with up to 256 real registers +// * ability to emit instructions containing real regs without having to +// look up encodings in any side tables, since a real reg carries its +// encoding +// * efficient bitsets and arrays of virtual registers, since each has a +// zero-based index baked in +// * efficient bitsets and arrays of real registers, for the same reason +// +// This scheme makes it impossible to represent overlapping register classes, +// but that doesn't seem important. AFAIK only ARM32 VFP/Neon has that. + +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct Reg { + bits: u32, +} + +static INVALID_REG: u32 = 0xffffffff; + +impl Reg { + #[inline(always)] + pub fn is_virtual(self) -> bool { + self.is_valid() && (self.bits & 0x8000_0000) != 0 + } + #[inline(always)] + pub fn is_real(self) -> bool { + self.is_valid() && (self.bits & 0x8000_0000) == 0 + } + pub fn new_real(rc: RegClass, enc: u8, index: u8) -> Self { + let n = (0 << 31) | (rc.rc_to_u32() << 28) | ((enc as u32) << 8) | ((index as u32) << 0); + Reg { bits: n } + } + pub fn new_virtual(rc: RegClass, index: u32) -> Self { + if index >= (1 << 28) { + panic!("new_virtual(): index too large"); + } + let n = (1 << 31) | (rc.rc_to_u32() << 28) | (index << 0); + Reg { bits: n } + } + pub fn invalid() -> Reg { + Reg { bits: INVALID_REG } + } + #[inline(always)] + pub fn is_invalid(self) -> bool { + self.bits == INVALID_REG + } + #[inline(always)] + pub fn is_valid(self) -> bool { + !self.is_invalid() + } + pub fn is_virtual_or_invalid(self) -> bool { + self.is_virtual() || self.is_invalid() + } + pub fn is_real_or_invalid(self) -> bool { + self.is_real() || self.is_invalid() + } + #[inline(always)] + pub fn get_class(self) -> RegClass { + debug_assert!(self.is_valid()); + RegClass::rc_from_u32((self.bits >> 28) & 0x7) + } + #[inline(always)] + pub fn get_index(self) -> usize { + debug_assert!(self.is_valid()); + // Return type is usize because typically we will want to use the + // result for indexing into a Vec + if self.is_virtual() { + (self.bits & ((1 << 28) - 1)) as usize + } else { + (self.bits & ((1 << 8) - 1)) as usize + } + } + #[inline(always)] + pub fn get_index_u32(self) -> u32 { + debug_assert!(self.is_valid()); + if self.is_virtual() { + self.bits & ((1 << 28) - 1) + } else { + self.bits & ((1 << 8) - 1) + } + } + pub fn get_hw_encoding(self) -> u8 { + debug_assert!(self.is_valid()); + if self.is_virtual() { + panic!("Virtual register does not have a hardware encoding") + } else { + ((self.bits >> 8) & ((1 << 8) - 1)) as u8 + } + } + pub fn as_virtual_reg(self) -> Option<VirtualReg> { + // Allow invalid virtual regs as well. + if self.is_virtual_or_invalid() { + Some(VirtualReg { reg: self }) + } else { + None + } + } + pub fn as_real_reg(self) -> Option<RealReg> { + // Allow invalid real regs as well. 
+ if self.is_real_or_invalid() { + Some(RealReg { reg: self }) + } else { + None + } + } + pub fn show_with_rru(self, univ: &RealRegUniverse) -> String { + if self.is_real() && self.get_index() < univ.regs.len() { + univ.regs[self.get_index()].1.clone() + } else if self.is_valid() { + format!("{:?}", self) + } else { + "rINVALID".to_string() + } + } +} + +impl fmt::Debug for Reg { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + if self.is_valid() { + write!( + fmt, + "{}{}{}", + if self.is_virtual() { "v" } else { "r" }, + self.get_index(), + self.get_class().short_name(), + ) + } else { + write!(fmt, "rINVALID") + } + } +} + +// RealReg and VirtualReg are merely wrappers around Reg, which try to +// dynamically ensure that they are really wrapping the correct flavour of +// register. + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct RealReg { + reg: Reg, +} +impl Reg /* !!not RealReg!! */ { + pub fn to_real_reg(self) -> RealReg { + if self.is_virtual() { + panic!("Reg::to_real_reg: this is a virtual register") + } else { + RealReg { reg: self } + } + } +} +impl RealReg { + pub fn get_class(self) -> RegClass { + self.reg.get_class() + } + #[inline(always)] + pub fn get_index(self) -> usize { + self.reg.get_index() + } + pub fn get_hw_encoding(self) -> usize { + self.reg.get_hw_encoding() as usize + } + #[inline(always)] + pub fn to_reg(self) -> Reg { + self.reg + } + pub fn invalid() -> RealReg { + RealReg { + reg: Reg::invalid(), + } + } + pub fn is_valid(self) -> bool { + self.reg.is_valid() + } + pub fn is_invalid(self) -> bool { + self.reg.is_invalid() + } + pub fn maybe_valid(self) -> Option<RealReg> { + if self == RealReg::invalid() { + None + } else { + Some(self) + } + } +} +impl fmt::Debug for RealReg { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "{:?}", self.reg) + } +} + +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct VirtualReg { + reg: Reg, +} +impl Reg /* !!not VirtualReg!! */ { + #[inline(always)] + pub fn to_virtual_reg(self) -> VirtualReg { + if self.is_virtual() { + VirtualReg { reg: self } + } else { + panic!("Reg::to_virtual_reg: this is a real register") + } + } +} +impl VirtualReg { + pub fn get_class(self) -> RegClass { + self.reg.get_class() + } + #[inline(always)] + pub fn get_index(self) -> usize { + self.reg.get_index() + } + #[inline(always)] + pub fn to_reg(self) -> Reg { + self.reg + } + pub fn invalid() -> VirtualReg { + VirtualReg { + reg: Reg::invalid(), + } + } + pub fn is_valid(self) -> bool { + self.reg.is_valid() + } + pub fn is_invalid(self) -> bool { + self.reg.is_invalid() + } + pub fn maybe_valid(self) -> Option<VirtualReg> { + if self == VirtualReg::invalid() { + None + } else { + Some(self) + } + } +} +impl fmt::Debug for VirtualReg { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "{:?}", self.reg) + } +} + +impl Reg { + /// Apply a vreg-rreg mapping to a Reg. This is used for registers used in + /// a read-role. + pub fn apply_uses<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + self.apply(|vreg| mapper.get_use(vreg)); + } + + /// Apply a vreg-rreg mapping to a Reg. This is used for registers used in + /// a write-role. + pub fn apply_defs<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + self.apply(|vreg| mapper.get_def(vreg)); + } + + /// Apply a vreg-rreg mapping to a Reg. 
This is used for registers used in + /// a modify-role. + pub fn apply_mods<RUM: RegUsageMapper>(&mut self, mapper: &RUM) { + self.apply(|vreg| mapper.get_mod(vreg)); + } + + fn apply<F: Fn(VirtualReg) -> Option<RealReg>>(&mut self, f: F) { + if let Some(vreg) = self.as_virtual_reg() { + if let Some(rreg) = f(vreg) { + debug_assert!(rreg.get_class() == vreg.get_class()); + *self = rreg.to_reg(); + } else { + panic!("Reg::apply: no mapping for {:?}", self); + } + } + } +} + +/// A "writable register". This is a zero-cost wrapper that can be used to +/// create a distinction, at the Rust type level, between a plain "register" +/// and a "writable register". +/// +/// Only structs that implement the `WritableBase` trait can be wrapped with +/// `Writable`. These are the Reg, RealReg and VirtualReg data structures only, +/// since `WritableBase` is not exposed to end users. +/// +/// Writable<..> can be used by the client to ensure that, internally, it only +/// generates instructions that write to registers that should be written. The +/// `InstRegUses` below, which must be implemented for every instruction, +/// requires a `Writable<Reg>` (not just `Reg`) in its `defined` and +/// `modified` sets. While we cannot hide the constructor for `Writable<..>` +/// from certain parts of the client while exposing it to others, the client +/// *can* adopt conventions to e.g. only ever call the Writable<..> +/// constructor from its central vreg-management logic, and decide that any +/// invocation of this constructor in a machine backend (for example) is an +/// error. +#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct Writable<R: WritableBase> { + reg: R, +} + +/// Set of requirements for types that can be wrapped in Writable. +pub trait WritableBase: + Copy + Clone + PartialEq + Eq + Hash + PartialOrd + Ord + fmt::Debug +{ +} + +impl WritableBase for Reg {} +impl WritableBase for RealReg {} +impl WritableBase for VirtualReg {} + +impl<R: WritableBase> Writable<R> { + /// Create a Writable<R> from an R. The client should carefully audit where + /// it calls this constructor to ensure correctness (see `Writable<..>` + /// struct documentation). + #[inline(always)] + pub fn from_reg(reg: R) -> Writable<R> { + Writable { reg } + } + + /// Get the inner Reg. + pub fn to_reg(&self) -> R { + self.reg + } + + pub fn map<F, U>(&self, f: F) -> Writable<U> + where + F: Fn(R) -> U, + U: WritableBase, + { + Writable { reg: f(self.reg) } + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct SpillSlot(u32); + +impl SpillSlot { + #[inline(always)] + pub fn new(n: u32) -> Self { + Self(n) + } + #[inline(always)] + pub fn get(self) -> u32 { + self.0 + } + #[inline(always)] + pub fn get_usize(self) -> usize { + self.get() as usize + } + pub fn round_up(self, num_slots: u32) -> SpillSlot { + assert!(num_slots > 0); + SpillSlot::new((self.get() + num_slots - 1) / num_slots * num_slots) + } + pub fn inc(self, num_slots: u32) -> SpillSlot { + SpillSlot::new(self.get() + num_slots) + } +} + +impl fmt::Debug for SpillSlot { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "S{}", self.get()) + } +} + +//============================================================================= +// Register uses: low level interface + +// This minimal struct is visible from outside the regalloc.rs interface. 
It +// is intended to be a safe wrapper around `RegVecs`, which isn't externally +// visible. It is used to collect unsanitized reg use info from client +// instructions. +pub struct RegUsageCollector<'a> { + pub reg_vecs: &'a mut RegVecs, +} + +impl<'a> RegUsageCollector<'a> { + pub fn new(reg_vecs: &'a mut RegVecs) -> Self { + Self { reg_vecs } + } + pub fn add_use(&mut self, r: Reg) { + self.reg_vecs.uses.push(r); + } + pub fn add_uses(&mut self, regs: &[Reg]) { + self.reg_vecs.uses.extend(regs.iter()); + } + pub fn add_def(&mut self, r: Writable<Reg>) { + self.reg_vecs.defs.push(r.to_reg()); + } + pub fn add_defs(&mut self, regs: &[Writable<Reg>]) { + self.reg_vecs.defs.reserve(regs.len()); + for r in regs { + self.reg_vecs.defs.push(r.to_reg()); + } + } + pub fn add_mod(&mut self, r: Writable<Reg>) { + self.reg_vecs.mods.push(r.to_reg()); + } + pub fn add_mods(&mut self, regs: &[Writable<Reg>]) { + self.reg_vecs.mods.reserve(regs.len()); + for r in regs { + self.reg_vecs.mods.push(r.to_reg()); + } + } + + // The presence of the following two is a hack, needed to support fuzzing + // in the test framework. Real clients should not call them. + pub fn get_use_def_mod_vecs_test_framework_only(&self) -> (Vec<Reg>, Vec<Reg>, Vec<Reg>) { + ( + self.reg_vecs.uses.clone(), + self.reg_vecs.defs.clone(), + self.reg_vecs.mods.clone(), + ) + } + + pub fn get_empty_reg_vecs_test_framework_only(sanitized: bool) -> RegVecs { + RegVecs::new(sanitized) + } +} + +// Everything else is not visible outside the regalloc.rs interface. + +// There is one of these per function. Note that `defs` and `mods` lose the +// `Writable` constraint at this point. This is for convenience of having all +// three vectors be the same type, but comes at the cost of the loss of being +// able to differentiate readonly vs read/write registers in the Rust type +// system. +#[derive(Debug)] +pub struct RegVecs { + pub uses: Vec<Reg>, + pub defs: Vec<Reg>, + pub mods: Vec<Reg>, + sanitized: bool, +} + +impl RegVecs { + pub fn new(sanitized: bool) -> Self { + Self { + uses: vec![], + defs: vec![], + mods: vec![], + sanitized, + } + } + pub fn is_sanitized(&self) -> bool { + self.sanitized + } + pub fn set_sanitized(&mut self, sanitized: bool) { + self.sanitized = sanitized; + } + pub fn clear(&mut self) { + self.uses.clear(); + self.defs.clear(); + self.mods.clear(); + } +} + +// There is one of these per insn, so try and keep it as compact as possible. +// I think this should fit in 16 bytes. +#[derive(Clone, Debug)] +pub struct RegVecBounds { + // These are the group start indices in RegVecs.{uses, defs, mods}. + pub uses_start: u32, + pub defs_start: u32, + pub mods_start: u32, + // And these are the group lengths. This does limit each instruction to + // mentioning only 256 registers in any group, but that does not seem like a + // problem. + pub uses_len: u8, + pub defs_len: u8, + pub mods_len: u8, +} + +impl RegVecBounds { + pub fn new() -> Self { + Self { + uses_start: 0, + defs_start: 0, + mods_start: 0, + uses_len: 0, + defs_len: 0, + mods_len: 0, + } + } +} + +// This is the primary structure. We compute just one of these for an entire +// function. +pub struct RegVecsAndBounds { + // The three vectors of registers. These can be arbitrarily long. + pub vecs: RegVecs, + // Admin info which tells us the location, for each insn, of its register + // groups in `vecs`. 
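+    // For example (illustrative indexing only), the use-registers of instruction
+    // `iix` are
+    //   vecs.uses[bounds[iix].uses_start as usize
+    //             .. bounds[iix].uses_start as usize + bounds[iix].uses_len as usize]
+    // and similarly for `defs` and `mods`.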
+ pub bounds: TypedIxVec<InstIx, RegVecBounds>, +} + +impl RegVecsAndBounds { + pub fn new(vecs: RegVecs, bounds: TypedIxVec<InstIx, RegVecBounds>) -> Self { + Self { vecs, bounds } + } + pub fn is_sanitized(&self) -> bool { + self.vecs.sanitized + } + #[allow(dead_code)] // XXX for some reason, Rustc 1.43.1 thinks this is currently unused. + pub fn num_insns(&self) -> u32 { + self.bounds.len() + } +} + +//============================================================================= +// Register uses: convenience interface + +// Some call sites want to get reg use information as three Sets. This is a +// "convenience facility" which is easier to use but much slower than working +// with a whole-function `RegVecsAndBounds`. It shouldn't be used on critical +// paths. +#[derive(Debug)] +pub struct RegSets { + pub uses: Set<Reg>, // registers that are read. + pub defs: Set<Reg>, // registers that are written. + pub mods: Set<Reg>, // registers that are modified. + sanitized: bool, +} + +impl RegSets { + pub fn new(sanitized: bool) -> Self { + Self { + uses: Set::<Reg>::empty(), + defs: Set::<Reg>::empty(), + mods: Set::<Reg>::empty(), + sanitized, + } + } + + pub fn is_sanitized(&self) -> bool { + self.sanitized + } +} + +impl RegVecsAndBounds { + /* !!not RegSets!! */ + #[inline(never)] + // Convenience function. Try to avoid using this. + pub fn get_reg_sets_for_iix(&self, iix: InstIx) -> RegSets { + let bounds = &self.bounds[iix]; + let mut regsets = RegSets::new(self.vecs.sanitized); + for i in bounds.uses_start as usize..bounds.uses_start as usize + bounds.uses_len as usize { + regsets.uses.insert(self.vecs.uses[i]); + } + for i in bounds.defs_start as usize..bounds.defs_start as usize + bounds.defs_len as usize { + regsets.defs.insert(self.vecs.defs[i]); + } + for i in bounds.mods_start as usize..bounds.mods_start as usize + bounds.mods_len as usize { + regsets.mods.insert(self.vecs.mods[i]); + } + regsets + } +} + +//============================================================================= +// Definitions of the "real register universe". + +// A "Real Register Universe" is a read-only structure that contains all +// information about real registers on a given host. It serves several +// purposes: +// +// * defines the mapping from real register indices to the registers +// themselves +// +// * defines the size of the initial section of that mapping that is available +// to the register allocator for use, so that it can treat the registers +// under its control as a zero based, contiguous array. This is important +// for its efficiency. +// +// * gives meaning to Set<RealReg>, which otherwise would merely be a bunch of +// bits. + +#[derive(Clone, Debug)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct RealRegUniverse { + // The registers themselves. All must be real registers, and all must + // have their index number (.get_index()) equal to the array index here, + // since this is the only place where we map index numbers to actual + // registers. + pub regs: Vec<(RealReg, String)>, + + // This is the size of the initial section of `regs` that is available to + // the allocator. It must be <= `regs`.len(). + pub allocable: usize, + + // Information about groups of allocable registers. Used to quickly address + // only a group of allocable registers belonging to the same register class. + // Indexes into `allocable_by_class` are RegClass values, such as + // RegClass::F32. 
If the resulting entry is `None` then there are no + // registers in that class. Otherwise the value is a `RegClassInfo`, which + // provides a register range and possibly information about fixed uses. + pub allocable_by_class: [Option<RegClassInfo>; NUM_REG_CLASSES], +} + +/// Information about a single register class in the `RealRegUniverse`. +#[derive(Clone, Copy, Debug)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct RegClassInfo { + // Range of allocatable registers in this register class, in terms of + // register indices. + // + // A range (first, last) specifies the range of entries in + // `RealRegUniverse.regs` corresponding to that class. The range includes + // both `first` and `last`. + // + // In all cases, `last` must be < `RealRegUniverse.allocable`. In other + // words, all ranges together in `allocable_by_class` must describe only the + // allocable prefix of `regs`. + // + // For example, let's say + // allocable_by_class[RegClass::F32] == + // Some(RegClassInfo { first: 10, last: 14, .. }) + // Then regs[10], regs[11], regs[12], regs[13], and regs[14] give all + // registers of register class RegClass::F32. + // + // The effect of the above is that registers in `regs` must form + // contiguous groups. This is checked by RealRegUniverse::check_is_sane(). + pub first: usize, + pub last: usize, + + // A register, if any, that is *guaranteed* not to be used as a fixed use + // in any code, and so that the register allocator can statically reserve + // for its own use as a temporary. Some register allocators may need such + // a register for various maneuvers, for example a spillslot-to-spillslot + // move when no (other) registers are free. + pub suggested_scratch: Option<usize>, +} + +impl RealRegUniverse { + /// Show it in a pretty way. + pub fn show(&self) -> Vec<String> { + let mut res = vec![]; + // Show the allocables + for class_num in 0..NUM_REG_CLASSES { + let class_info = match &self.allocable_by_class[class_num] { + None => continue, + Some(info) => info, + }; + let class = RegClass::rc_from_u32(class_num as u32); + let mut class_str = "class ".to_string() + + &class.long_name().to_string() + + &"(".to_string() + + &class.short_name().to_string() + + &") at ".to_string(); + class_str = class_str + &format!("[{} .. {}]: ", class_info.first, class_info.last); + for ix in class_info.first..=class_info.last { + class_str = class_str + &self.regs[ix].1; + if let Some(suggested_ix) = class_info.suggested_scratch { + if ix == suggested_ix { + class_str = class_str + "*"; + } + } + class_str = class_str + " "; + } + res.push(class_str); + } + // And the non-allocables + if self.allocable < self.regs.len() { + let mut stragglers = format!( + "not allocable at [{} .. {}]: ", + self.allocable, + self.regs.len() - 1 + ); + for ix in self.allocable..self.regs.len() { + stragglers = stragglers + &self.regs[ix].1 + &" ".to_string(); + } + res.push(stragglers); + } + res + } + + /// Check that the given universe satisfies various invariants, and panic + /// if not. All the invariants are important. + pub fn check_is_sane(&self) { + let regs_len = self.regs.len(); + let regs_allocable = self.allocable; + // The universe must contain at most 256 registers. That's because + // `Reg` only has an 8-bit index value field, so if the universe + // contained more than 256 registers, we'd never be able to index into + // entries 256 and above. 
This is no limitation in practice since all + // targets we're interested in contain (many) fewer than 256 regs in + // total. + let mut ok = regs_len <= 256; + // The number of allocable registers must not exceed the number of + // `regs` presented. In general it will be less, since the universe + // will list some registers (stack pointer, etc) which are not + // available for allocation. + if ok { + ok = regs_allocable <= regs_len; + } + // All registers must have an index value which points back at the + // `regs` slot they are in. Also they really must be real regs. + if ok { + for i in 0..regs_len { + let (reg, _name) = &self.regs[i]; + if ok && (reg.to_reg().is_virtual() || reg.get_index() != i) { + ok = false; + } + } + } + // The allocatable regclass groupings defined by `allocable_first` and + // `allocable_last` must be contiguous. + if ok { + let mut regclass_used = [false; NUM_REG_CLASSES]; + for rc in 0..NUM_REG_CLASSES { + regclass_used[rc] = false; + } + for i in 0..regs_allocable { + let (reg, _name) = &self.regs[i]; + let rc = reg.get_class().rc_to_u32() as usize; + regclass_used[rc] = true; + } + // Scan forward through each grouping, checking that the listed + // registers really are of the claimed class. Also count the + // total number visited. This seems a fairly reliable way to + // ensure that the groupings cover all allocated registers exactly + // once, and that all classes are contiguous groups. + let mut regs_visited = 0; + for rc in 0..NUM_REG_CLASSES { + match &self.allocable_by_class[rc] { + &None => { + if regclass_used[rc] { + ok = false; + } + } + &Some(RegClassInfo { + first, + last, + suggested_scratch, + }) => { + if !regclass_used[rc] { + ok = false; + } + if ok { + for i in first..last + 1 { + let (reg, _name) = &self.regs[i]; + if ok && RegClass::rc_from_u32(rc as u32) != reg.get_class() { + ok = false; + } + regs_visited += 1; + } + } + if ok { + if let Some(s) = suggested_scratch { + if s < first || s > last { + ok = false; + } + } + } + } + } + } + if ok && regs_visited != regs_allocable { + ok = false; + } + } + // So finally .. + if !ok { + panic!("RealRegUniverse::check_is_sane: invalid RealRegUniverse"); + } + } +} + +//============================================================================= +// Representing and printing of live range fragments. + +#[derive(Copy, Clone, Hash, PartialEq, Eq, Ord)] +// There are four "points" within an instruction that are of interest, and +// these have a total ordering: R < U < D < S. They are: +// +// * R(eload): this is where any reload insns for the insn itself are +// considered to live. +// +// * U(se): this is where the insn is considered to use values from those of +// its register operands that appear in a Read or Modify role. +// +// * D(ef): this is where the insn is considered to define new values for +// those of its register operands that appear in a Write or Modify role. +// +// * S(pill): this is where any spill insns for the insn itself are considered +// to live. +// +// Instructions in the incoming Func may only exist at the U and D points, +// and so their associated live range fragments will only mention the U and D +// points. However, when adding spill code, we need a way to represent live +// ranges involving the added spill and reload insns, in which case R and S +// come into play: +// +// * A reload for instruction i is considered to be live from i.R to i.U. +// +// * A spill for instruction i is considered to be live from i.D to i.S. 
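+//
+// A small illustrative check of that ordering, relying only on the `PartialOrd`
+// impl defined below:
+//
+//     assert!(Point::Reload < Point::Use && Point::Use < Point::Def
+//             && Point::Def < Point::Spill);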
+ +pub enum Point { + // The values here are important. Don't change them. + Reload = 0, + Use = 1, + Def = 2, + Spill = 3, +} + +impl Point { + #[inline(always)] + pub fn is_reload(self) -> bool { + match self { + Point::Reload => true, + _ => false, + } + } + #[inline(always)] + pub fn is_use(self) -> bool { + match self { + Point::Use => true, + _ => false, + } + } + #[inline(always)] + pub fn is_def(self) -> bool { + match self { + Point::Def => true, + _ => false, + } + } + #[inline(always)] + pub fn is_spill(self) -> bool { + match self { + Point::Spill => true, + _ => false, + } + } + #[inline(always)] + pub fn is_use_or_def(self) -> bool { + self.is_use() || self.is_def() + } +} + +impl PartialOrd for Point { + // In short .. R < U < D < S. This is probably what would be #derive'd + // anyway, but we need to be sure. + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + (*self as u32).partial_cmp(&(*other as u32)) + } +} + +// See comments below on `RangeFrag` for the meaning of `InstPoint`. +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct InstPoint { + /// This is conceptually: + /// pub iix: InstIx, + /// pub pt: Point, + /// + /// but packed into a single 32 bit word, so as + /// (1) to ensure it is only 32 bits (and hence to guarantee that `RangeFrag` + /// is 64 bits), and + /// (2) to make it possible to implement `PartialOrd` using `PartialOrd` + /// directly on 32 bit words (and hence we let it be derived). + /// + /// This has the format: + /// InstIx as bits 31:2, Point as bits 1:0. + /// + /// It does give the slight limitation that all InstIxs must be < 2^30, but + /// that's hardly a big deal: the analysis module rejects any input with 2^24 + /// or more Insns. + /// + /// Do not access this directly: + bits: u32, +} + +impl InstPoint { + #[inline(always)] + pub fn new(iix: InstIx, pt: Point) -> Self { + let iix_n = iix.get(); + assert!(iix_n < 0x4000_0000u32); + let pt_n = pt as u32; + InstPoint { + bits: (iix_n << 2) | pt_n, + } + } + #[inline(always)] + pub fn iix(self) -> InstIx { + InstIx::new(self.bits >> 2) + } + #[inline(always)] + pub fn pt(self) -> Point { + match self.bits & 3 { + 0 => Point::Reload, + 1 => Point::Use, + 2 => Point::Def, + 3 => Point::Spill, + // This can never happen, but rustc doesn't seem to know that. 
+ _ => panic!("InstPt::pt: unreachable case"), + } + } + #[inline(always)] + pub fn set_iix(&mut self, iix: InstIx) { + let iix_n = iix.get(); + assert!(iix_n < 0x4000_0000u32); + self.bits = (iix_n << 2) | (self.bits & 3); + } + #[inline(always)] + pub fn set_pt(&mut self, pt: Point) { + self.bits = (self.bits & 0xFFFF_FFFCu32) | pt as u32; + } + #[inline(always)] + pub fn new_reload(iix: InstIx) -> Self { + InstPoint::new(iix, Point::Reload) + } + #[inline(always)] + pub fn new_use(iix: InstIx) -> Self { + InstPoint::new(iix, Point::Use) + } + #[inline(always)] + pub fn new_def(iix: InstIx) -> Self { + InstPoint::new(iix, Point::Def) + } + #[inline(always)] + pub fn new_spill(iix: InstIx) -> Self { + InstPoint::new(iix, Point::Spill) + } + #[inline(always)] + pub fn invalid_value() -> Self { + Self { + bits: 0xFFFF_FFFFu32, + } + } + #[inline(always)] + pub fn max_value() -> Self { + Self { + bits: 0xFFFF_FFFFu32, + } + } + #[inline(always)] + pub fn min_value() -> Self { + Self { bits: 0u32 } + } +} + +impl fmt::Debug for InstPoint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "{:?}{}", + self.iix(), + match self.pt() { + Point::Reload => ".r", + Point::Use => ".u", + Point::Def => ".d", + Point::Spill => ".s", + } + ) + } +} + +//============================================================================= +// Live Range Fragments, and their metrics + +// A Live Range Fragment (RangeFrag) describes a consecutive sequence of one or +// more instructions, in which a Reg is "live". The sequence must exist +// entirely inside only one basic block. +// +// However, merely indicating the start and end instruction numbers isn't +// enough: we must also include a "Use or Def" indication. These indicate two +// different "points" within each instruction: the Use position, where +// incoming registers are read, and the Def position, where outgoing registers +// are written. The Use position is considered to come before the Def +// position, as described for `Point` above. +// +// When we come to generate spill/restore live ranges, Point::S and Point::R +// also come into play. Live ranges (and hence, RangeFrags) that do not perform +// spills or restores should not use either of Point::S or Point::R. +// +// The set of positions denoted by +// +// {0 .. #insns-1} x {Reload point, Use point, Def point, Spill point} +// +// is exactly the set of positions that we need to keep track of when mapping +// live ranges to registers. This the reason for the type InstPoint. Note +// that InstPoint values have a total ordering, at least within a single basic +// block: the insn number is used as the primary key, and the Point part is +// the secondary key, with Reload < Use < Def < Spill. +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct RangeFrag { + pub first: InstPoint, + pub last: InstPoint, +} + +impl fmt::Debug for RangeFrag { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "(RF: {:?}-{:?})", self.first, self.last) + } +} + +impl RangeFrag { + #[allow(dead_code)] // XXX for some reason, Rustc 1.43.1 thinks this is unused. 
+ pub fn new(first: InstPoint, last: InstPoint) -> Self { + debug_assert!(first <= last); + RangeFrag { first, last } + } + + pub fn invalid_value() -> Self { + Self { + first: InstPoint::invalid_value(), + last: InstPoint::invalid_value(), + } + } + + pub fn new_with_metrics<F: Function>( + f: &F, + bix: BlockIx, + first: InstPoint, + last: InstPoint, + count: u16, + ) -> (Self, RangeFragMetrics) { + debug_assert!(f.block_insns(bix).len() >= 1); + debug_assert!(f.block_insns(bix).contains(first.iix())); + debug_assert!(f.block_insns(bix).contains(last.iix())); + debug_assert!(first <= last); + if first == last { + debug_assert!(count == 1); + } + let first_iix_in_block = f.block_insns(bix).first(); + let last_iix_in_block = f.block_insns(bix).last(); + let first_pt_in_block = InstPoint::new_use(first_iix_in_block); + let last_pt_in_block = InstPoint::new_def(last_iix_in_block); + let kind = match (first == first_pt_in_block, last == last_pt_in_block) { + (false, false) => RangeFragKind::Local, + (false, true) => RangeFragKind::LiveOut, + (true, false) => RangeFragKind::LiveIn, + (true, true) => RangeFragKind::Thru, + }; + ( + RangeFrag { first, last }, + RangeFragMetrics { bix, kind, count }, + ) + } +} + +// Comparison of RangeFrags. They form a partial order. + +pub fn cmp_range_frags(f1: &RangeFrag, f2: &RangeFrag) -> Option<Ordering> { + if f1.last < f2.first { + return Some(Ordering::Less); + } + if f1.first > f2.last { + return Some(Ordering::Greater); + } + if f1.first == f2.first && f1.last == f2.last { + return Some(Ordering::Equal); + } + None +} + +impl RangeFrag { + pub fn contains(&self, ipt: &InstPoint) -> bool { + self.first <= *ipt && *ipt <= self.last + } +} + +// A handy summary hint for a RangeFrag. Note that none of these are correct +// if the RangeFrag has been extended so as to cover multiple basic blocks. +// But that ("RangeFrag compression") is something done locally within each +// algorithm (BT and LSRA). The analysis-phase output will not include any +// such compressed RangeFrags. +#[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub enum RangeFragKind { + Local, // Fragment exists entirely inside one block + LiveIn, // Fragment is live in to a block, but ends inside it + LiveOut, // Fragment is live out of a block, but starts inside it + Thru, // Fragment is live through the block (starts and ends outside it) +} + +impl fmt::Debug for RangeFragKind { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + RangeFragKind::Local => write!(fmt, "Local"), + RangeFragKind::LiveIn => write!(fmt, "LiveIn"), + RangeFragKind::LiveOut => write!(fmt, "LiveOut"), + RangeFragKind::Thru => write!(fmt, "Thru"), + } + } +} + +// `RangeFrags` resulting from the initial analysis phase (analysis_data_flow.rs) +// exist only within single basic blocks, and therefore have some associated +// metrics, held by `RangeFragMetrics`: +// +// * a `count` field, which is a u16 indicating how often the associated storage +// unit (Reg) is mentioned inside the RangeFrag. It is assumed that the RangeFrag +// is associated with some Reg. If not, the `count` field is meaningless. This +// field has no effect on the correctness of the resulting allocation. It is used +// however in the estimation of `VirtualRange` spill costs, which are important +// for prioritising which `VirtualRange`s get assigned a register vs which have +// to be spilled. +// +// * `bix` field, which indicates which `Block` the fragment exists in. 
This +// field is actually redundant, since the containing `Block` can be inferred, +// laboriously, from the associated `RangeFrag`s `first` and `last` fields, +// providing you have an `InstIxToBlockIx` mapping table to hand. It is included +// here for convenience. +// +// * `kind` is another convenience field, indicating how the range is included +// within its owning block. +// +// The analysis phase (fn `deref_and_compress_sorted_range_frag_ixs`) +// compresses ranges and as a result breaks the invariant that a `RangeFrag` +// exists only within a single `Block`. For a `RangeFrag` spanning multiple +// `Block`s, all three `RangeFragMetric` fields are meaningless. This is the +// reason for separating `RangeFrag` and `RangeFragMetrics` -- so that it is +// possible to merge `RangeFrag`s without being forced to create fake values +// for the metrics fields. +#[derive(Clone, PartialEq)] +pub struct RangeFragMetrics { + pub bix: BlockIx, + pub kind: RangeFragKind, + pub count: u16, +} + +impl fmt::Debug for RangeFragMetrics { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "(RFM: {:?}, count={}, {:?})", + self.kind, self.count, self.bix + ) + } +} + +//============================================================================= +// Vectors of RangeFragIxs, sorted so that the associated RangeFrags are in +// ascending order, per their InstPoint fields. The associated RangeFrags may +// not overlap. +// +// The "fragment environment" (usually called "frag_env"), to which the +// RangeFragIxs refer, is not stored here. + +#[derive(Clone)] +pub struct SortedRangeFragIxs { + pub frag_ixs: SmallVec<[RangeFragIx; 4]>, +} + +impl fmt::Debug for SortedRangeFragIxs { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + self.frag_ixs.fmt(fmt) + } +} + +impl SortedRangeFragIxs { + pub(crate) fn check(&self, fenv: &TypedIxVec<RangeFragIx, RangeFrag>) { + for i in 1..self.frag_ixs.len() { + let prev_frag = &fenv[self.frag_ixs[i - 1]]; + let this_frag = &fenv[self.frag_ixs[i]]; + if cmp_range_frags(prev_frag, this_frag) != Some(Ordering::Less) { + panic!("SortedRangeFragIxs::check: vector not ok"); + } + } + } + + pub fn sort(&mut self, fenv: &TypedIxVec<RangeFragIx, RangeFrag>) { + self.frag_ixs.sort_unstable_by(|fix_a, fix_b| { + match cmp_range_frags(&fenv[*fix_a], &fenv[*fix_b]) { + Some(Ordering::Less) => Ordering::Less, + Some(Ordering::Greater) => Ordering::Greater, + Some(Ordering::Equal) | None => { + panic!("SortedRangeFragIxs::sort: overlapping Frags!") + } + } + }); + } + + pub fn new( + frag_ixs: SmallVec<[RangeFragIx; 4]>, + fenv: &TypedIxVec<RangeFragIx, RangeFrag>, + ) -> Self { + let mut res = SortedRangeFragIxs { frag_ixs }; + // check the source is ordered, and clone (or sort it) + res.sort(fenv); + res.check(fenv); + res + } + + pub fn unit(fix: RangeFragIx, fenv: &TypedIxVec<RangeFragIx, RangeFrag>) -> Self { + let mut res = SortedRangeFragIxs { + frag_ixs: SmallVec::<[RangeFragIx; 4]>::new(), + }; + res.frag_ixs.push(fix); + res.check(fenv); + res + } + + /// Does this sorted list of range fragments contain the given instruction point? 
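+    ///
+    /// For example (illustrative), if the sorted fragments cover [i1.u, i3.d] and
+    /// [i7.u, i9.d], then a query at i5.u returns false while one at i8.d returns
+    /// true.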
+ pub fn contains_pt(&self, fenv: &TypedIxVec<RangeFragIx, RangeFrag>, pt: InstPoint) -> bool { + self.frag_ixs + .binary_search_by(|&ix| { + let frag = &fenv[ix]; + if pt < frag.first { + Ordering::Greater + } else if pt >= frag.first && pt <= frag.last { + Ordering::Equal + } else { + Ordering::Less + } + }) + .is_ok() + } +} + +//============================================================================= +// Vectors of RangeFrags, sorted so that they are in ascending order, per +// their InstPoint fields. The RangeFrags may not overlap. + +#[derive(Clone)] +pub struct SortedRangeFrags { + pub frags: SmallVec<[RangeFrag; 4]>, +} + +impl fmt::Debug for SortedRangeFrags { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + self.frags.fmt(fmt) + } +} + +impl SortedRangeFrags { + pub fn unit(frag: RangeFrag) -> Self { + let mut res = SortedRangeFrags { + frags: SmallVec::<[RangeFrag; 4]>::new(), + }; + res.frags.push(frag); + res + } + + pub fn empty() -> Self { + Self { + frags: SmallVec::<[RangeFrag; 4]>::new(), + } + } + + pub fn overlaps(&self, other: &Self) -> bool { + // Since both vectors are sorted and individually non-overlapping, we + // can establish that they are mutually non-overlapping by walking + // them simultaneously and checking, at each step, that there is a + // unique "next lowest" frag available. + let frags1 = &self.frags; + let frags2 = &other.frags; + let n1 = frags1.len(); + let n2 = frags2.len(); + let mut c1 = 0; + let mut c2 = 0; + loop { + if c1 >= n1 || c2 >= n2 { + // We made it to the end of one (or both) vectors without + // finding any conflicts. + return false; // "no overlaps" + } + let f1 = &frags1[c1]; + let f2 = &frags2[c2]; + match cmp_range_frags(f1, f2) { + Some(Ordering::Less) => c1 += 1, + Some(Ordering::Greater) => c2 += 1, + _ => { + // There's no unique "next frag" -- either they are + // identical, or they overlap. So we're done. + return true; // "there's an overlap" + } + } + } + } + + /// Does this sorted list of range fragments contain the given instruction point? + pub fn contains_pt(&self, pt: InstPoint) -> bool { + self.frags + .binary_search_by(|frag| { + if pt < frag.first { + Ordering::Greater + } else if pt >= frag.first && pt <= frag.last { + Ordering::Equal + } else { + Ordering::Less + } + }) + .is_ok() + } +} + +//============================================================================= +// Representing spill costs. A spill cost can either be infinite, in which +// case the associated VirtualRange may not be spilled, because it's already a +// spill/reload range. Or it can be finite, in which case it must be a 32-bit +// floating point number, which is (in the IEEE754 meaning of the terms) +// non-infinite, non-NaN and it must be non negative. In fact it's +// meaningless for a VLR to have a zero spill cost (how could that really be +// the case?) but we allow it here for convenience. 
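+
+// The following is an illustrative sketch, not part of the upstream file: a
+// small test showing the intended arithmetic of the `SpillCost` type defined
+// just below (finite costs accumulate, and anything involving an infinite
+// cost saturates to infinite). It assumes only the public `SpillCost` API
+// shown in this file.
+#[test]
+fn example_spill_cost_arithmetic() {
+    let mut c = SpillCost::finite(1.5);
+    c.add(&SpillCost::finite(2.5));
+    // 1.5 + 2.5 is still a finite cost, and it is larger than the zero cost.
+    assert!(c.is_finite());
+    assert!(SpillCost::zero().is_less_than(&c));
+    // Adding an infinite cost makes the total infinite, and an infinite cost
+    // is never "less than" any other cost, so a range carrying it is never
+    // preferred for spilling over a finite-cost range.
+    c.add(&SpillCost::infinite());
+    assert!(c.is_infinite());
+    assert!(!c.is_less_than(&SpillCost::finite(1.0e6)));
+}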
+ +#[derive(Copy, Clone)] +pub enum SpillCost { + Infinite, // Infinite, positive + Finite(f32), // Finite, non-negative +} + +impl fmt::Debug for SpillCost { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + SpillCost::Infinite => write!(fmt, "INFINITY"), + SpillCost::Finite(c) => write!(fmt, "{:<.3}", c), + } + } +} + +impl SpillCost { + #[inline(always)] + pub fn zero() -> Self { + SpillCost::Finite(0.0) + } + #[inline(always)] + pub fn infinite() -> Self { + SpillCost::Infinite + } + #[inline(always)] + pub fn finite(cost: f32) -> Self { + // "`is_normal` returns true if the number is neither zero, infinite, + // subnormal, or NaN." + assert!(cost.is_normal() || cost == 0.0); + // And also it can't be negative. + assert!(cost >= 0.0); + // Somewhat arbitrarily .. + assert!(cost < 1e18); + SpillCost::Finite(cost) + } + #[inline(always)] + pub fn is_zero(&self) -> bool { + match self { + SpillCost::Infinite => false, + SpillCost::Finite(c) => *c == 0.0, + } + } + #[inline(always)] + pub fn is_infinite(&self) -> bool { + match self { + SpillCost::Infinite => true, + SpillCost::Finite(_) => false, + } + } + #[inline(always)] + pub fn is_finite(&self) -> bool { + !self.is_infinite() + } + #[inline(always)] + pub fn is_less_than(&self, other: &Self) -> bool { + match (self, other) { + // Dubious .. both are infinity + (SpillCost::Infinite, SpillCost::Infinite) => false, + // finite < inf + (SpillCost::Finite(_), SpillCost::Infinite) => true, + // inf is not < finite + (SpillCost::Infinite, SpillCost::Finite(_)) => false, + // straightforward + (SpillCost::Finite(c1), SpillCost::Finite(c2)) => c1 < c2, + } + } + #[inline(always)] + pub fn add(&mut self, other: &Self) { + match (*self, other) { + (SpillCost::Finite(c1), SpillCost::Finite(c2)) => { + // The 10^18 limit above gives us a lot of headroom here, since max + // f32 is around 10^37. + *self = SpillCost::Finite(c1 + c2); + } + (_, _) => { + // All other cases produce an infinity. + *self = SpillCost::Infinite; + } + } + } +} + +//============================================================================= +// Representing and printing live ranges. These are represented by two +// different but closely related types, RealRange and VirtualRange. + +// RealRanges are live ranges for real regs (RealRegs). VirtualRanges are +// live ranges for virtual regs (VirtualRegs). VirtualRanges are the +// fundamental unit of allocation. +// +// A RealRange pairs a RealReg with a vector of RangeFragIxs in which it is +// live. The RangeFragIxs are indices into some vector of RangeFrags (a +// "fragment environment", 'fenv'), which is not specified here. They are +// sorted so as to give ascending order to the RangeFrags which they refer to. +// +// A VirtualRange pairs a VirtualReg with a vector of RangeFrags in which it +// is live. Same scheme as for a RealRange, except it avoids the overhead of +// having to indirect into the fragment environment. +// +// VirtualRanges also contain metrics: +// +// * `size` is the number of instructions in total spanned by the LR. It must +// not be zero. +// +// * `total cost` is an abstractified measure of the cost of the LR. Each +// basic block in which the range exists gives a contribution to the `total +// cost`, which is the number of times the register is mentioned in this +// block, multiplied by the estimated execution frequency for the block. +// +// * `spill_cost` is an abstractified measure of the cost of spilling the LR, +// and is the `total cost` divided by the `size`. 
The only constraint +// (w.r.t. correctness) is that normal LRs have a `Some` value, whilst +// `None` is reserved for live ranges created for spills and reloads and +// interpreted to mean "infinity". This is needed to guarantee that +// allocation can always succeed in the worst case, in which all of the +// original live ranges of the program are spilled. +// +// RealRanges don't carry any metrics info since we are not trying to allocate +// them. We merely need to work around them. +// +// I find it helpful to think of a live range, both RealRange and +// VirtualRange, as a "renaming equivalence class". That is, if you rename +// `reg` at some point inside `sorted_frags`, then you must rename *all* +// occurrences of `reg` inside `sorted_frags`, since otherwise the program will +// no longer work. + +#[derive(Clone)] +pub struct RealRange { + pub rreg: RealReg, + pub sorted_frags: SortedRangeFragIxs, + pub is_ref: bool, +} + +impl fmt::Debug for RealRange { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "(RR: {:?}{}, {:?})", + self.rreg, + if self.is_ref { " REF" } else { "" }, + self.sorted_frags + ) + } +} + +impl RealRange { + pub fn show_with_rru(&self, univ: &RealRegUniverse) -> String { + format!( + "(RR: {}{}, {:?})", + self.rreg.to_reg().show_with_rru(univ), + if self.is_ref { " REF" } else { "" }, + self.sorted_frags + ) + } +} + +// VirtualRanges are live ranges for virtual regs (VirtualRegs). This does +// carry metrics info and also the identity of the RealReg to which it +// eventually got allocated. (Or in the backtracking allocator, the identity +// of the RealReg to which it is *currently* assigned; that may be undone at +// some later point.) + +#[derive(Clone)] +pub struct VirtualRange { + pub vreg: VirtualReg, + pub rreg: Option<RealReg>, + pub sorted_frags: SortedRangeFrags, + pub is_ref: bool, + pub size: u16, + pub total_cost: u32, + pub spill_cost: SpillCost, // == total_cost / size +} + +impl VirtualRange { + pub fn overlaps(&self, other: &Self) -> bool { + self.sorted_frags.overlaps(&other.sorted_frags) + } +} + +impl fmt::Debug for VirtualRange { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!( + fmt, + "(VR: {:?}{},", + self.vreg, + if self.is_ref { " REF" } else { "" } + )?; + if self.rreg.is_some() { + write!(fmt, " -> {:?}", self.rreg.unwrap())?; + } + write!( + fmt, + " sz={}, tc={}, sc={:?}, {:?})", + self.size, self.total_cost, self.spill_cost, self.sorted_frags + ) + } +} + +//============================================================================= +// Some auxiliary/miscellaneous data structures that are useful: RegToRangesMaps + +// Mappings from RealRegs and VirtualRegs to the sets of RealRanges and VirtualRanges that +// belong to them. These are needed for BT's coalescing analysis and for the dataflow analysis +// that supports reftype handling. + +pub struct RegToRangesMaps { + // This maps RealReg indices to the set of RealRangeIxs for that RealReg. Valid indices are + // real register indices for all non-sanitised real regs; that is, + // 0 .. RealRegUniverse::allocable, for ".." having the Rust meaning. The Vecs of + // RealRangeIxs are duplicate-free. The SmallVec capacity of 6 was chosen after quite + // some profiling, of CL/x64/newBE compiling ZenGarden.wasm -- a huge input, with many + // relatively small functions. Profiling was performed in August 2020, using Valgrind/DHAT. 
+ pub rreg_to_rlrs_map: Vec</*real reg ix, */ SmallVec<[RealRangeIx; 6]>>,
+
+ // This maps VirtualReg indices to the set of VirtualRangeIxs for that VirtualReg. Valid
+ // indices are 0 .. Function::get_num_vregs(). For functions mostly translated from SSA,
+ // most VirtualRegs will have just one VirtualRange, and there are a lot of VirtualRegs in
+ // general. So SmallVec is a definite benefit here.
+ pub vreg_to_vlrs_map: Vec</*virtual reg ix, */ SmallVec<[VirtualRangeIx; 3]>>,
+
+ // As an optimisation heuristic for BT's coalescing analysis, these indicate which
+ // real/virtual registers have "many" `RangeFrag`s in their live ranges. For some
+ // definition of "many", perhaps "200 or more". This is not important for overall
+ // allocation result or correctness: it merely allows the coalescing analysis to switch
+ // between two search strategies, one of which is fast for regs with few `RangeFrag`s (the
+ // vast majority) and the other of which has better asymptotic behaviour for regs with many
+ // `RangeFrag`s (in order to keep out of trouble on some pathological inputs). These
+ // vectors are duplicate-free but the elements may be in an arbitrary order.
+ pub rregs_with_many_frags: Vec<u32 /*RealReg index*/>,
+ pub vregs_with_many_frags: Vec<u32 /*VirtualReg index*/>,
+
+ // And this indicates what the threshold is actually set to. A reg will be listed in
+ // `r/vregs_with_many_frags` if it has `many_frags_thresh` or more RangeFrags.
+ pub many_frags_thresh: usize,
+}
+
+//=============================================================================
+// Some auxiliary/miscellaneous data structures that are useful: MoveInfo
+
+// `MoveInfoElem` holds info about the two registers connected by a move: the source and destination
+// of the move, the insn performing the move, and the estimated execution frequency of the
+// containing block. In `MoveInfo`, the moves are not presented in any particular order, but
+// they are duplicate-free in that each such instruction will be listed only once.
+
+pub struct MoveInfoElem {
+ pub dst: Reg,
+ pub src: Reg,
+ pub iix: InstIx,
+ pub est_freq: u32,
+}
+
+pub struct MoveInfo {
+ pub moves: Vec<MoveInfoElem>,
+}
+
+// Something that can be either a VirtualRangeIx or a RealRangeIx, whilst still being 32 bits
+// (by stealing one bit from those spaces). Note that the resulting thing no longer denotes a
+// contiguous index space, and so it has a name that indicates it is an identifier rather than
+// an index.
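+
+// Illustrative sketch, not part of the upstream file: a round-trip check of
+// the tag-bit encoding described above, using the `RangeId` type defined just
+// below. It assumes the usual `RealRangeIx`/`VirtualRangeIx` u32 index
+// wrappers used elsewhere in this file.
+#[test]
+fn example_range_id_round_trip() {
+    let rr = RangeId::new_real(RealRangeIx::new(42));
+    let vr = RangeId::new_virtual(VirtualRangeIx::new(42));
+    // The tag bit distinguishes the two flavours even though they wrap the
+    // same underlying index.
+    assert!(rr.is_real() && !vr.is_real());
+    assert!(vr.is_virtual());
+    assert!(rr != vr);
+    // Decoding recovers the original indices.
+    assert!(rr.to_real().get() == 42);
+    assert!(vr.to_virtual().get() == 42);
+}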
+ +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)] +pub struct RangeId { + // 1 X--(31)--X is a RealRangeIx with value X--(31)--X + // 0 X--(31)--X is a VirtualRangeIx with value X--(31)--X + bits: u32, +} + +impl RangeId { + #[inline(always)] + pub fn new_real(rlrix: RealRangeIx) -> Self { + let n = rlrix.get(); + assert!(n <= 0x7FFF_FFFF); + Self { + bits: n | 0x8000_0000, + } + } + #[inline(always)] + pub fn new_virtual(vlrix: VirtualRangeIx) -> Self { + let n = vlrix.get(); + assert!(n <= 0x7FFF_FFFF); + Self { bits: n } + } + #[inline(always)] + pub fn is_real(self) -> bool { + self.bits & 0x8000_0000 != 0 + } + #[allow(dead_code)] + #[inline(always)] + pub fn is_virtual(self) -> bool { + self.bits & 0x8000_0000 == 0 + } + #[inline(always)] + pub fn to_real(self) -> RealRangeIx { + assert!(self.bits & 0x8000_0000 != 0); + RealRangeIx::new(self.bits & 0x7FFF_FFFF) + } + #[inline(always)] + pub fn to_virtual(self) -> VirtualRangeIx { + assert!(self.bits & 0x8000_0000 == 0); + VirtualRangeIx::new(self.bits) + } + #[inline(always)] + pub fn invalid_value() -> Self { + // Real, and inplausibly huge + Self { bits: 0xFFFF_FFFF } + } +} + +impl fmt::Debug for RangeId { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + if self.is_real() { + self.to_real().fmt(fmt) + } else { + self.to_virtual().fmt(fmt) + } + } +} + +//============================================================================= +// Test cases + +// sewardj 2020Mar04: these are commented out for now, as they no longer +// compile. They may be useful later though, once BT acquires an interval +// tree implementation for its CommitmentMap. + +/* +#[test] +fn test_sorted_frag_ranges() { + // Create a RangeFrag and RangeFragIx from two InstPoints. + fn gen_fix( + fenv: &mut TypedIxVec<RangeFragIx, RangeFrag>, first: InstPoint, + last: InstPoint, + ) -> RangeFragIx { + assert!(first <= last); + let res = RangeFragIx::new(fenv.len() as u32); + let frag = RangeFrag { + bix: BlockIx::new(123), + kind: RangeFragKind::Local, + first, + last, + count: 0, + }; + fenv.push(frag); + res + } + + fn get_range_frag( + fenv: &TypedIxVec<RangeFragIx, RangeFrag>, fix: RangeFragIx, + ) -> &RangeFrag { + &fenv[fix] + } + + // Structural equality, at least. Not equality in the sense of + // deferencing the contained RangeFragIxes. 
+ fn sorted_range_eq( + fixs1: &SortedRangeFragIxs, fixs2: &SortedRangeFragIxs, + ) -> bool { + if fixs1.frag_ixs.len() != fixs2.frag_ixs.len() { + return false; + } + for (mf1, mf2) in fixs1.frag_ixs.iter().zip(&fixs2.frag_ixs) { + if mf1 != mf2 { + return false; + } + } + true + } + + let iix3 = InstIx::new(3); + let iix4 = InstIx::new(4); + let iix5 = InstIx::new(5); + let iix6 = InstIx::new(6); + let iix7 = InstIx::new(7); + let iix10 = InstIx::new(10); + let iix12 = InstIx::new(12); + + let fp_3u = InstPoint::new_use(iix3); + let fp_3d = InstPoint::new_def(iix3); + + let fp_4u = InstPoint::new_use(iix4); + + let fp_5u = InstPoint::new_use(iix5); + let fp_5d = InstPoint::new_def(iix5); + + let fp_6u = InstPoint::new_use(iix6); + let fp_6d = InstPoint::new_def(iix6); + + let fp_7u = InstPoint::new_use(iix7); + let fp_7d = InstPoint::new_def(iix7); + + let fp_10u = InstPoint::new_use(iix10); + let fp_12u = InstPoint::new_use(iix12); + + let mut fenv = TypedIxVec::<RangeFragIx, RangeFrag>::new(); + + let fix_3u = gen_fix(&mut fenv, fp_3u, fp_3u); + let fix_3d = gen_fix(&mut fenv, fp_3d, fp_3d); + let fix_4u = gen_fix(&mut fenv, fp_4u, fp_4u); + let fix_3u_5u = gen_fix(&mut fenv, fp_3u, fp_5u); + let fix_3d_5d = gen_fix(&mut fenv, fp_3d, fp_5d); + let fix_3d_5u = gen_fix(&mut fenv, fp_3d, fp_5u); + let fix_3u_5d = gen_fix(&mut fenv, fp_3u, fp_5d); + let fix_6u_6d = gen_fix(&mut fenv, fp_6u, fp_6d); + let fix_7u_7d = gen_fix(&mut fenv, fp_7u, fp_7d); + let fix_10u = gen_fix(&mut fenv, fp_10u, fp_10u); + let fix_12u = gen_fix(&mut fenv, fp_12u, fp_12u); + + // Boundary checks for point ranges, 3u vs 3d + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3u), + get_range_frag(&fenv, fix_3u) + ) == Some(Ordering::Equal) + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3u), + get_range_frag(&fenv, fix_3d) + ) == Some(Ordering::Less) + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3d), + get_range_frag(&fenv, fix_3u) + ) == Some(Ordering::Greater) + ); + + // Boundary checks for point ranges, 3d vs 4u + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3d), + get_range_frag(&fenv, fix_3d) + ) == Some(Ordering::Equal) + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3d), + get_range_frag(&fenv, fix_4u) + ) == Some(Ordering::Less) + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_4u), + get_range_frag(&fenv, fix_3d) + ) == Some(Ordering::Greater) + ); + + // Partially overlapping + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3d_5d), + get_range_frag(&fenv, fix_3u_5u) + ) == None + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3u_5u), + get_range_frag(&fenv, fix_3d_5d) + ) == None + ); + + // Completely overlapping: one contained within the other + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3d_5u), + get_range_frag(&fenv, fix_3u_5d) + ) == None + ); + assert!( + cmp_range_frags( + get_range_frag(&fenv, fix_3u_5d), + get_range_frag(&fenv, fix_3d_5u) + ) == None + ); + + // Create a SortedRangeFragIxs from a bunch of RangeFrag indices + fn new_sorted_frag_ranges( + fenv: &TypedIxVec<RangeFragIx, RangeFrag>, frags: &Vec<RangeFragIx>, + ) -> SortedRangeFragIxs { + SortedRangeFragIxs::new(&frags, fenv) + } + + // Construction tests + // These fail due to overlap + //let _ = new_sorted_frag_ranges(&fenv, &vec![fix_3u_3u, fix_3u_3u]); + //let _ = new_sorted_frag_ranges(&fenv, &vec![fix_3u_5u, fix_3d_5d]); + + // These fail due to not being in order + //let _ = new_sorted_frag_ranges(&fenv, 
&vec![fix_4u_4u, fix_3u_3u]); + + // Simple non-overlap tests for add() + + let smf_empty = new_sorted_frag_ranges(&fenv, &vec![]); + let smf_6_7_10 = + new_sorted_frag_ranges(&fenv, &vec![fix_6u_6d, fix_7u_7d, fix_10u]); + let smf_3_12 = new_sorted_frag_ranges(&fenv, &vec![fix_3u, fix_12u]); + let smf_3_6_7_10_12 = new_sorted_frag_ranges( + &fenv, + &vec![fix_3u, fix_6u_6d, fix_7u_7d, fix_10u, fix_12u], + ); + let mut tmp; + + tmp = smf_empty.clone(); + tmp.add(&smf_empty, &fenv); + assert!(sorted_range_eq(&tmp, &smf_empty)); + + tmp = smf_3_12.clone(); + tmp.add(&smf_empty, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_12)); + + tmp = smf_empty.clone(); + tmp.add(&smf_3_12, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_12)); + + tmp = smf_6_7_10.clone(); + tmp.add(&smf_3_12, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_6_7_10_12)); + + tmp = smf_3_12.clone(); + tmp.add(&smf_6_7_10, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_6_7_10_12)); + + // Tests for can_add() + assert!(true == smf_empty.can_add(&smf_empty, &fenv)); + assert!(true == smf_empty.can_add(&smf_3_12, &fenv)); + assert!(true == smf_3_12.can_add(&smf_empty, &fenv)); + assert!(false == smf_3_12.can_add(&smf_3_12, &fenv)); + + assert!(true == smf_6_7_10.can_add(&smf_3_12, &fenv)); + + assert!(true == smf_3_12.can_add(&smf_6_7_10, &fenv)); + + // Tests for del() + let smf_6_7 = new_sorted_frag_ranges(&fenv, &vec![fix_6u_6d, fix_7u_7d]); + let smf_6_10 = new_sorted_frag_ranges(&fenv, &vec![fix_6u_6d, fix_10u]); + let smf_7 = new_sorted_frag_ranges(&fenv, &vec![fix_7u_7d]); + let smf_10 = new_sorted_frag_ranges(&fenv, &vec![fix_10u]); + + tmp = smf_empty.clone(); + tmp.del(&smf_empty, &fenv); + assert!(sorted_range_eq(&tmp, &smf_empty)); + + tmp = smf_3_12.clone(); + tmp.del(&smf_empty, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_12)); + + tmp = smf_empty.clone(); + tmp.del(&smf_3_12, &fenv); + assert!(sorted_range_eq(&tmp, &smf_empty)); + + tmp = smf_6_7_10.clone(); + tmp.del(&smf_3_12, &fenv); + assert!(sorted_range_eq(&tmp, &smf_6_7_10)); + + tmp = smf_3_12.clone(); + tmp.del(&smf_6_7_10, &fenv); + assert!(sorted_range_eq(&tmp, &smf_3_12)); + + tmp = smf_6_7_10.clone(); + tmp.del(&smf_6_7, &fenv); + assert!(sorted_range_eq(&tmp, &smf_10)); + + tmp = smf_6_7_10.clone(); + tmp.del(&smf_10, &fenv); + assert!(sorted_range_eq(&tmp, &smf_6_7)); + + tmp = smf_6_7_10.clone(); + tmp.del(&smf_7, &fenv); + assert!(sorted_range_eq(&tmp, &smf_6_10)); + + // Tests for can_add_if_we_first_del() + let smf_10_12 = new_sorted_frag_ranges(&fenv, &vec![fix_10u, fix_12u]); + + assert!( + true + == smf_6_7_10 + .can_add_if_we_first_del(/*d=*/ &smf_10_12, /*a=*/ &smf_3_12, &fenv) + ); + + assert!( + false + == smf_6_7_10 + .can_add_if_we_first_del(/*d=*/ &smf_10_12, /*a=*/ &smf_7, &fenv) + ); +} +*/ diff --git a/third_party/rust/regalloc/src/inst_stream.rs b/third_party/rust/regalloc/src/inst_stream.rs new file mode 100644 index 0000000000..620b9f8cf8 --- /dev/null +++ b/third_party/rust/regalloc/src/inst_stream.rs @@ -0,0 +1,664 @@ +use crate::checker::Inst as CheckerInst; +use crate::checker::{CheckerContext, CheckerErrors}; +use crate::data_structures::{ + BlockIx, InstIx, InstPoint, Point, RangeFrag, RealReg, RealRegUniverse, Reg, SpillSlot, + TypedIxVec, VirtualReg, Writable, +}; +use crate::{reg_maps::VrangeRegUsageMapper, Function, RegAllocError}; +use log::trace; + +use std::result::Result; + +//============================================================================= +// InstToInsert and InstToInsertAndPoint + 
+#[derive(Clone, Debug)] +pub(crate) enum InstToInsert { + Spill { + to_slot: SpillSlot, + from_reg: RealReg, + for_vreg: Option<VirtualReg>, + }, + Reload { + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + for_vreg: Option<VirtualReg>, + }, + Move { + to_reg: Writable<RealReg>, + from_reg: RealReg, + for_vreg: VirtualReg, + }, + /// A spillslot reassignment (to another vreg). In the edited instruction + /// stream, this is a nop, but this is needed for the checker to properly + /// track the symbolic values in slots. Always originates from a move + /// in the original user program whose source and dest vregs are both + /// spilled. + ChangeSpillSlotOwnership { + inst_ix: InstIx, + slot: SpillSlot, + from_reg: Reg, + to_reg: Reg, + }, +} + +impl InstToInsert { + pub(crate) fn construct<F: Function>(&self, f: &F) -> Option<F::Inst> { + match self { + &InstToInsert::Spill { + to_slot, + from_reg, + for_vreg, + } => Some(f.gen_spill(to_slot, from_reg, for_vreg)), + &InstToInsert::Reload { + to_reg, + from_slot, + for_vreg, + } => Some(f.gen_reload(to_reg, from_slot, for_vreg)), + &InstToInsert::Move { + to_reg, + from_reg, + for_vreg, + } => Some(f.gen_move(to_reg, from_reg, for_vreg)), + &InstToInsert::ChangeSpillSlotOwnership { .. } => None, + } + } + + pub(crate) fn to_checker_inst(&self) -> CheckerInst { + match self { + &InstToInsert::Spill { + to_slot, from_reg, .. + } => CheckerInst::Spill { + into: to_slot, + from: from_reg, + }, + &InstToInsert::Reload { + to_reg, from_slot, .. + } => CheckerInst::Reload { + into: to_reg, + from: from_slot, + }, + &InstToInsert::Move { + to_reg, from_reg, .. + } => CheckerInst::Move { + into: to_reg, + from: from_reg, + }, + &InstToInsert::ChangeSpillSlotOwnership { + inst_ix, + slot, + from_reg, + to_reg, + } => CheckerInst::ChangeSpillSlotOwnership { + inst_ix, + slot, + from_reg, + to_reg, + }, + } + } +} + +// ExtPoint is an extended version of Point. It plays no role in dataflow analysis or in the +// specification of live ranges. It exists only to describe where to place the "extra" +// spill/reload instructions required to make stackmap/reftype support work. If there was no +// need to support stackmaps/reftypes, ExtPoint would not be needed, and Point would be +// adequate. +// +// Recall that Point can denote 4 places within an instruction, with R < U < D < S: +// +// * R(eload): this is where any reload insns for the insn itself are +// considered to live. +// +// * U(se): this is where the insn is considered to use values from those of +// its register operands that appear in a Read or Modify role. +// +// * D(ef): this is where the insn is considered to define new values for +// those of its register operands that appear in a Write or Modify role. +// +// * S(pill): this is where any spill insns for the insn itself are considered +// to live. +// +// ExtPoint extends that to six places, by adding a new point in between Reload and Use, and one +// between Def and Spill, giving: R < SB < U < D < RA < S: +// +// * (R)eload: unchanged +// +// * SB (Spill before): at this point, reftyped regs will be spilled, if this insn is a safepoint +// +// * (U)se: unchanged +// +// * (D)ef: unchanged +// +// * RA (Reload after): at this point, reftyped regs spilled at SB will be reloaded, if needed, +// and if this insn is a safepoint +// +// * (S)pill: unchanged +// +// From this it can be seen that the SB and RA points are closest to the instruction "core" -- +// the U and D points. 
SB and RA describe places where reftyped regs must be spilled/reloaded +// around the core. Because the SB-RA range falls inside the R-S range, it means the the +// safepoint spill/reload instructions can be added after "normal" spill/reload instructions +// have been created, and it doesn't interact with the logic to create those "normal" +// spill/reload instructions. +// +// In the worst case scenario, a value could be reloaded at R, immediately spilled at SB, then +// possibly modified in memory at the safepoint proper, reloaded at RA, and spilled at S. That +// is considered to be an unlikely scenario, though. + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ExtPoint { + Reload = 0, + SpillBefore = 1, + Use = 2, + Def = 3, + ReloadAfter = 4, + Spill = 5, +} + +impl ExtPoint { + // Promote a Point to an ExtPoint + #[inline(always)] + pub fn from_point(pt: Point) -> Self { + match pt { + Point::Reload => ExtPoint::Reload, + Point::Use => ExtPoint::Use, + Point::Def => ExtPoint::Def, + Point::Spill => ExtPoint::Spill, + } + } +} + +// As the direct analogy to InstPoint, a InstExtPoint pairs an InstIx with an ExtPoint. In +// contrast to InstPoint, these aren't so performance critical, so there's no fancy bit-packed +// representation as there is for InstPoint. + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct InstExtPoint { + pub iix: InstIx, + pub extpt: ExtPoint, +} + +impl InstExtPoint { + #[inline(always)] + pub fn new(iix: InstIx, extpt: ExtPoint) -> Self { + Self { iix, extpt } + } + // Promote an InstPoint to an InstExtPoint + #[inline(always)] + pub fn from_inst_point(inst_pt: InstPoint) -> Self { + InstExtPoint { + iix: inst_pt.iix(), + extpt: ExtPoint::from_point(inst_pt.pt()), + } + } +} + +// So, finally, we can specify what we want: an instruction to insert, and a place to insert it. +#[derive(Debug)] +pub(crate) struct InstToInsertAndExtPoint { + pub(crate) inst: InstToInsert, + pub(crate) iep: InstExtPoint, +} + +impl InstToInsertAndExtPoint { + #[inline(always)] + pub(crate) fn new(inst: InstToInsert, iep: InstExtPoint) -> Self { + Self { inst, iep } + } +} + +//============================================================================= +// Apply all vreg->rreg mappings for the function's instructions, and run +// the checker if required. This also removes instructions that the core +// algorithm wants removed, by nop-ing them out. + +#[inline(never)] +fn map_vregs_to_rregs<F: Function>( + func: &mut F, + frag_map: Vec<(RangeFrag, VirtualReg, RealReg)>, + insts_to_add: &Vec<InstToInsertAndExtPoint>, + iixs_to_nop_out: &Vec<InstIx>, + reg_universe: &RealRegUniverse, + use_checker: bool, + safepoint_insns: &[InstIx], + stackmaps: &[Vec<SpillSlot>], + reftyped_vregs: &[VirtualReg], +) -> Result<(), CheckerErrors> { + // Set up checker state, if indicated by our configuration. + let mut checker: Option<CheckerContext> = None; + let mut insn_blocks: Vec<BlockIx> = vec![]; + if use_checker { + checker = Some(CheckerContext::new( + func, + reg_universe, + insts_to_add, + safepoint_insns, + stackmaps, + reftyped_vregs, + )); + insn_blocks.resize(func.insns().len(), BlockIx::new(0)); + for block_ix in func.blocks() { + for insn_ix in func.block_insns(block_ix) { + insn_blocks[insn_ix.get() as usize] = block_ix; + } + } + } + + // Sort the insn nop-out index list, so we can advance through it + // during the main loop. 
+ let mut iixs_to_nop_out = iixs_to_nop_out.clone(); + iixs_to_nop_out.sort(); + + // Make two copies of the fragment mapping, one sorted by the fragment start + // points (just the InstIx numbers, ignoring the Point), and one sorted by + // fragment end points. + let mut frag_maps_by_start = frag_map.clone(); + let mut frag_maps_by_end = frag_map; + + // -------- Edit the instruction stream -------- + frag_maps_by_start.sort_unstable_by(|(frag, _, _), (other_frag, _, _)| { + frag.first + .iix() + .partial_cmp(&other_frag.first.iix()) + .unwrap() + }); + + frag_maps_by_end.sort_unstable_by(|(frag, _, _), (other_frag, _, _)| { + frag.last.iix().partial_cmp(&other_frag.last.iix()).unwrap() + }); + + let mut cursor_starts = 0; + let mut cursor_ends = 0; + let mut cursor_nop = 0; + + // Allocate the "mapper" data structure that we update incrementally and + // pass to instruction reg-mapping routines to query. + let mut mapper = VrangeRegUsageMapper::new(func.get_num_vregs()); + + fn is_sane(frag: &RangeFrag) -> bool { + // "Normal" frag (unrelated to spilling). No normal frag may start or + // end at a .s or a .r point. + if frag.first.pt().is_use_or_def() + && frag.last.pt().is_use_or_def() + && frag.first.iix() <= frag.last.iix() + { + return true; + } + // A spill-related ("bridge") frag. There are three possibilities, + // and they correspond exactly to `BridgeKind`. + if frag.first.pt().is_reload() + && frag.last.pt().is_use() + && frag.last.iix() == frag.first.iix() + { + // BridgeKind::RtoU + return true; + } + if frag.first.pt().is_reload() + && frag.last.pt().is_spill() + && frag.last.iix() == frag.first.iix() + { + // BridgeKind::RtoS + return true; + } + if frag.first.pt().is_def() + && frag.last.pt().is_spill() + && frag.last.iix() == frag.first.iix() + { + // BridgeKind::DtoS + return true; + } + // None of the above apply. This RangeFrag is insane \o/ + false + } + + let mut last_insn_ix = -1; + for insn_ix in func.insn_indices() { + // Ensure instruction indices are in order. Logic below requires this. + assert!(insn_ix.get() as i32 > last_insn_ix); + last_insn_ix = insn_ix.get() as i32; + + // advance [cursorStarts, +num_starts) to the group for insn_ix + while cursor_starts < frag_maps_by_start.len() + && frag_maps_by_start[cursor_starts].0.first.iix() < insn_ix + { + cursor_starts += 1; + } + let mut num_starts = 0; + while cursor_starts + num_starts < frag_maps_by_start.len() + && frag_maps_by_start[cursor_starts + num_starts].0.first.iix() == insn_ix + { + num_starts += 1; + } + + // advance [cursorEnds, +num_ends) to the group for insn_ix + while cursor_ends < frag_maps_by_end.len() + && frag_maps_by_end[cursor_ends].0.last.iix() < insn_ix + { + cursor_ends += 1; + } + let mut num_ends = 0; + while cursor_ends + num_ends < frag_maps_by_end.len() + && frag_maps_by_end[cursor_ends + num_ends].0.last.iix() == insn_ix + { + num_ends += 1; + } + + // advance cursor_nop in the iixs_to_nop_out list. + while cursor_nop < iixs_to_nop_out.len() && iixs_to_nop_out[cursor_nop] < insn_ix { + cursor_nop += 1; + } + + let nop_this_insn = + cursor_nop < iixs_to_nop_out.len() && iixs_to_nop_out[cursor_nop] == insn_ix; + + // So now, fragMapsByStart[cursorStarts, +num_starts) are the mappings + // for fragments that begin at this instruction, in no particular + // order. And fragMapsByEnd[cursorEnd, +numEnd) are the RangeFragIxs + // for fragments that end at this instruction. + + // Sanity check all frags. In particular, reload and spill frags are + // heavily constrained. 
No functional effect. + for j in cursor_starts..cursor_starts + num_starts { + let frag = &frag_maps_by_start[j].0; + // "It really starts here, as claimed." + debug_assert!(frag.first.iix() == insn_ix); + debug_assert!(is_sane(&frag)); + } + for j in cursor_ends..cursor_ends + num_ends { + let frag = &frag_maps_by_end[j].0; + // "It really ends here, as claimed." + debug_assert!(frag.last.iix() == insn_ix); + debug_assert!(is_sane(frag)); + } + + // Here's the plan, conceptually (we don't actually clone the map): + // Update map for I.r: + // add frags starting at I.r + // no frags should end at I.r (it's a reload insn) + // Update map for I.u: + // add frags starting at I.u + // map_uses := map + // remove frags ending at I.u + // Update map for I.d: + // add frags starting at I.d + // map_defs := map + // remove frags ending at I.d + // Update map for I.s: + // no frags should start at I.s (it's a spill insn) + // remove frags ending at I.s + // apply map_uses/map_defs to I + + // To update the running mapper, we: + // - call `mapper.set_direct(vreg, Some(rreg))` with pre-insn starts. + // ("use"-map snapshot conceptually happens here) + // - call `mapper.set_overlay(vreg, None)` with pre-insn, post-reload ends. + // - call `mapper.set_overlay(vreg, Some(rreg))` with post-insn, pre-spill starts. + // ("post"-map snapshot conceptually happens here) + // - call `mapper.finish_overlay()`. + // + // - Use the map. `pre` and `post` are correct wrt the instruction. + // + // - call `mapper.merge_overlay()` to merge post-updates to main map. + // - call `mapper.set_direct(vreg, None)` with post-insn, post-spill + // ends. + + trace!("current mapper {:?}", mapper); + + // Update map for I.r: + // add frags starting at I.r + // no frags should end at I.r (it's a reload insn) + for j in cursor_starts..cursor_starts + num_starts { + let frag = &frag_maps_by_start[j].0; + if frag.first.pt().is_reload() { + //////// STARTS at I.r + mapper.set_direct(frag_maps_by_start[j].1, Some(frag_maps_by_start[j].2)); + } + } + + // Update map for I.u: + // add frags starting at I.u + // map_uses := map + // remove frags ending at I.u + for j in cursor_starts..cursor_starts + num_starts { + let frag = &frag_maps_by_start[j].0; + if frag.first.pt().is_use() { + //////// STARTS at I.u + mapper.set_direct(frag_maps_by_start[j].1, Some(frag_maps_by_start[j].2)); + } + } + for j in cursor_ends..cursor_ends + num_ends { + let frag = &frag_maps_by_end[j].0; + if frag.last.pt().is_use() { + //////// ENDS at I.U + mapper.set_overlay(frag_maps_by_end[j].1, None); + } + } + + trace!("maps after I.u {:?}", mapper); + + // Update map for I.d: + // add frags starting at I.d + // map_defs := map + // remove frags ending at I.d + for j in cursor_starts..cursor_starts + num_starts { + let frag = &frag_maps_by_start[j].0; + if frag.first.pt().is_def() { + //////// STARTS at I.d + mapper.set_overlay(frag_maps_by_start[j].1, Some(frag_maps_by_start[j].2)); + } + } + + mapper.finish_overlay(); + + trace!("maps after I.d {:?}", mapper); + + // If we have a checker, update it with spills, reloads, moves, and this + // instruction, while we have `map_uses` and `map_defs` available. + if let &mut Some(ref mut checker) = &mut checker { + let block_ix = insn_blocks[insn_ix.get() as usize]; + checker + .handle_insn(reg_universe, func, block_ix, insn_ix, &mapper) + .unwrap(); + } + + // Finally, we have map_uses/map_defs set correctly for this instruction. + // Apply it. 
+ if !nop_this_insn { + trace!("map_regs for {:?}", insn_ix); + let mut insn = func.get_insn_mut(insn_ix); + F::map_regs(&mut insn, &mapper); + trace!("mapped instruction: {:?}", insn); + } else { + // N.B. We nop out instructions as requested only *here*, after the + // checker call, because the checker must observe even elided moves + // (they may carry useful information about a move between two virtual + // locations mapped to the same physical location). + trace!("nop'ing out {:?}", insn_ix); + let nop = func.gen_zero_len_nop(); + let insn = func.get_insn_mut(insn_ix); + *insn = nop; + } + + mapper.merge_overlay(); + for j in cursor_ends..cursor_ends + num_ends { + let frag = &frag_maps_by_end[j].0; + if frag.last.pt().is_def() { + //////// ENDS at I.d + mapper.set_direct(frag_maps_by_end[j].1, None); + } + } + + // Update map for I.s: + // no frags should start at I.s (it's a spill insn) + // remove frags ending at I.s + for j in cursor_ends..cursor_ends + num_ends { + let frag = &frag_maps_by_end[j].0; + if frag.last.pt().is_spill() { + //////// ENDS at I.s + mapper.set_direct(frag_maps_by_end[j].1, None); + } + } + + // Update cursorStarts and cursorEnds for the next iteration + cursor_starts += num_starts; + cursor_ends += num_ends; + } + + debug_assert!(mapper.is_empty()); + + if use_checker { + checker.unwrap().run() + } else { + Ok(()) + } +} + +//============================================================================= +// Take the real-register-only code created by `map_vregs_to_rregs` and +// interleave extra instructions (spills, reloads and moves) that the core +// algorithm has asked us to add. + +#[inline(never)] +pub(crate) fn add_spills_reloads_and_moves<F: Function>( + func: &mut F, + safepoint_insns: &Vec<InstIx>, + mut insts_to_add: Vec<InstToInsertAndExtPoint>, +) -> Result< + ( + Vec<F::Inst>, + TypedIxVec<BlockIx, InstIx>, + TypedIxVec<InstIx, InstIx>, + Vec<InstIx>, + ), + String, +> { + // Construct the final code by interleaving the mapped code with the the + // spills, reloads and moves that we have been requested to insert. To do + // that requires having the latter sorted by InstPoint. + // + // We also need to examine and update Func::blocks. This is assumed to + // be arranged in ascending order of the Block::start fields. + // + // Also, if the client requested stackmap creation, then `safepoint_insns` will be + // non-empty, and we will have to return a vector of the same length, that indicates the + // location of each safepoint insn in the final code. `safepoint_insns` is assumed to be + // sorted in ascending order and duplicate-free. + // + // Linear scan relies on the sort being stable here, so make sure to not + // use an unstable sort. See the comment in `resolve_moves_across blocks` + // in linear scan's code. 
+ + insts_to_add.sort_by_key(|to_add| to_add.iep.clone()); + + let mut cur_inst_to_add = 0; + let mut cur_block = BlockIx::new(0); + + let mut insns: Vec<F::Inst> = vec![]; + let mut target_map: TypedIxVec<BlockIx, InstIx> = TypedIxVec::new(); + + let mut new_to_old_insn_map: TypedIxVec<InstIx, InstIx> = TypedIxVec::new(); + target_map.reserve(func.blocks().len()); + new_to_old_insn_map.reserve(func.insn_indices().len() + insts_to_add.len()); + + // Index in `safepoint_insns` of the next safepoint insn we will encounter + let mut next_safepoint_insn_index = 0; + let mut new_safepoint_insns = Vec::<InstIx>::new(); + new_safepoint_insns.reserve(safepoint_insns.len()); + + for iix in func.insn_indices() { + // Is `iix` the first instruction in a block? Meaning, are we + // starting a new block? + debug_assert!(cur_block.get() < func.blocks().len() as u32); + if func.block_insns(cur_block).start() == iix { + assert!(cur_block.get() == target_map.len()); + target_map.push(InstIx::new(insns.len() as u32)); + } + + // Copy to the output vector, the first the extra insts that are to be placed at the + // reload point of `iix`, and then the extras for the spill-before point of `iix`. + while cur_inst_to_add < insts_to_add.len() + && insts_to_add[cur_inst_to_add].iep <= InstExtPoint::new(iix, ExtPoint::SpillBefore) + { + if let Some(inst) = insts_to_add[cur_inst_to_add].inst.construct(func) { + insns.push(inst); + new_to_old_insn_map.push(InstIx::invalid_value()); + } + cur_inst_to_add += 1; + } + + // Copy the inst at `iix` itself + if next_safepoint_insn_index < safepoint_insns.len() + && iix == safepoint_insns[next_safepoint_insn_index] + { + new_safepoint_insns.push(InstIx::new(insns.len() as u32)); + next_safepoint_insn_index += 1; + } + new_to_old_insn_map.push(iix); + insns.push(func.get_insn(iix).clone()); + + // And copy first, the extra insts that are to be placed at the reload-after point + // of `iix`, followed by those to be placed at the spill point of `iix`. + while cur_inst_to_add < insts_to_add.len() + && insts_to_add[cur_inst_to_add].iep <= InstExtPoint::new(iix, ExtPoint::Spill) + { + if let Some(inst) = insts_to_add[cur_inst_to_add].inst.construct(func) { + insns.push(inst); + new_to_old_insn_map.push(InstIx::invalid_value()); + } + cur_inst_to_add += 1; + } + + // Is `iix` the last instruction in a block? 
+ if iix == func.block_insns(cur_block).last() { + debug_assert!(cur_block.get() < func.blocks().len() as u32); + cur_block = cur_block.plus(1); + } + } + + debug_assert!(cur_inst_to_add == insts_to_add.len()); + debug_assert!(cur_block.get() == func.blocks().len() as u32); + debug_assert!(next_safepoint_insn_index == safepoint_insns.len()); + debug_assert!(new_safepoint_insns.len() == safepoint_insns.len()); + + Ok((insns, target_map, new_to_old_insn_map, new_safepoint_insns)) +} + +//============================================================================= +// Main function + +#[inline(never)] +pub(crate) fn edit_inst_stream<F: Function>( + func: &mut F, + safepoint_insns: &Vec<InstIx>, + insts_to_add: Vec<InstToInsertAndExtPoint>, + iixs_to_nop_out: &Vec<InstIx>, + frag_map: Vec<(RangeFrag, VirtualReg, RealReg)>, + reg_universe: &RealRegUniverse, + use_checker: bool, + stackmaps: &[Vec<SpillSlot>], + reftyped_vregs: &[VirtualReg], +) -> Result< + ( + Vec<F::Inst>, + TypedIxVec<BlockIx, InstIx>, + TypedIxVec<InstIx, InstIx>, + Vec<InstIx>, + ), + RegAllocError, +> { + map_vregs_to_rregs( + func, + frag_map, + &insts_to_add, + iixs_to_nop_out, + reg_universe, + use_checker, + &safepoint_insns[..], + stackmaps, + reftyped_vregs, + ) + .map_err(|e| RegAllocError::RegChecker(e))?; + add_spills_reloads_and_moves(func, safepoint_insns, insts_to_add) + .map_err(|e| RegAllocError::Other(e)) +} diff --git a/third_party/rust/regalloc/src/lib.rs b/third_party/rust/regalloc/src/lib.rs new file mode 100644 index 0000000000..66216eb7e4 --- /dev/null +++ b/third_party/rust/regalloc/src/lib.rs @@ -0,0 +1,637 @@ +//! Main file / top-level module for regalloc library. +//! +//! We have tried hard to make the library's interface as simple as possible, +//! yet flexible enough that the allocators it implements can provide good +//! quality allocations in reasonable time. Nevertheless, there is still +//! significant semantic complexity in parts of the interface. If you intend +//! to use this library in your own code, you would be well advised to read +//! the comments in this file very carefully. + +// Make the analysis module public for fuzzing. +#[cfg(feature = "fuzzing")] +pub mod analysis_main; +#[cfg(not(feature = "fuzzing"))] +mod analysis_main; + +mod analysis_control_flow; +mod analysis_data_flow; +mod analysis_reftypes; +mod avl_tree; +mod bt_coalescing_analysis; +mod bt_commitment_map; +mod bt_main; +mod bt_spillslot_allocator; +mod bt_vlr_priority_queue; +mod checker; +mod data_structures; +mod inst_stream; +mod linear_scan; +mod pretty_print; +mod reg_maps; +mod snapshot; +mod sparse_set; +mod union_find; + +use log::{info, log_enabled, Level}; +use std::default; +use std::{borrow::Cow, fmt}; + +// Stuff that is defined by the library + +// Pretty-printing utilities. +pub use crate::pretty_print::*; + +// Sets and maps of things. We can refine these later; but for now the +// interface needs some way to speak about them, so let's use the +// library-provided versions. + +pub use crate::data_structures::Map; +pub use crate::data_structures::Set; + +// Register classes + +pub use crate::data_structures::RegClass; + +// Registers, both real and virtual, and ways to create them + +pub use crate::data_structures::Reg; + +pub use crate::data_structures::RealReg; +pub use crate::data_structures::VirtualReg; + +pub use crate::data_structures::Writable; + +pub use crate::data_structures::NUM_REG_CLASSES; + +// Spill slots + +pub use crate::data_structures::SpillSlot; + +// The "register universe". 
This
+// describes the registers available to the
+// allocator. There are very strict requirements on the structure of the
+// universe. If you fail to observe these requirements, either the allocator
+// itself, or the resulting code, will fail in mysterious ways, and your life
+// will be miserable while you try to figure out what happened. There are
+// lower level details on the definition of RealRegUniverse which you also
+// need to take note of. The overall contract is as follows.
+//
+// === (1) === Basic structure ===
+//
+// A "register universe" is a read-only structure that contains all
+// information about real registers on a given host. For each register class
+// (RegClass) supported by the target, the universe must provide a vector of
+// registers that the allocator may use.
+//
+// The universe may also list other registers that the incoming
+// virtual-registerised code may use, but which are not available for use by
+// the allocator. Indeed, the universe *must* list *all* registers that will
+// ever be mentioned in the incoming code. Failure to do so will cause the
+// allocator's analysis phase to return an error.
+//
+// === (2) === Ordering of registers within each class ===
+//
+// The ordering of available registers within these vectors does not affect
+// the correctness of the final allocation. However, it will affect the
+// quality of the final allocation. Clients are recommended to list, for each
+// class, the callee-saved registers first, and the caller-saved registers
+// after that. The currently supported allocation algorithms (Backtracking
+// and LinearScan) will try to use the first available registers in each
+// class, that is to say, callee-saved ones first. The purpose of this is to
+// try and minimise spilling around calls by avoiding use of caller-saved ones
+// if possible.
+//
+// There is a twist here, however. The abovementioned heuristic works well
+// for non-leaf functions (functions that contain at least one call). But for
+// leaf functions, we would prefer to use the caller-saved registers first,
+// since doing so has potential to minimise the number of registers that must
+// be saved/restored in the prologue and epilogue. Presently there is no way
+// to tell this interface that the function is a leaf function, and so the
+// only way to get optimal code in this case is to present a universe with the
+// registers listed in the opposite order.
+//
+// This is of course inconvenient for the caller, since it requires
+// maintenance of two separate universes. In the future we will add a boolean
+// parameter to the top level function `allocate_registers` that indicates
+// whether or not the function is a leaf function.
+//
+// === (3) === The "suggested scratch register" ===
+//
+// Some allocation algorithms, particularly linear-scan, may need to have a
+// scratch register available for their own use. The register universe must
+// nominate a scratch register in each class, specified in
+// RealRegUniverse::allocable_by_class[..]::Some(suggested_scratch). The
+// choice of scratch register is influenced by the architecture, the ABI, and
+// client-side fixed-use register conventions. The rules are as follows:
+//
+// (1) For each class, the universe must offer a reserved register.
+//
+// (2) The reserved register may not have any implied-by-the-architecture
+// reads/modifies/writes for any instruction in the vcode. Unfortunately
+// there is no easy way for this library to check that.
+// +// (3) The reserved register must not have any reads or modifies by any +// instruction in the vcode. In other words, it must not be handed to +// either the `add_use` or `add_mod` function of the `RegUsageCollector` +// that is presented to the client's `get_regs` function. If any such +// mention is detected, the library will return an error. +// +// (4) The reserved reg may be mentioned as written by instructions in the +// vcode, though -- in other words it may be handed to `add_def`. The +// library will tolerate and correctly handle that. However, because no +// vcode instruction may read or modify the reserved register, all such +// writes are "dead". This in turn guarantees that the allocator can, if +// it wants, change the value in it at any time, without changing the +// behaviour of the final generated code. +// +// Currently, the LinearScan algorithm may use the reserved registers. The +// Backtracking algorithm will ignore the hints and treat them as "normal" +// allocatable registers. + +pub use crate::data_structures::RealRegUniverse; +pub use crate::data_structures::RegClassInfo; + +// A structure for collecting information about which registers each +// instruction uses. + +pub use crate::data_structures::RegUsageCollector; + +/// A trait for providing mapping results for a given instruction. +/// +/// This provides virtual to real register mappings for every mention in an instruction: use, mod +/// or def. The main purpose of this trait is to be used when re-writing the instruction stream +/// after register allocation happened; see also `Function::map_regs`. +pub trait RegUsageMapper: fmt::Debug { + /// Return the `RealReg` if mapped, or `None`, for `vreg` occuring as a use + /// on the current instruction. + fn get_use(&self, vreg: VirtualReg) -> Option<RealReg>; + + /// Return the `RealReg` if mapped, or `None`, for `vreg` occuring as a def + /// on the current instruction. + fn get_def(&self, vreg: VirtualReg) -> Option<RealReg>; + + /// Return the `RealReg` if mapped, or `None`, for a `vreg` occuring as a + /// mod on the current instruction. + fn get_mod(&self, vreg: VirtualReg) -> Option<RealReg>; +} + +// TypedIxVector, so that the interface can speak about vectors of blocks and +// instructions. + +pub use crate::data_structures::TypedIxVec; +pub use crate::data_structures::{BlockIx, InstIx, Range}; + +/// A trait defined by the regalloc client to provide access to its +/// machine-instruction / CFG representation. +pub trait Function { + /// Regalloc is parameterized on F: Function and so can use the projected + /// type F::Inst. + type Inst: Clone + fmt::Debug; + + // ------------- + // CFG traversal + // ------------- + + /// Allow access to the underlying vector of instructions. + fn insns(&self) -> &[Self::Inst]; + + /// Get all instruction indices as an iterable range. + fn insn_indices(&self) -> Range<InstIx> { + Range::new(InstIx::new(0), self.insns().len()) + } + + /// Allow mutable access to the underlying vector of instructions. + fn insns_mut(&mut self) -> &mut [Self::Inst]; + + /// Get an instruction with a type-safe InstIx index. + fn get_insn(&self, insn: InstIx) -> &Self::Inst; + + /// Get a mutable borrow of an instruction with the given type-safe InstIx + /// index. + fn get_insn_mut(&mut self, insn: InstIx) -> &mut Self::Inst; + + /// Allow iteration over basic blocks (in instruction order). + fn blocks(&self) -> Range<BlockIx>; + + /// Get the index of the entry block. 
+ fn entry_block(&self) -> BlockIx; + + /// Provide the range of instruction indices contained in each block. + fn block_insns(&self, block: BlockIx) -> Range<InstIx>; + + /// Get CFG successors for a given block. + fn block_succs(&self, block: BlockIx) -> Cow<[BlockIx]>; + + /// Determine whether an instruction is a return instruction. + fn is_ret(&self, insn: InstIx) -> bool; + + /// Determine whether an instruction should be considered while computing + /// the set of registers that need to be saved/restored in the function's + /// prologue/epilogue, that is, the registers returned in + /// `clobbered_registers` in `RegAllocResult`. computation. Only + /// instructions for which this function returns `true` will be used to + /// compute that set. + /// + /// One reason that a client might *not* want an instruction to be included + /// would be if it can handle the clobbers some other way: for example, + /// ABI-support code might exclude call instructions' defs and mods from the + /// clobber set, because (given the callee has same ABI as the caller) the + /// registers possibly written by the callee are all registers that the + /// caller is also allowed to clobber (not save/restore in + /// prologue/epilogue). + fn is_included_in_clobbers(&self, _insn: &Self::Inst) -> bool { + // Default impl includes all instructions. + true + } + + // -------------------------- + // Instruction register slots + // -------------------------- + + /// Add to `collector` the used, defined, and modified registers for an + /// instruction. + fn get_regs(insn: &Self::Inst, collector: &mut RegUsageCollector); + + /// Map each register slot through a virtual-to-real mapping indexed + /// by virtual register. The two separate maps in `maps.pre` and + /// `maps.post` provide the mapping to use for uses (which semantically + /// occur just prior to the instruction's effect) and defs (which + /// semantically occur just after the instruction's effect). Regs that were + /// "modified" can use either map; the vreg should be the same in both. + /// + /// Note that this does not take a `self`, because we want to allow the + /// regalloc to have a mutable borrow of an insn (which borrows the whole + /// Function in turn) outstanding while calling this. + fn map_regs<RUM: RegUsageMapper>(insn: &mut Self::Inst, maps: &RUM); + + /// Allow the regalloc to query whether this is a move. Returns (dst, src). + fn is_move(&self, insn: &Self::Inst) -> Option<(Writable<Reg>, Reg)>; + + /// Get the precise number of `VirtualReg` in use in this function, to allow preallocating data + /// structures. This number *must* be a correct lower-bound, otherwise invalid index failures + /// may happen; it is of course better if it is exact. + fn get_num_vregs(&self) -> usize; + + // -------------- + // Spills/reloads + // -------------- + + /// How many logical spill slots does the given regclass require? E.g., on a + /// 64-bit machine, spill slots may nominally be 64-bit words, but a 128-bit + /// vector value will require two slots. The regalloc will always align on + /// this size. + /// + /// This passes the associated virtual register to the client as well, + /// because the way in which we spill a real register may depend on the + /// value that we are using it for. E.g., if a machine has V128 registers + /// but we also use them for F32 and F64 values, we may use a different + /// store-slot size and smaller-operand store/load instructions for an F64 + /// than for a true V128. 
+ fn get_spillslot_size(&self, regclass: RegClass, for_vreg: VirtualReg) -> u32; + + /// Generate a spill instruction for insertion into the instruction + /// sequence. The associated virtual register (whose value is being spilled) + /// is passed, if it exists, so that the client may make decisions about the + /// instruction to generate based on the type of value in question. Because + /// the register allocator will insert spill instructions at arbitrary points, + /// the returned instruction here must not modify the machine's condition codes. + fn gen_spill( + &self, + to_slot: SpillSlot, + from_reg: RealReg, + for_vreg: Option<VirtualReg>, + ) -> Self::Inst; + + /// Generate a reload instruction for insertion into the instruction + /// sequence. The associated virtual register (whose value is being loaded) + /// is passed as well, if it exists. The returned instruction must not modify + /// the machine's condition codes. + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + from_slot: SpillSlot, + for_vreg: Option<VirtualReg>, + ) -> Self::Inst; + + /// Generate a register-to-register move for insertion into the instruction + /// sequence. The associated virtual register is passed as well. The + /// returned instruction must not modify the machine's condition codes. + fn gen_move( + &self, + to_reg: Writable<RealReg>, + from_reg: RealReg, + for_vreg: VirtualReg, + ) -> Self::Inst; + + /// Generate an instruction which is a no-op and has zero length. + fn gen_zero_len_nop(&self) -> Self::Inst; + + /// Try to alter an existing instruction to use a value directly in a + /// spillslot (accessing memory directly) instead of the given register. May + /// be useful on ISAs that have mem/reg ops, like x86. + /// + /// Note that this is not *quite* just fusing a load with the op; if the + /// value is def'd or modified, it should be written back to the spill slot + /// as well. In other words, it is just using the spillslot as if it were a + /// real register, for reads and/or writes. + /// + /// FIXME JRS 2020Feb06: state precisely the constraints on condition code + /// changes. + fn maybe_direct_reload( + &self, + insn: &Self::Inst, + reg: VirtualReg, + slot: SpillSlot, + ) -> Option<Self::Inst>; + + // ---------------------------------------------------------- + // Function liveins, liveouts, and direct-mode real registers + // ---------------------------------------------------------- + + /// Return the set of registers that should be considered live at the + /// beginning of the function. This is semantically equivalent to an + /// instruction at the top of the entry block def'ing all registers in this + /// set. + fn func_liveins(&self) -> Set<RealReg>; + + /// Return the set of registers that should be considered live at the + /// end of the function (after every return instruction). This is + /// semantically equivalent to an instruction at each block with no successors + /// that uses each of these registers. + fn func_liveouts(&self) -> Set<RealReg>; +} + +/// The result of register allocation. Note that allocation can fail! +pub struct RegAllocResult<F: Function> { + /// A new sequence of instructions with all register slots filled with real + /// registers, and spills/fills/moves possibly inserted (and unneeded moves + /// elided). + pub insns: Vec<F::Inst>, + + /// Basic-block start indices for the new instruction list, indexed by the + /// original basic block indices. May be used by the client to, e.g., remap + /// branch targets appropriately. 
+ pub target_map: TypedIxVec<BlockIx, InstIx>, + + /// Full mapping from new instruction indices to original instruction + /// indices. May be needed by the client to, for example, update metadata + /// such as debug/source-location info as the instructions are spliced + /// and reordered. + /// + /// Each entry is an `InstIx`, but may be `InstIx::invalid_value()` if the + /// new instruction at this new index was inserted by the allocator + /// (i.e., if it is a load, spill or move instruction). + pub orig_insn_map: TypedIxVec</* new */ InstIx, /* orig */ InstIx>, + + /// Which real registers were overwritten? This will contain all real regs + /// that appear as defs or modifies in register slots of the output + /// instruction list. This will only list registers that are available to + /// the allocator. If one of the instructions clobbers a register which + /// isn't available to the allocator, it won't be mentioned here. + pub clobbered_registers: Set<RealReg>, + + /// How many spill slots were used? + pub num_spill_slots: u32, + + /// Block annotation strings, for debugging. Requires requesting in the + /// call to `allocate_registers`. Creating of these annotations is + /// potentially expensive, so don't request them if you don't need them. + pub block_annotations: Option<TypedIxVec<BlockIx, Vec<String>>>, + + /// If stackmap support was requested: one stackmap for each of the safepoint instructions + /// declared. Otherwise empty. + pub stackmaps: Vec<Vec<SpillSlot>>, + + /// If stackmap support was requested: one InstIx for each safepoint instruction declared, + /// indicating the corresponding location in the final instruction stream. Otherwise empty. + pub new_safepoint_insns: Vec<InstIx>, +} + +/// A choice of register allocation algorithm to run. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum AlgorithmWithDefaults { + Backtracking, + LinearScan, +} + +pub use crate::analysis_main::AnalysisError; +pub use crate::checker::{CheckerError, CheckerErrors}; + +/// An error from the register allocator. +#[derive(Clone, Debug)] +pub enum RegAllocError { + OutOfRegisters(RegClass), + MissingSuggestedScratchReg(RegClass), + Analysis(AnalysisError), + RegChecker(CheckerErrors), + Other(String), +} + +impl fmt::Display for RegAllocError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +pub use crate::bt_main::BacktrackingOptions; +pub use crate::linear_scan::LinearScanOptions; + +#[derive(Clone)] +pub enum Algorithm { + LinearScan(LinearScanOptions), + Backtracking(BacktrackingOptions), +} + +impl fmt::Debug for Algorithm { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Algorithm::LinearScan(opts) => write!(fmt, "{:?}", opts), + Algorithm::Backtracking(opts) => write!(fmt, "{:?}", opts), + } + } +} + +/// Tweakable options shared by all the allocators. +#[derive(Clone)] +pub struct Options { + /// Should the register allocator check that its results are valid? This adds runtime to the + /// compiler, so this is disabled by default. + pub run_checker: bool, + + /// Which algorithm should be used for register allocation? By default, selects backtracking, + /// which is slower to compile but creates code of better quality. 
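+ ///
+ /// (Sketch, not from the original docs: `Options { run_checker: true,
+ /// algorithm: Algorithm::LinearScan(Default::default()) }` selects the
+ /// faster, lower-quality allocator with result checking enabled.)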
+ pub algorithm: Algorithm, +} + +impl default::Default for Options { + fn default() -> Self { + Self { + run_checker: false, + algorithm: Algorithm::Backtracking(Default::default()), + } + } +} + +impl fmt::Debug for Options { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "checker: {:?}, algorithm: {:?}", + self.run_checker, self.algorithm + ) + } +} + +/// A structure with which callers can request stackmap information. +pub struct StackmapRequestInfo { + /// The register class that holds reftypes. This may only be RegClass::I32 or + /// RegClass::I64, and it must equal the word size of the target architecture. + pub reftype_class: RegClass, + + /// The virtual regs that hold reftyped values. These must be provided in ascending order + /// of register index and be duplicate-free. They must have class `reftype_class`. + pub reftyped_vregs: Vec<VirtualReg>, + + /// The indices of instructions for which the allocator will construct stackmaps. These + /// must be provided in ascending order and be duplicate-free. The specified instructions + /// may not be coalescable move instructions (as the allocator may remove those) and they + /// may not modify any register carrying a reftyped value (they may "def" or "use" them, + /// though). The reason is that, at a safepoint, the client's garbage collector may change + /// the values of all live references, so it would be meaningless for a safepoint + /// instruction also to attempt to do that -- we'd end up with two competing new values. + pub safepoint_insns: Vec<InstIx>, +} + +/// Allocate registers for a function's code, given a universe of real registers that we are +/// allowed to use. Optionally, stackmap support may be requested. +/// +/// The control flow graph must not contain any critical edges, that is, any edge coming from a +/// block with multiple successors must not flow into a block with multiple predecessors. The +/// embedder must have split critical edges before handing over the function to this function. +/// Otherwise, an error will be returned. +/// +/// Allocation may succeed, returning a `RegAllocResult` with the new instruction sequence, or +/// it may fail, returning an error. +/// +/// Runtime options can be passed to the allocators, through the use of [Options] for options +/// common to all the backends. The choice of algorithm is done by passing a given [Algorithm] +/// instance, with options tailored for each algorithm. +#[inline(never)] +pub fn allocate_registers_with_opts<F: Function>( + func: &mut F, + rreg_universe: &RealRegUniverse, + stackmap_info: Option<&StackmapRequestInfo>, + opts: Options, +) -> Result<RegAllocResult<F>, RegAllocError> { + info!(""); + info!("================ regalloc.rs: BEGIN function ================"); + if log_enabled!(Level::Info) { + info!("with options: {:?}", opts); + let strs = rreg_universe.show(); + info!("using RealRegUniverse:"); + for s in strs { + info!(" {}", s); + } + } + // If stackmap support has been requested, perform some initial sanity checks. 
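+ // (Orientation note for the checks below: stackmaps are rejected for Linear
+ // Scan; the reftype class must be I32 or I64; reftyped vregs must be of that
+ // class, in range and strictly ascending; and safepoint insns must be in
+ // range, strictly ascending and not move instructions.)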
+ if let Some(&StackmapRequestInfo { + reftype_class, + ref reftyped_vregs, + ref safepoint_insns, + }) = stackmap_info + { + if let Algorithm::LinearScan(_) = opts.algorithm { + return Err(RegAllocError::Other( + "stackmap request: not currently available for Linear Scan".to_string(), + )); + } + if reftype_class != RegClass::I64 && reftype_class != RegClass::I32 { + return Err(RegAllocError::Other( + "stackmap request: invalid reftype_class".to_string(), + )); + } + let num_avail_vregs = func.get_num_vregs(); + for i in 0..reftyped_vregs.len() { + let vreg = &reftyped_vregs[i]; + if vreg.get_class() != reftype_class { + return Err(RegAllocError::Other( + "stackmap request: invalid vreg class".to_string(), + )); + } + if vreg.get_index() >= num_avail_vregs { + return Err(RegAllocError::Other( + "stackmap request: out of range vreg".to_string(), + )); + } + if i > 0 && reftyped_vregs[i - 1].get_index() >= vreg.get_index() { + return Err(RegAllocError::Other( + "stackmap request: non-ascending vregs".to_string(), + )); + } + } + let num_avail_insns = func.insns().len(); + for i in 0..safepoint_insns.len() { + let safepoint_iix = safepoint_insns[i]; + if safepoint_iix.get() as usize >= num_avail_insns { + return Err(RegAllocError::Other( + "stackmap request: out of range safepoint insn".to_string(), + )); + } + if i > 0 && safepoint_insns[i - 1].get() >= safepoint_iix.get() { + return Err(RegAllocError::Other( + "stackmap request: non-ascending safepoint insns".to_string(), + )); + } + if func.is_move(func.get_insn(safepoint_iix)).is_some() { + return Err(RegAllocError::Other( + "stackmap request: safepoint insn is a move insn".to_string(), + )); + } + } + // We can't check here that reftyped regs are not changed by safepoint insns. That is + // done deep in the stackmap creation logic, for BT in `get_stackmap_artefacts_at`. + } + + let run_checker = opts.run_checker; + let res = match &opts.algorithm { + Algorithm::Backtracking(opts) => { + bt_main::alloc_main(func, rreg_universe, stackmap_info, run_checker, opts) + } + Algorithm::LinearScan(opts) => linear_scan::run(func, rreg_universe, run_checker, opts), + }; + info!("================ regalloc.rs: END function ================"); + res +} + +/// Allocate registers for a function's code, given a universe of real registers that we are +/// allowed to use. +/// +/// The control flow graph must not contain any critical edges, that is, any edge coming from a +/// block with multiple successors must not flow into a block with multiple predecessors. The +/// embedder must have split critical edges before handing over the function to this function. +/// Otherwise, an error will be returned. +/// +/// Allocate may succeed, returning a `RegAllocResult` with the new instruction sequence, or it may +/// fail, returning an error. +/// +/// This is a convenient function that uses standard options for the allocator, according to the +/// selected algorithm. 
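+ ///
+ /// A minimal call sketch (illustrative; `my_func` and `rru` are hypothetical
+ /// client values: a `Function` implementation and the target's
+ /// `RealRegUniverse`):
+ ///
+ /// ```ignore
+ /// let result = allocate_registers(
+ ///     &mut my_func,
+ ///     &rru,
+ ///     None, // no stackmap support requested
+ ///     AlgorithmWithDefaults::Backtracking,
+ /// )?;
+ /// // `result.insns` now holds the rewritten instruction stream.
+ /// ```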
+#[inline(never)] +pub fn allocate_registers<F: Function>( + func: &mut F, + rreg_universe: &RealRegUniverse, + stackmap_info: Option<&StackmapRequestInfo>, + algorithm: AlgorithmWithDefaults, +) -> Result<RegAllocResult<F>, RegAllocError> { + let algorithm = match algorithm { + AlgorithmWithDefaults::Backtracking => Algorithm::Backtracking(Default::default()), + AlgorithmWithDefaults::LinearScan => Algorithm::LinearScan(Default::default()), + }; + let opts = Options { + algorithm, + ..Default::default() + }; + allocate_registers_with_opts(func, rreg_universe, stackmap_info, opts) +} + +// Facilities to snapshot regalloc inputs and reproduce them in regalloc.rs. +pub use crate::snapshot::IRSnapshot; diff --git a/third_party/rust/regalloc/src/linear_scan/analysis.rs b/third_party/rust/regalloc/src/linear_scan/analysis.rs new file mode 100644 index 0000000000..9e109ef681 --- /dev/null +++ b/third_party/rust/regalloc/src/linear_scan/analysis.rs @@ -0,0 +1,853 @@ +use super::{FixedInterval, IntId, Intervals, Mention, MentionMap, VirtualInterval}; +use crate::{ + analysis_control_flow::{CFGInfo, InstIxToBlockIxMap}, + analysis_data_flow::{ + calc_def_and_use, calc_livein_and_liveout, get_sanitized_reg_uses_for_func, reg_ix_to_reg, + reg_to_reg_ix, + }, + data_structures::{BlockIx, InstPoint, RangeFragIx, RangeFragKind, Reg, RegVecsAndBounds}, + sparse_set::SparseSet, + union_find::UnionFind, + AnalysisError, Function, RealRegUniverse, RegClass, TypedIxVec, +}; +use log::{debug, info, log_enabled, Level}; +use smallvec::{smallvec, SmallVec}; +use std::{fmt, mem}; + +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct RangeFrag { + pub(crate) first: InstPoint, + pub(crate) last: InstPoint, + pub(crate) mentions: MentionMap, +} + +impl fmt::Debug for RangeFrag { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "[{:?}; {:?}]", self.first, self.last) + } +} + +impl RangeFrag { + fn new<F: Function>( + func: &F, + bix: BlockIx, + first: InstPoint, + last: InstPoint, + mentions: MentionMap, + ) -> (Self, RangeFragMetrics) { + debug_assert!(func.block_insns(bix).len() >= 1); + debug_assert!(func.block_insns(bix).contains(first.iix())); + debug_assert!(func.block_insns(bix).contains(last.iix())); + debug_assert!(first <= last); + + let first_in_block = InstPoint::new_use(func.block_insns(bix).first()); + let last_in_block = InstPoint::new_def(func.block_insns(bix).last()); + let kind = match (first == first_in_block, last == last_in_block) { + (false, false) => RangeFragKind::Local, + (false, true) => RangeFragKind::LiveOut, + (true, false) => RangeFragKind::LiveIn, + (true, true) => RangeFragKind::Thru, + }; + + ( + RangeFrag { + first, + last, + mentions, + }, + RangeFragMetrics { bix, kind }, + ) + } + + #[inline(always)] + #[cfg(debug_assertions)] + pub(crate) fn contains(&self, inst: &InstPoint) -> bool { + self.first <= *inst && *inst <= self.last + } +} + +struct RangeFragMetrics { + bix: BlockIx, + kind: RangeFragKind, +} + +pub(crate) struct AnalysisInfo { + /// The sanitized per-insn reg-use info. + pub(crate) reg_vecs_and_bounds: RegVecsAndBounds, + /// All the intervals, fixed or virtual. + pub(crate) intervals: Intervals, + /// Liveins per block. + pub(crate) liveins: TypedIxVec<BlockIx, SparseSet<Reg>>, + /// Liveouts per block. + pub(crate) liveouts: TypedIxVec<BlockIx, SparseSet<Reg>>, + /// Blocks's loop depths. + pub(crate) _loop_depth: TypedIxVec<BlockIx, u32>, + /// Maps InstIxs to BlockIxs. 
+ pub(crate) _inst_to_block_map: InstIxToBlockIxMap, +} + +#[inline(never)] +pub(crate) fn run<F: Function>( + func: &F, + reg_universe: &RealRegUniverse, +) -> Result<AnalysisInfo, AnalysisError> { + info!( + "run_analysis: begin: {} blocks, {} insns", + func.blocks().len(), + func.insns().len() + ); + + // First do control flow analysis. This is (relatively) simple. Note that this can fail, for + // various reasons; we propagate the failure if so. Also create the InstIx-to-BlockIx map; + // this isn't really control-flow analysis, but needs to be done at some point. + + info!(" run_analysis: begin control flow analysis"); + let cfg_info = CFGInfo::create(func)?; + let inst_to_block_map = InstIxToBlockIxMap::new(func); + info!(" run_analysis: end control flow analysis"); + + info!(" run_analysis: begin data flow analysis"); + + // See `get_sanitized_reg_uses_for_func` for the meaning of "sanitized". + let reg_vecs_and_bounds = get_sanitized_reg_uses_for_func(func, reg_universe) + .map_err(|reg| AnalysisError::IllegalRealReg(reg))?; + assert!(reg_vecs_and_bounds.is_sanitized()); + + // Calculate block-local def/use sets. + let (def_sets_per_block, use_sets_per_block) = + calc_def_and_use(func, ®_vecs_and_bounds, ®_universe); + debug_assert!(def_sets_per_block.len() == func.blocks().len() as u32); + debug_assert!(use_sets_per_block.len() == func.blocks().len() as u32); + + // Calculate live-in and live-out sets per block, using the traditional + // iterate-to-a-fixed-point scheme. + // `liveout_sets_per_block` is amended below for return blocks, hence `mut`. + + let (livein_sets_per_block, mut liveout_sets_per_block) = calc_livein_and_liveout( + func, + &def_sets_per_block, + &use_sets_per_block, + &cfg_info, + ®_universe, + ); + debug_assert!(livein_sets_per_block.len() == func.blocks().len() as u32); + debug_assert!(liveout_sets_per_block.len() == func.blocks().len() as u32); + + // Verify livein set of entry block against liveins specified by function (e.g., ABI params). + let func_liveins = SparseSet::from_vec( + func.func_liveins() + .to_vec() + .into_iter() + .map(|rreg| rreg.to_reg()) + .collect(), + ); + if !livein_sets_per_block[func.entry_block()].is_subset_of(&func_liveins) { + let mut regs = livein_sets_per_block[func.entry_block()].clone(); + regs.remove(&func_liveins); + return Err(AnalysisError::EntryLiveinValues(regs.to_vec())); + } + + // Add function liveouts to every block ending in a return. + let func_liveouts = SparseSet::from_vec( + func.func_liveouts() + .to_vec() + .into_iter() + .map(|rreg| rreg.to_reg()) + .collect(), + ); + for block in func.blocks() { + let last_iix = func.block_insns(block).last(); + if func.is_ret(last_iix) { + liveout_sets_per_block[block].union(&func_liveouts); + } + } + + info!(" run_analysis: end data flow analysis"); + + info!(" run_analysis: begin liveness analysis"); + let (frag_ixs_per_reg, mut frag_env, frag_metrics_env, vreg_classes) = get_range_frags( + func, + ®_vecs_and_bounds, + ®_universe, + &livein_sets_per_block, + &liveout_sets_per_block, + ); + + let (mut fixed_intervals, virtual_intervals) = merge_range_frags( + ®_universe, + &frag_ixs_per_reg, + &mut frag_env, + &frag_metrics_env, + &cfg_info, + &vreg_classes, + ); + info!(" run_analysis: end liveness analysis"); + + // Finalize interval construction by doing some last minute sort of the fixed intervals. 
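+ // (The fragments of each fixed interval were appended per real register in
+ // whatever order `flush_interval` produced them, so they are sorted by start
+ // point here before being handed to the allocator proper.)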
+ for fixed in fixed_intervals.iter_mut() { + fixed.frags.sort_unstable_by_key(|frag| frag.first); + } + let intervals = Intervals { + virtuals: virtual_intervals, + fixeds: fixed_intervals, + }; + + info!("run_analysis: end"); + + Ok(AnalysisInfo { + reg_vecs_and_bounds, + intervals, + liveins: livein_sets_per_block, + liveouts: liveout_sets_per_block, + _loop_depth: cfg_info.depth_map, + _inst_to_block_map: inst_to_block_map, + }) +} + +/// Calculate all the RangeFrags for `bix`. Add them to `out_frags` and +/// corresponding metrics data to `out_frag_metrics`. Add to `out_map`, the +/// associated RangeFragIxs, segregated by Reg. `bix`, `livein`, `liveout` and +/// `rvb` are expected to be valid in the context of the Func `f` (duh!). +#[inline(never)] +fn get_range_frags_for_block<F: Function>( + func: &F, + rvb: &RegVecsAndBounds, + reg_universe: &RealRegUniverse, + vreg_classes: &Vec<RegClass>, + bix: BlockIx, + livein: &SparseSet<Reg>, + liveout: &SparseSet<Reg>, + // Temporary state reusable across function calls. + visited: &mut Vec<u32>, + state: &mut Vec</*rreg index, then vreg index, */ Option<RangeFrag>>, + // Effectively results. + out_map: &mut Vec<SmallVec<[RangeFragIx; 8]>>, + out_frags: &mut Vec<RangeFrag>, + out_frag_metrics: &mut Vec<RangeFragMetrics>, +) { + let mut emit_range_frag = + |r: Reg, frag: RangeFrag, frag_metrics: RangeFragMetrics, num_real_regs: u32| { + let fix = RangeFragIx::new(out_frags.len() as u32); + out_frags.push(frag); + out_frag_metrics.push(frag_metrics); + + let out_map_index = reg_to_reg_ix(num_real_regs, r) as usize; + out_map[out_map_index].push(fix); + }; + + // Some handy constants. + debug_assert!(func.block_insns(bix).len() >= 1); + let first_pt_in_block = InstPoint::new_use(func.block_insns(bix).first()); + let last_pt_in_block = InstPoint::new_def(func.block_insns(bix).last()); + + // Clear the running state. + visited.clear(); + + let num_real_regs = reg_universe.regs.len() as u32; + + // First, set up `state` as if all of `livein` had been written just prior to the block. + for r in livein.iter() { + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + debug_assert!(state[r_state_ix].is_none()); + state[r_state_ix] = Some(RangeFrag { + mentions: MentionMap::new(), + first: first_pt_in_block, + last: first_pt_in_block, + }); + visited.push(r_state_ix as u32); + } + + // Now visit each instruction in turn, examining first the registers it reads, then those it + // modifies, and finally those it writes. + for iix in func.block_insns(bix) { + let bounds_for_iix = &rvb.bounds[iix]; + + // Examine reads: they extend an existing RangeFrag to the U point of the reading + // insn. + for i in bounds_for_iix.uses_start as usize + ..bounds_for_iix.uses_start as usize + bounds_for_iix.uses_len as usize + { + let r = &rvb.vecs.uses[i]; + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + + // There has to be an entry, otherwise we'd do a read of a register not listed in + // liveins. + let pf = match &mut state[r_state_ix] { + None => panic!("get_range_frags_for_block: fail #1"), + Some(ref mut pf) => pf, + }; + + // This the first or subsequent read after a write. Note that the "write" can be + // either a real write, or due to the fact that `r` is listed in `livein`. We don't + // care here. + let new_last = InstPoint::new_use(iix); + debug_assert!(pf.last <= new_last); + pf.last = new_last; + + // This first loop iterates over all the uses for the first time, so there shouldn't be + // any duplicates. 
+ debug_assert!(!pf.mentions.iter().any(|tuple| tuple.0 == iix)); + let mut mention_set = Mention::new(); + mention_set.add_use(); + pf.mentions.push((iix, mention_set)); + } + + // Examine modifies. These are handled almost identically to + // reads, except that they extend an existing RangeFrag down to + // the D point of the modifying insn. + for i in bounds_for_iix.mods_start as usize + ..bounds_for_iix.mods_start as usize + bounds_for_iix.mods_len as usize + { + let r = &rvb.vecs.mods[i]; + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + + // There has to be an entry here too. + let pf = match &mut state[r_state_ix] { + None => panic!("get_range_frags_for_block: fail #2"), + Some(ref mut pf) => pf, + }; + + // This the first or subsequent modify after a write. + let new_last = InstPoint::new_def(iix); + debug_assert!(pf.last <= new_last); + pf.last = new_last; + + pf.mentions.push((iix, { + let mut mention_set = Mention::new(); + mention_set.add_mod(); + mention_set + })); + } + + // Examine writes (but not writes implied by modifies). The general idea is that a write + // causes us to terminate the existing RangeFrag, if any, add it to the results, + // and start a new frag. + for i in bounds_for_iix.defs_start as usize + ..bounds_for_iix.defs_start as usize + bounds_for_iix.defs_len as usize + { + let r = &rvb.vecs.defs[i]; + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + + match &mut state[r_state_ix] { + // First mention of a Reg we've never heard of before. + // Start a new RangeFrag for it and keep going. + None => { + let new_pt = InstPoint::new_def(iix); + let mut mention_set = Mention::new(); + mention_set.add_def(); + state[r_state_ix] = Some(RangeFrag { + first: new_pt, + last: new_pt, + mentions: smallvec![(iix, mention_set)], + }) + } + + // There's already a RangeFrag for `r`. This write will start a new one, so + // flush the existing one and note this write. + Some(RangeFrag { + ref mut first, + ref mut last, + ref mut mentions, + }) => { + // Steal the mentions and replace the mutable ref by an empty vector for reuse. + let stolen_mentions = mem::replace(mentions, MentionMap::new()); + + let (frag, frag_metrics) = + RangeFrag::new(func, bix, *first, *last, stolen_mentions); + emit_range_frag(*r, frag, frag_metrics, num_real_regs); + + let mut mention_set = Mention::new(); + mention_set.add_def(); + mentions.push((iix, mention_set)); + + // Reuse the previous entry for this new definition of the same vreg. + let new_pt = InstPoint::new_def(iix); + *first = new_pt; + *last = new_pt; + } + } + + visited.push(r_state_ix as u32); + } + } + + // We are at the end of the block. We still have to deal with live-out Regs. We must also + // deal with RangeFrag in `state` that are for registers not listed as live-out. + + // Deal with live-out Regs. Treat each one as if it is read just after the block. + for r in liveout.iter() { + // Remove the entry from `state` so that the following loop doesn't process it again. + let r_state_ix = reg_to_reg_ix(num_real_regs, *r) as usize; + let entry = mem::replace(&mut state[r_state_ix], None); + match entry { + None => panic!("get_range_frags_for_block: fail #3"), + Some(pf) => { + let (frag, frag_metrics) = + RangeFrag::new(func, bix, pf.first, last_pt_in_block, pf.mentions); + emit_range_frag(*r, frag, frag_metrics, num_real_regs); + } + } + } + + // Finally, round up any remaining RangeFrag left in `state`. 
+ for r_state_ix in visited { + if let Some(pf) = &mut state[*r_state_ix as usize] { + let r = reg_ix_to_reg(reg_universe, vreg_classes, *r_state_ix); + let (frag, frag_metrics) = RangeFrag::new( + func, + bix, + pf.first, + pf.last, + mem::replace(&mut pf.mentions, MentionMap::new()), + ); + emit_range_frag(r, frag, frag_metrics, num_real_regs); + state[*r_state_ix as usize] = None; + } + } +} + +#[inline(never)] +fn get_range_frags<F: Function>( + func: &F, + rvb: &RegVecsAndBounds, + reg_universe: &RealRegUniverse, + liveins: &TypedIxVec<BlockIx, SparseSet<Reg>>, + liveouts: &TypedIxVec<BlockIx, SparseSet<Reg>>, +) -> ( + Vec</*rreg index, then vreg index, */ SmallVec<[RangeFragIx; 8]>>, + Vec<RangeFrag>, + Vec<RangeFragMetrics>, + Vec</*vreg index,*/ RegClass>, +) { + info!(" get_range_frags: begin"); + debug_assert!(liveins.len() == func.blocks().len() as u32); + debug_assert!(liveouts.len() == func.blocks().len() as u32); + debug_assert!(rvb.is_sanitized()); + + let mut vreg_classes = vec![RegClass::INVALID; func.get_num_vregs()]; + for r in rvb + .vecs + .uses + .iter() + .chain(rvb.vecs.defs.iter()) + .chain(rvb.vecs.mods.iter()) + { + if r.is_real() { + continue; + } + let r_ix = r.get_index(); + let vreg_classes_ptr = &mut vreg_classes[r_ix]; + if *vreg_classes_ptr == RegClass::INVALID { + *vreg_classes_ptr = r.get_class(); + } else { + debug_assert_eq!(*vreg_classes_ptr, r.get_class()); + } + } + + let num_real_regs = reg_universe.regs.len(); + let num_virtual_regs = vreg_classes.len(); + let num_regs = num_real_regs + num_virtual_regs; + + // Reused by the function below. + let mut tmp_state = vec![None; num_regs]; + let mut tmp_visited = Vec::with_capacity(32); + + let mut result_map = vec![SmallVec::new(); num_regs]; + let mut result_frags = Vec::new(); + let mut result_frag_metrics = Vec::new(); + for bix in func.blocks() { + get_range_frags_for_block( + func, + &rvb, + reg_universe, + &vreg_classes, + bix, + &liveins[bix], + &liveouts[bix], + &mut tmp_visited, + &mut tmp_state, + &mut result_map, + &mut result_frags, + &mut result_frag_metrics, + ); + } + + assert!(tmp_state.len() == num_regs); + assert!(result_map.len() == num_regs); + assert!(vreg_classes.len() == num_virtual_regs); + // This is pretty cheap (once per fn) and any failure will be catastrophic since it means we + // may have forgotten some live range fragments. Hence `assert!` and not `debug_assert!`. 
+ for state_elem in &tmp_state { + assert!(state_elem.is_none()); + } + + if log_enabled!(Level::Debug) { + debug!(""); + let mut n = 0; + for frag in result_frags.iter() { + debug!("{:<3?} {:?}", RangeFragIx::new(n), frag); + n += 1; + } + + debug!(""); + for (reg_ix, frag_ixs) in result_map.iter().enumerate() { + if frag_ixs.len() == 0 { + continue; + } + let reg = reg_ix_to_reg(reg_universe, &vreg_classes, reg_ix as u32); + debug!( + "frags for {} {:?}", + reg.show_with_rru(reg_universe), + frag_ixs + ); + } + } + + info!(" get_range_frags: end"); + assert!(result_frags.len() == result_frag_metrics.len()); + + (result_map, result_frags, result_frag_metrics, vreg_classes) +} + +#[inline(never)] +fn merge_range_frags( + reg_universe: &RealRegUniverse, + frag_ix_vec_per_reg: &[SmallVec<[RangeFragIx; 8]>], + frag_env: &mut Vec<RangeFrag>, + frag_metrics_env: &Vec<RangeFragMetrics>, + cfg_info: &CFGInfo, + vreg_classes: &Vec</*vreg index,*/ RegClass>, +) -> (Vec<FixedInterval>, Vec<VirtualInterval>) { + info!(" merge_range_frags: begin"); + if log_enabled!(Level::Info) { + let mut stats_num_total_incoming_frags = 0; + for all_frag_ixs_for_reg in frag_ix_vec_per_reg.iter() { + stats_num_total_incoming_frags += all_frag_ixs_for_reg.len(); + } + info!(" in: {} in frag_env", frag_env.len()); + info!( + " in: {} regs containing in total {} frags", + frag_ix_vec_per_reg.len(), + stats_num_total_incoming_frags + ); + } + + debug_assert!(frag_env.len() == frag_metrics_env.len()); + + // Prefill fixed intervals, one per real register. + let mut result_fixed = Vec::with_capacity(reg_universe.regs.len() as usize); + for rreg in reg_universe.regs.iter() { + result_fixed.push(FixedInterval { + reg: rreg.0, + frags: Vec::new(), + }); + } + + let mut result_virtual = Vec::new(); + + let mut triples = Vec::<(RangeFragIx, RangeFragKind, BlockIx)>::new(); + + // BEGIN per_reg_loop + for (reg_ix, all_frag_ixs_for_reg) in frag_ix_vec_per_reg.iter().enumerate() { + let reg = reg_ix_to_reg(reg_universe, vreg_classes, reg_ix as u32); + + let num_reg_frags = all_frag_ixs_for_reg.len(); + + // The reg might never have been mentioned at all, especially if it's a real reg. + if num_reg_frags == 0 { + continue; + } + + // Do some shortcutting. First off, if there's only one frag for this reg, we can directly + // give it its own live range, and have done. + if num_reg_frags == 1 { + flush_interval( + &mut result_fixed, + &mut result_virtual, + reg, + all_frag_ixs_for_reg, + frag_env, + ); + continue; + } + + // BEGIN merge `all_frag_ixs_for_reg` entries as much as possible. + // but .. if we come across independents (RangeKind::Local), pull them out + // immediately. + triples.clear(); + + // Create `triples`. We will use it to guide the merging phase, but it is immutable there. + for fix in all_frag_ixs_for_reg { + let frag_metrics = &frag_metrics_env[fix.get() as usize]; + + if frag_metrics.kind == RangeFragKind::Local { + // This frag is Local (standalone). Give it its own Range and move on. This is an + // optimisation, but it's also necessary: the main fragment-merging logic below + // relies on the fact that the fragments it is presented with are all either + // LiveIn, LiveOut or Thru. + flush_interval( + &mut result_fixed, + &mut result_virtual, + reg, + &[*fix], + frag_env, + ); + continue; + } + + // This frag isn't Local (standalone) so we have to process it the slow way. 
+ triples.push((*fix, frag_metrics.kind, frag_metrics.bix)); + } + + let triples_len = triples.len(); + + // This is the core of the merging algorithm. + // + // For each ix@(fix, kind, bix) in `triples` (order unimportant): + // + // (1) "Merge with blocks that are live 'downstream' from here": + // if fix is live-out or live-through: + // for b in succs[bix] + // for each ix2@(fix2, kind2, bix2) in `triples` + // if bix2 == b && kind2 is live-in or live-through: + // merge(ix, ix2) + // + // (2) "Merge with blocks that are live 'upstream' from here": + // if fix is live-in or live-through: + // for b in preds[bix] + // for each ix2@(fix2, kind2, bix2) in `triples` + // if bix2 == b && kind2 is live-out or live-through: + // merge(ix, ix2) + // + // `triples` remains unchanged. The equivalence class info is accumulated + // in `eclasses_uf` instead. `eclasses_uf` entries are indices into + // `triples`. + // + // Now, you might think it necessary to do both (1) and (2). But no, they + // are mutually redundant, since if two blocks are connected by a live + // flow from one to the other, then they are also connected in the other + // direction. Hence checking one of the directions is enough. + let mut eclasses_uf = UnionFind::<usize>::new(triples_len); + + // We have two schemes for group merging, one of which is N^2 in the + // length of triples, the other is N-log-N, but with higher constant + // factors. Some experimentation with the bz2 test on a Cortex A57 puts + // the optimal crossover point between 200 and 300; it's not critical. + // Having this protects us against bad behaviour for huge inputs whilst + // still being fast for small inputs. + if triples_len <= 250 { + // The simple way, which is N^2 in the length of `triples`. + for (ix, (_fix, kind, bix)) in triples.iter().enumerate() { + // Deal with liveness flows outbound from `fix`. Meaning, (1) above. + if *kind == RangeFragKind::LiveOut || *kind == RangeFragKind::Thru { + for b in cfg_info.succ_map[*bix].iter() { + // Visit all entries in `triples` that are for `b`. + for (ix2, (_fix2, kind2, bix2)) in triples.iter().enumerate() { + if *bix2 != *b || *kind2 == RangeFragKind::LiveOut { + continue; + } + debug_assert!( + *kind2 == RangeFragKind::LiveIn || *kind2 == RangeFragKind::Thru + ); + // Now we know that liveness for this reg "flows" from `triples[ix]` to + // `triples[ix2]`. So those two frags must be part of the same live + // range. Note this. + if ix != ix2 { + eclasses_uf.union(ix, ix2); // Order of args irrelevant + } + } + } + } + } // outermost iteration over `triples` + } else { + // The more complex way, which is N-log-N in the length of `triples`. This is the same + // as the simple way, except that the innermost loop, which is a linear search in + // `triples` to find entries for some block `b`, is replaced by a binary search. This + // means that `triples` first needs to be sorted by block index. + triples.sort_unstable_by_key(|(_, _, bix)| *bix); + + for (ix, (_fix, kind, bix)) in triples.iter().enumerate() { + // Deal with liveness flows outbound from `fix`. Meaning, (1) above. + if *kind == RangeFragKind::LiveOut || *kind == RangeFragKind::Thru { + for b in cfg_info.succ_map[*bix].iter() { + // Visit all entries in `triples` that are for `b`. Binary search + // `triples` to find the lowest-indexed entry for `b`. 
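+ // (This is a standard lower-bound search: on exit, `ix_left` is the
+ // smallest index with `triples[ix_left].2 >= *b`, or `triples_len` if
+ // every entry's block index is below `b`.)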
+ let mut ix_left = 0; + let mut ix_right = triples_len; + while ix_left < ix_right { + let m = (ix_left + ix_right) >> 1; + if triples[m].2 < *b { + ix_left = m + 1; + } else { + ix_right = m; + } + } + + // It might be that there is no block for `b` in the sequence. That's + // legit; it just means that block `bix` jumps to a successor where the + // associated register isn't live-in/thru. A failure to find `b` can be + // indicated one of two ways: + // + // * ix_left == triples_len + // * ix_left < triples_len and b < triples[ix_left].b + // + // In both cases I *think* the 'loop_over_entries_for_b below will not do + // anything. But this is all a bit hairy, so let's convert the second + // variant into the first, so as to make it obvious that the loop won't do + // anything. + + // ix_left now holds the lowest index of any `triples` entry for block `b`. + // Assert this. + if ix_left < triples_len && *b < triples[ix_left].2 { + ix_left = triples_len; + } + if ix_left < triples_len { + assert!(ix_left == 0 || triples[ix_left - 1].2 < *b); + } + + // ix2 plays the same role as in the quadratic version. ix_left and + // ix_right are not used after this point. + let mut ix2 = ix_left; + loop { + let (_fix2, kind2, bix2) = match triples.get(ix2) { + None => break, + Some(triple) => *triple, + }; + if *b < bix2 { + // We've come to the end of the sequence of `b`-blocks. + break; + } + debug_assert!(*b == bix2); + if kind2 == RangeFragKind::LiveOut { + ix2 += 1; + continue; + } + // Now we know that liveness for this reg "flows" from `triples[ix]` to + // `triples[ix2]`. So those two frags must be part of the same live + // range. Note this. + eclasses_uf.union(ix, ix2); + ix2 += 1; + } + + if ix2 + 1 < triples_len { + debug_assert!(*b < triples[ix2 + 1].2); + } + } + } + } + } + + // Now `eclasses_uf` contains the results of the merging-search. Visit each of its + // equivalence classes in turn, and convert each into a virtual or real live range as + // appropriate. + let eclasses = eclasses_uf.get_equiv_classes(); + for leader_triple_ix in eclasses.equiv_class_leaders_iter() { + // `leader_triple_ix` is an eclass leader. Enumerate the whole eclass. + let mut frag_ixs = SmallVec::<[RangeFragIx; 4]>::new(); + for triple_ix in eclasses.equiv_class_elems_iter(leader_triple_ix) { + frag_ixs.push(triples[triple_ix].0 /*first field is frag ix*/); + } + flush_interval( + &mut result_fixed, + &mut result_virtual, + reg, + &frag_ixs, + frag_env, + ); + } + // END merge `all_frag_ixs_for_reg` entries as much as possible + } // END per reg loop + + info!(" merge_range_frags: end"); + + (result_fixed, result_virtual) +} + +#[inline(never)] +fn flush_interval( + result_real: &mut Vec<FixedInterval>, + result_virtual: &mut Vec<VirtualInterval>, + reg: Reg, + frag_ixs: &[RangeFragIx], + frags: &mut Vec<RangeFrag>, +) { + if reg.is_real() { + // Append all the RangeFrags to this fixed interval. They'll get sorted later. + result_real[reg.to_real_reg().get_index()] + .frags + .extend(frag_ixs.iter().map(|&i| { + let frag = &mut frags[i.get() as usize]; + RangeFrag { + first: frag.first, + last: frag.last, + mentions: mem::replace(&mut frag.mentions, MentionMap::new()), + } + })); + return; + } + + debug_assert!(reg.is_virtual()); + + let (start, end, mentions) = { + // Merge all the mentions together. 
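+ // (Concretely: the mention lists of all fragments are concatenated,
+ // sorted by instruction index, and entries for the same instruction are
+ // folded into one Mention carrying the union of the use/mod/def flags;
+ // the interval's start/end are the min/max over all fragments.)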
+ let capacity = frag_ixs + .iter() + .map(|fix| frags[fix.get() as usize].mentions.len()) + .fold(0, |a, b| a + b); + + let mut start = InstPoint::max_value(); + let mut end = InstPoint::min_value(); + + // TODO rework this! + let mut mentions = MentionMap::with_capacity(capacity); + for frag in frag_ixs.iter().map(|fix| &frags[fix.get() as usize]) { + mentions.extend(frag.mentions.iter().cloned()); + start = InstPoint::min(start, frag.first); + end = InstPoint::max(end, frag.last); + } + mentions.sort_unstable_by_key(|tuple| tuple.0); + + // Merge mention set that are at the same instruction. + let mut s = 0; + let mut e; + let mut to_remove = Vec::new(); + while s < mentions.len() { + e = s; + while e + 1 < mentions.len() && mentions[s].0 == mentions[e + 1].0 { + e += 1; + } + if s != e { + let mut i = s + 1; + while i <= e { + if mentions[i].1.is_use() { + mentions[s].1.add_use(); + } + if mentions[i].1.is_mod() { + mentions[s].1.add_mod(); + } + if mentions[i].1.is_def() { + mentions[s].1.add_def(); + } + i += 1; + } + for i in s + 1..=e { + to_remove.push(i); + } + } + s = e + 1; + } + + for &i in to_remove.iter().rev() { + // TODO not efficient. + mentions.remove(i); + } + + (start, end, mentions) + }; + + let id = IntId(result_virtual.len()); + let mut int = VirtualInterval::new(id, reg.to_virtual_reg(), start, end, mentions); + int.ancestor = Some(id); + + result_virtual.push(int); +} diff --git a/third_party/rust/regalloc/src/linear_scan/assign_registers.rs b/third_party/rust/regalloc/src/linear_scan/assign_registers.rs new file mode 100644 index 0000000000..dd57ffed48 --- /dev/null +++ b/third_party/rust/regalloc/src/linear_scan/assign_registers.rs @@ -0,0 +1,1248 @@ +use super::{ + last_use, next_use, IntId, Intervals, Mention, MentionMap, OptimalSplitStrategy, RegUses, + Statistics, VirtualInterval, +}; +use crate::{ + data_structures::{InstPoint, Point, RegVecsAndBounds}, + Function, InstIx, LinearScanOptions, RealReg, RealRegUniverse, Reg, RegAllocError, SpillSlot, + VirtualReg, NUM_REG_CLASSES, +}; + +use log::{debug, info, log_enabled, trace, Level}; +use rustc_hash::FxHashMap as HashMap; +use smallvec::SmallVec; +use std::collections::BinaryHeap; +use std::{cmp, cmp::Ordering, fmt}; + +macro_rules! lsra_assert { + ($arg:expr) => { + #[cfg(debug_assertions)] + debug_assert!($arg); + }; + + ($arg:expr, $text:expr) => { + #[cfg(debug_assertions)] + debug_assert!($arg, $text); + }; +} + +#[derive(Clone, Copy, PartialEq)] +enum ActiveInt { + Virtual(IntId), + Fixed((RealReg, usize)), +} + +impl fmt::Debug for ActiveInt { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ActiveInt::Virtual(id) => write!(fmt, "virtual({:?})", id), + ActiveInt::Fixed((rreg, _)) => write!(fmt, "real({:?})", rreg), + } + } +} + +struct ActivityTracker { + /// Intervals that are covering the current interval's start position. + /// TODO Invariant: they always have a register attached to them. + active: Vec<ActiveInt>, + + /// Intervals that are not covering but end after the current interval's start position. + /// None means that the interval may have fragments, but they all live after the current + /// position. + /// TODO Invariant: they're all fixed registers, so they must have a register attached to them. 
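+ ///
+ /// (The `usize` is a cursor into that fixed interval's fragment list: the
+ /// index of the first fragment that the scan position has not yet moved
+ /// past. `update` advances it as allocation proceeds.)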
+ inactive: Vec<(RealReg, usize)>, +} + +impl ActivityTracker { + fn new(intervals: &Intervals) -> Self { + let mut inactive = Vec::with_capacity(intervals.fixeds.len()); + for fixed in &intervals.fixeds { + if !fixed.frags.is_empty() { + inactive.push((fixed.reg, 0)) + } + } + + Self { + active: Vec::new(), + inactive, + } + } + + fn set_active(&mut self, id: IntId) { + self.active.push(ActiveInt::Virtual(id)); + } + + fn update(&mut self, start: InstPoint, stats: &mut Option<Statistics>, intervals: &Intervals) { + // From active, only possible transitions are to active or expired. + // From inactive, only possible transitions are to inactive, active or expired. + // => active has an upper bound. + // => inactive only shrinks. + let mut to_delete: SmallVec<[usize; 16]> = SmallVec::new(); + let mut new_inactive: SmallVec<[(RealReg, usize); 16]> = SmallVec::new(); + + for (i, id) in self.active.iter_mut().enumerate() { + match id { + ActiveInt::Virtual(int_id) => { + let int = intervals.get(*int_id); + + if int.location.spill().is_some() { + // TODO these shouldn't appear here... + to_delete.push(i); + continue; + } + //debug_assert!(int.location.spill().is_none(), "active int must have a reg"); + + if int.end < start { + // It's expired, forget about it. + to_delete.push(i); + } else { + // Stays active. + lsra_assert!(int.covers(start), "no active to inactive transition"); + } + } + + ActiveInt::Fixed((rreg, ref mut fix)) => { + // Possible transitions: active => { active, inactive, expired }. + let frags = &intervals.fixeds[rreg.get_index()].frags; + + // Fast-forward to the first fragment that contains or is after start. + while *fix < frags.len() && start > frags[*fix].last { + *fix += 1; + } + + if *fix == frags.len() { + // It expired, remove it from the active list. + to_delete.push(i); + } else if start < frags[*fix].first { + // It is now inactive. + lsra_assert!(!frags[*fix].contains(&start)); + new_inactive.push((*rreg, *fix)); + to_delete.push(i); + } else { + // Otherwise, it's still active. + lsra_assert!(frags[*fix].contains(&start)); + } + } + } + } + + for &i in to_delete.iter().rev() { + self.active.swap_remove(i); + } + to_delete.clear(); + + for (i, (rreg, fix)) in self.inactive.iter_mut().enumerate() { + // Possible transitions: inactive => { active, inactive, expired }. + let frags = &intervals.fixeds[rreg.get_index()].frags; + + // Fast-forward to the first fragment that contains or is after start. + while *fix < frags.len() && start > frags[*fix].last { + *fix += 1; + } + + if *fix == frags.len() { + // It expired, remove it from the inactive list. + to_delete.push(i); + } else if start >= frags[*fix].first { + // It is now active. + lsra_assert!(frags[*fix].contains(&start)); + self.active.push(ActiveInt::Fixed((*rreg, *fix))); + to_delete.push(i); + } else { + // Otherwise it remains inactive. 
+ lsra_assert!(!frags[*fix].contains(&start)); + } + } + + for &i in to_delete.iter().rev() { + self.inactive.swap_remove(i); + } + self.inactive.extend(new_inactive.into_vec()); + + trace!("active:"); + for aid in &self.active { + match aid { + ActiveInt::Virtual(id) => { + trace!(" {}", intervals.get(*id)); + } + ActiveInt::Fixed((real_reg, _frag)) => { + trace!(" {}", intervals.fixeds[real_reg.get_index()]); + } + } + } + trace!("inactive:"); + for &(rreg, fix) in &self.inactive { + trace!( + " {:?} {:?}", + rreg, + intervals.fixeds[rreg.get_index()].frags[fix] + ); + } + trace!("end update state"); + + stats.as_mut().map(|stats| { + stats.peak_active = usize::max(stats.peak_active, self.active.len()); + stats.peak_inactive = usize::max(stats.peak_inactive, self.inactive.len()); + }); + } +} + +pub(crate) fn run<F: Function>( + opts: &LinearScanOptions, + func: &F, + reg_uses: &RegVecsAndBounds, + reg_universe: &RealRegUniverse, + scratches_by_rc: &Vec<Option<RealReg>>, + intervals: Intervals, + stats: Option<Statistics>, +) -> Result<(Intervals, u32), RegAllocError> { + let mut state = State::new(opts, func, ®_uses, intervals, stats); + let mut reusable = ReusableState::new(reg_universe, &scratches_by_rc); + + #[cfg(debug_assertions)] + let mut prev_start = InstPoint::min_value(); + + while let Some(id) = state.next_unhandled() { + info!("main loop: allocating {}", state.intervals.get(id)); + + #[cfg(debug_assertions)] + { + let int = state.intervals.get(id); + debug_assert!(prev_start <= int.start, "main loop must make progress"); + prev_start = int.start; + } + + if state.intervals.get(id).location.is_none() { + let int = state.intervals.get(id); + + state + .activity + .update(int.start, &mut state.stats, &state.intervals); + + let ok = try_allocate_reg(&mut reusable, id, &mut state); + if !ok { + allocate_blocked_reg(&mut reusable, id, &mut state)?; + } + + if state.intervals.get(id).location.reg().is_some() { + state.activity.set_active(id); + } + + // Reset reusable state. + reusable.computed_inactive = false; + } + + debug!(""); + } + + if log_enabled!(Level::Debug) { + debug!("allocation results (in order):"); + for int in state.intervals.virtuals.iter() { + debug!("{}", int); + } + debug!(""); + } + + Ok((state.intervals, state.next_spill_slot.get())) +} + +/// A mapping from real reg to some T. +#[derive(Clone)] +struct RegisterMapping<T> { + offset: usize, + regs: Vec<(RealReg, T)>, + scratch: Option<RealReg>, + initial_value: T, + reg_class_index: usize, +} + +impl<T: Copy> RegisterMapping<T> { + fn with_default( + reg_class_index: usize, + reg_universe: &RealRegUniverse, + scratch: Option<RealReg>, + initial_value: T, + ) -> Self { + let mut regs = Vec::new(); + let mut offset = 0; + // Collect all the registers for the current class. 
+ if let Some(ref info) = reg_universe.allocable_by_class[reg_class_index] { + lsra_assert!(info.first <= info.last); + offset = info.first; + for reg in ®_universe.regs[info.first..=info.last] { + lsra_assert!(regs.len() == reg.0.get_index() - offset); + regs.push((reg.0, initial_value)); + } + }; + Self { + offset, + regs, + scratch, + initial_value, + reg_class_index, + } + } + + fn clear(&mut self) { + for reg in self.regs.iter_mut() { + reg.1 = self.initial_value; + } + } + + fn iter<'a>(&'a self) -> RegisterMappingIter<T> { + RegisterMappingIter { + iter: self.regs.iter(), + scratch: self.scratch, + } + } +} + +struct RegisterMappingIter<'a, T: Copy> { + iter: std::slice::Iter<'a, (RealReg, T)>, + scratch: Option<RealReg>, +} + +impl<'a, T: Copy> std::iter::Iterator for RegisterMappingIter<'a, T> { + type Item = &'a (RealReg, T); + fn next(&mut self) -> Option<Self::Item> { + match self.iter.next() { + Some(pair) => { + if Some(pair.0) == self.scratch { + // Skip to the next one. + self.iter.next() + } else { + Some(pair) + } + } + None => None, + } + } +} + +impl<T> std::ops::Index<RealReg> for RegisterMapping<T> { + type Output = T; + fn index(&self, rreg: RealReg) -> &Self::Output { + lsra_assert!( + rreg.get_class() as usize == self.reg_class_index, + "trying to index a reg from the wrong class" + ); + lsra_assert!(Some(rreg) != self.scratch, "trying to use the scratch"); + &self.regs[rreg.get_index() - self.offset].1 + } +} + +impl<T> std::ops::IndexMut<RealReg> for RegisterMapping<T> { + fn index_mut(&mut self, rreg: RealReg) -> &mut Self::Output { + lsra_assert!( + rreg.get_class() as usize == self.reg_class_index, + "trying to index a reg from the wrong class" + ); + lsra_assert!(Some(rreg) != self.scratch, "trying to use the scratch"); + &mut self.regs[rreg.get_index() - self.offset].1 + } +} + +// State management. + +/// Parts of state just reused for recycling memory. +struct ReusableState { + inactive_intersecting: Vec<(RealReg, InstPoint)>, + computed_inactive: bool, + reg_to_instpoint_1: Vec<RegisterMapping<InstPoint>>, + reg_to_instpoint_2: Vec<RegisterMapping<InstPoint>>, +} + +impl ReusableState { + fn new(reg_universe: &RealRegUniverse, scratches: &[Option<RealReg>]) -> Self { + let mut reg_to_instpoint_1 = Vec::with_capacity(NUM_REG_CLASSES); + + for i in 0..NUM_REG_CLASSES { + let scratch = scratches[i]; + reg_to_instpoint_1.push(RegisterMapping::with_default( + i, + reg_universe, + scratch, + InstPoint::max_value(), + )); + } + + let reg_to_instpoint_2 = reg_to_instpoint_1.clone(); + + Self { + inactive_intersecting: Vec::new(), + computed_inactive: false, + reg_to_instpoint_1, + reg_to_instpoint_2, + } + } +} + +/// A small pair containing the interval id and the instruction point of an interval that is still +/// to be allocated, to be stored in the unhandled list of intervals. +struct IntervalStart(IntId, InstPoint); + +impl cmp::PartialEq for IntervalStart { + #[inline(always)] + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} +impl cmp::Eq for IntervalStart {} + +impl cmp::PartialOrd for IntervalStart { + #[inline(always)] + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + // Note: we want a reverse ordering on start positions, so that we have a MinHeap and not a + // MaxHeap in UnhandledIntervals. 
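+ // (Illustrative: comparing `other` against `self` means the interval with
+ // the smaller start compares as the greater element, so `BinaryHeap::pop`,
+ // a max-heap operation, yields intervals in ascending start order.)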
+ other.1.partial_cmp(&self.1) + } +} + +impl cmp::Ord for IntervalStart { + #[inline(always)] + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap() + } +} + +struct UnhandledIntervals { + heap: BinaryHeap<IntervalStart>, +} + +impl UnhandledIntervals { + fn new() -> Self { + Self { + heap: BinaryHeap::with_capacity(16), + } + } + + /// Insert a virtual interval that's unallocated in the list of unhandled intervals. + /// + /// This relies on the fact that unhandled intervals's start positions can't change over time. + fn insert(&mut self, id: IntId, intervals: &Intervals) { + self.heap.push(IntervalStart(id, intervals.get(id).start)) + } + + /// Get the new unhandled interval, in start order. + fn next_unhandled(&mut self, _intervals: &Intervals) -> Option<IntId> { + self.heap.pop().map(|entry| { + let ret = entry.0; + lsra_assert!(_intervals.get(ret).start == entry.1); + ret + }) + } +} + +/// State structure, which can be cleared between different calls to register allocation. +/// TODO: split this into clearable fields and non-clearable fields. +struct State<'a, F: Function> { + func: &'a F, + reg_uses: &'a RegUses, + opts: &'a LinearScanOptions, + + intervals: Intervals, + + /// Intervals that are starting after the current interval's start position. + unhandled: UnhandledIntervals, + + /// Next available spill slot. + next_spill_slot: SpillSlot, + + /// Maps given virtual registers to the spill slots they should be assigned + /// to. + spill_map: HashMap<VirtualReg, SpillSlot>, + + activity: ActivityTracker, + stats: Option<Statistics>, +} + +impl<'a, F: Function> State<'a, F> { + fn new( + opts: &'a LinearScanOptions, + func: &'a F, + reg_uses: &'a RegUses, + intervals: Intervals, + stats: Option<Statistics>, + ) -> Self { + let mut unhandled = UnhandledIntervals::new(); + for int in intervals.virtuals.iter() { + unhandled.insert(int.id, &intervals); + } + + let activity = ActivityTracker::new(&intervals); + + Self { + func, + reg_uses, + opts, + intervals, + unhandled, + next_spill_slot: SpillSlot::new(0), + spill_map: HashMap::default(), + stats, + activity, + } + } + + fn next_unhandled(&mut self) -> Option<IntId> { + self.unhandled.next_unhandled(&self.intervals) + } + fn insert_unhandled(&mut self, id: IntId) { + self.unhandled.insert(id, &self.intervals); + } + + fn spill(&mut self, id: IntId) { + let int = self.intervals.get(id); + debug_assert!(int.location.spill().is_none(), "already spilled"); + debug!("spilling {:?}", id); + + let vreg = int.vreg; + let spill_slot = if let Some(spill_slot) = self.spill_map.get(&vreg) { + *spill_slot + } else { + let size_slot = self.func.get_spillslot_size(vreg.get_class(), vreg); + let spill_slot = self.next_spill_slot.round_up(size_slot); + self.next_spill_slot = self.next_spill_slot.inc(1); + self.spill_map.insert(vreg, spill_slot); + spill_slot + }; + + self.intervals.set_spill(id, spill_slot); + } +} + +#[inline(never)] +fn lazy_compute_inactive( + intervals: &Intervals, + activity: &ActivityTracker, + cur_id: IntId, + inactive_intersecting: &mut Vec<(RealReg, InstPoint)>, + computed_inactive: &mut bool, +) { + if *computed_inactive { + return; + } + inactive_intersecting.clear(); + + let int = intervals.get(cur_id); + let reg_class = int.vreg.get_class(); + + for &(rreg, fix) in &activity.inactive { + if rreg.get_class() != reg_class { + continue; + } + + let frags = &intervals.fixeds[rreg.get_index()].frags; + let mut i = fix; + while let Some(ref frag) = frags.get(i) { + if frag.first > int.end { + break; 
+ } + if frag.first >= int.start { + inactive_intersecting.push((rreg, frag.first)); + break; + } + i += 1; + } + } + + *computed_inactive = true; +} + +/// Transitions intervals from active/inactive into active/inactive/handled. +/// +/// An interval tree is stored in the state, containing all the active and +/// inactive intervals. The comparison key is the interval's start point. +/// +/// A state update consists in the following. We consider the next interval to +/// allocate, and in particular its start point S. +/// +/// 1. remove all the active/inactive intervals that have expired, i.e. their +/// end point is before S. +/// 2. reconsider active/inactive intervals: +/// - if they contain S, they become (or stay) active. +/// - otherwise, they become (or stay) inactive. +/// +/// Item 1 is easy to implement, and fast enough. +/// +/// Item 2 is a bit trickier. While we could just call `Intervals::covers` for +/// each interval on S, this is quite expensive. In addition to this, it happens +/// that most intervals are inactive. This is explained by the fact that linear +/// scan can create large intervals, if a value is used much later after it's +/// been created, *according to the block ordering*. +/// +/// For each interval, we remember the last active fragment, or the first +/// inactive fragment that starts after S. This makes search really fast: +/// +/// - if the considered (active or inactive) interval start is before S, then we +/// should look more precisely if it's active or inactive. This might include +/// seeking to the next fragment that contains S. +/// - otherwise, if the considered interval start is *after* S, then it means +/// this interval, as well as all the remaining ones in the interval tree (since +/// they're sorted by starting position) are inactive, and we can escape the +/// loop eagerly. +/// +/// The escape for inactive intervals make this function overall cheap. + +/// Naive heuristic to select a register when we're not aware of any conflict. +/// Currently, it chooses the register with the furthest next use. +#[inline(never)] +fn select_naive_reg<F: Function>( + reusable: &mut ReusableState, + state: &mut State<F>, + id: IntId, +) -> Option<(RealReg, InstPoint)> { + let reg_class = state.intervals.get(id).vreg.get_class(); + let free_until_pos = &mut reusable.reg_to_instpoint_1[reg_class as usize]; + free_until_pos.clear(); + + let mut num_free = usize::max(1, free_until_pos.regs.len()) - 1; + + // All registers currently in use are blocked. + for &aid in &state.activity.active { + let reg = match aid { + ActiveInt::Virtual(int_id) => { + if let Some(reg) = state.intervals.get(int_id).location.reg() { + reg + } else { + continue; + } + } + ActiveInt::Fixed((real_reg, _)) => real_reg, + }; + + if reg.get_class() == reg_class { + free_until_pos[reg] = InstPoint::min_value(); + num_free -= 1; + } + } + + // Shortcut: if all the registers are taken, don't even bother. + if num_free == 0 { + lsra_assert!(!free_until_pos + .iter() + .any(|pair| pair.1 != InstPoint::min_value())); + return None; + } + + // All registers that would be used at the same time as the current interval + // are partially blocked, up to the point when they start being used. 
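+ // (Illustrative: if an inactive fixed interval for register r has a
+ // fragment starting at some point P within the current interval, then r is
+ // only usable up to P, so its `free_until_pos` entry is capped at P below.)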
+ lazy_compute_inactive( + &state.intervals, + &state.activity, + id, + &mut reusable.inactive_intersecting, + &mut reusable.computed_inactive, + ); + + for &(reg, intersect_at) in reusable.inactive_intersecting.iter() { + if intersect_at < free_until_pos[reg] { + free_until_pos[reg] = intersect_at; + } + } + + // Find the register with the furthest next use, if there's any. + let mut best_reg = None; + let mut best_pos = InstPoint::min_value(); + for &(reg, pos) in free_until_pos.iter() { + if pos > best_pos { + best_pos = pos; + best_reg = Some(reg); + } + } + + best_reg.and_then(|reg| Some((reg, best_pos))) +} + +#[inline(never)] +fn try_allocate_reg<F: Function>( + reusable: &mut ReusableState, + id: IntId, + state: &mut State<F>, +) -> bool { + state + .stats + .as_mut() + .map(|stats| stats.num_try_allocate_reg += 1); + + let (best_reg, best_pos) = if let Some(solution) = select_naive_reg(reusable, state, id) { + solution + } else { + debug!("try_allocate_reg: all registers taken, need to spill."); + return false; + }; + debug!( + "try_allocate_reg: best register {:?} has next use at {:?}", + best_reg, best_pos + ); + + if best_pos <= state.intervals.get(id).end { + if !state.opts.partial_split || !try_split_regs(state, id, best_pos) { + return false; + } + } + + // At least a partial match: allocate. + debug!( + "{:?}: {:?} <- {:?}", + id, + state.intervals.get(id).vreg, + best_reg + ); + state.intervals.set_reg(id, best_reg); + + state + .stats + .as_mut() + .map(|stats| stats.num_try_allocate_reg_success += 1); + + true +} + +#[inline(never)] +fn allocate_blocked_reg<F: Function>( + reusable: &mut ReusableState, + cur_id: IntId, + state: &mut State<F>, +) -> Result<(), RegAllocError> { + // If the current interval has no uses, spill it directly. + let first_use = match next_use( + &state.intervals.get(cur_id), + InstPoint::min_value(), + &state.reg_uses, + ) { + Some(u) => u, + None => { + state.spill(cur_id); + return Ok(()); + } + }; + + let (start_pos, reg_class) = { + let int = state.intervals.get(cur_id); + (int.start, int.vreg.get_class()) + }; + + // Note: in this function, "use" isn't just a use as in use-def; it really + // means a mention, so either a use or a definition. + // + // 1. Compute all the positions of next uses for registers of active intervals + // and inactive intervals that might intersect with the current one. + // 2. Then use this to select the interval with the further next use. + // 3. Spill either the current interval or active/inactive intervals with the + // selected register. + // 4. Make sure that the current interval doesn't intersect with the fixed + // interval for the selected register. + + // Step 1: compute all the next use positions. 
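+ // (Roughly: `next_use_pos[r]` will hold the earliest point at which
+ // whatever currently occupies r needs it again (a currently-active fixed
+ // register counts as needed immediately), and `block_pos[r]` the point at
+ // which r becomes unavailable to the current interval because of a fixed
+ // interval; both start at the maximum instruction point.)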
+ let next_use_pos = &mut reusable.reg_to_instpoint_1[reg_class as usize]; + next_use_pos.clear(); + + let block_pos = &mut reusable.reg_to_instpoint_2[reg_class as usize]; + block_pos.clear(); + + trace!( + "allocate_blocked_reg: searching reg with next use after {:?}", + start_pos + ); + + for &aid in &state.activity.active { + match aid { + ActiveInt::Virtual(int_id) => { + let int = state.intervals.get(int_id); + if int.vreg.get_class() != reg_class { + continue; + } + if let Some(reg) = int.location.reg() { + if next_use_pos[reg] != InstPoint::min_value() { + if let Some(next_use) = + next_use(&state.intervals.get(int_id), start_pos, &state.reg_uses) + { + next_use_pos[reg] = InstPoint::min(next_use_pos[reg], next_use); + } + } + } + } + + ActiveInt::Fixed((reg, _frag)) => { + if reg.get_class() == reg_class { + block_pos[reg] = InstPoint::min_value(); + next_use_pos[reg] = InstPoint::min_value(); + } + } + } + } + + lazy_compute_inactive( + &state.intervals, + &state.activity, + cur_id, + &mut reusable.inactive_intersecting, + &mut reusable.computed_inactive, + ); + + for &(reg, intersect_pos) in &reusable.inactive_intersecting { + debug_assert!(reg.get_class() == reg_class); + if block_pos[reg] == InstPoint::min_value() { + // This register is already blocked. + debug_assert!(next_use_pos[reg] == InstPoint::min_value()); + continue; + } + block_pos[reg] = InstPoint::min(block_pos[reg], intersect_pos); + next_use_pos[reg] = InstPoint::min(next_use_pos[reg], intersect_pos); + } + + // Step 2: find the register with the furthest next use. + let best_reg = { + let mut best = None; + for (reg, pos) in next_use_pos.iter() { + trace!("allocate_blocked_reg: {:?} has next use at {:?}", reg, pos); + match best { + None => best = Some((reg, pos)), + Some((ref mut best_reg, ref mut best_pos)) => { + if *best_pos < pos { + *best_pos = pos; + *best_reg = reg; + } + } + } + } + match best { + Some(best) => *best.0, + None => { + return Err(RegAllocError::Other(format!( + "the {:?} register class has no registers!", + reg_class + ))); + } + } + }; + debug!( + "selecting blocked register {:?} with furthest next use at {:?}", + best_reg, next_use_pos[best_reg] + ); + + // Step 3: if the next use of the current interval is after the furthest use + // of the selected register, then we should spill the current interval. + // Otherwise, spill other intervals. + debug!( + "current first used at {:?}, next use of best reg at {:?}", + first_use, next_use_pos[best_reg] + ); + + if first_use >= next_use_pos[best_reg] { + if first_use == start_pos { + return Err(RegAllocError::OutOfRegisters(reg_class)); + } + debug!("spill current interval"); + let new_int = split(state, cur_id, first_use); + state.insert_unhandled(new_int); + state.spill(cur_id); + } else { + debug!("taking over register, spilling intersecting intervals"); + + // Spill intervals that currently block the selected register. + state.intervals.set_reg(cur_id, best_reg); + + // If there's an interference with a fixed interval, split at the + // intersection. + let int_end = state.intervals.get(cur_id).end; + if block_pos[best_reg] <= int_end { + debug!( + "allocate_blocked_reg: fixed conflict! 
blocked at {:?}, while ending at {:?}", + block_pos[best_reg], int_end + ); + + if !state.opts.partial_split || !try_split_regs(state, cur_id, block_pos[best_reg]) { + split_and_spill(state, cur_id, block_pos[best_reg]); + } + } + + for &aid in &state.activity.active { + match aid { + ActiveInt::Virtual(int_id) => { + let int = state.intervals.get(int_id); + if int.vreg.get_class() != reg_class { + continue; + } + if let Some(reg) = int.location.reg() { + if reg == best_reg { + // spill it! + debug!("allocate_blocked_reg: split and spill active stolen reg"); + split_and_spill(state, int_id, start_pos); + break; + } + } + } + + ActiveInt::Fixed((_reg, _fix)) => { + lsra_assert!( + _reg != best_reg + || state.intervals.get(cur_id).end + < state.intervals.fixeds[_reg.get_index()].frags[_fix].first, + "can't split fixed active interval" + ); + } + } + } + + // Inactive virtual intervals would need to be split and spilled here too, but we can't + // have inactive virtual intervals. + #[cfg(debug_assertions)] + for &(reg, intersect_pos) in &reusable.inactive_intersecting { + debug_assert!( + reg != best_reg || state.intervals.get(cur_id).end < intersect_pos, + "can't split fixed inactive interval" + ); + } + } + + Ok(()) +} + +/// Finds an optimal split position, whenever we're given a range of possible +/// positions where to split. +fn find_optimal_split_pos<F: Function>( + state: &State<F>, + id: IntId, + from: InstPoint, + to: InstPoint, +) -> InstPoint { + trace!("find_optimal_split_pos between {:?} and {:?}", from, to); + + debug_assert!(from <= to, "split between positions are inconsistent"); + let int = state.intervals.get(id); + debug_assert!(from >= int.start, "split should happen after the start"); + debug_assert!(to <= int.end, "split should happen before the end"); + + if from == to { + return from; + } + + let candidate = match state.opts.split_strategy { + OptimalSplitStrategy::To => Some(to), + OptimalSplitStrategy::NextFrom => Some(next_pos(from)), + OptimalSplitStrategy::NextNextFrom => Some(next_pos(next_pos(from))), + OptimalSplitStrategy::From => { + // This is the general setting, so win some time and eagerly return here. + return from; + } + OptimalSplitStrategy::PrevTo => Some(prev_pos(to)), + OptimalSplitStrategy::PrevPrevTo => Some(prev_pos(prev_pos(to))), + OptimalSplitStrategy::Mid => Some(InstPoint::new_use(InstIx::new( + (from.iix().get() + to.iix().get()) / 2, + ))), + }; + + if let Some(pos) = candidate { + if pos >= from && pos <= to && state.intervals.get(id).covers(pos) { + return pos; + } + } + + from +} + +fn prev_pos(mut pos: InstPoint) -> InstPoint { + match pos.pt() { + Point::Def => { + pos.set_pt(Point::Use); + pos + } + Point::Use => { + pos.set_iix(pos.iix().minus(1)); + pos.set_pt(Point::Def); + pos + } + _ => unreachable!(), + } +} + +fn next_pos(mut pos: InstPoint) -> InstPoint { + match pos.pt() { + Point::Use => pos.set_pt(Point::Def), + Point::Def => { + pos.set_pt(Point::Use); + pos.set_iix(pos.iix().plus(1)); + } + _ => unreachable!(), + }; + pos +} + +/// Splits the given interval between the last use before `split_pos` and +/// `split_pos`. +/// +/// In case of two-ways split (i.e. only place to split is precisely split_pos), +/// returns the live interval id for the middle child, to be added back to the +/// list of active/inactive intervals after iterating on these. 
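The `prev_pos`/`next_pos` helpers above walk a timeline with two points per instruction: the Use point of instruction `i` is followed by its Def point, which is followed by the Use point of instruction `i + 1`. A small standalone model of that ordering, with plain tuples standing in for the crate's `InstPoint`:

```rust
// Model of the use/def timeline walked by `prev_pos`/`next_pos` above: each
// instruction index `i` contributes a Use point followed by a Def point.
const USE: u8 = 0;
const DEF: u8 = 1;

fn next_pos(pos: (u32, u8)) -> (u32, u8) {
    match pos {
        (iix, USE) => (iix, DEF),   // stay on the same instruction
        (iix, _) => (iix + 1, USE), // hop to the next instruction
    }
}

fn prev_pos(pos: (u32, u8)) -> (u32, u8) {
    // Like the real helper, this assumes we never step before instruction 0.
    match pos {
        (iix, DEF) => (iix, USE),
        (iix, _) => (iix - 1, DEF),
    }
}

fn main() {
    // The timeline is totally ordered: (3, USE) < (3, DEF) < (4, USE) < ...
    assert!((3u32, USE) < (3u32, DEF));
    assert_eq!(next_pos((3, USE)), (3, DEF));
    assert_eq!(next_pos((3, DEF)), (4, USE));
    assert_eq!(prev_pos(next_pos((3, USE))), (3, USE));
}
```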
+fn split_and_spill<F: Function>(state: &mut State<F>, id: IntId, split_pos: InstPoint) { + let child = match last_use(&state.intervals.get(id), split_pos, &state.reg_uses) { + Some(last_use) => { + debug!( + "split_and_spill {:?}: spill between {:?} and {:?}", + id, last_use, split_pos + ); + + // Maintain ascending order between the min and max positions. + let min_pos = InstPoint::min(next_pos(last_use), split_pos); + + // Make sure that if the two positions are the same, we'll be splitting in + // a position that's in the current interval. + let optimal_pos = find_optimal_split_pos(state, id, min_pos, split_pos); + + let child = split(state, id, optimal_pos); + state.spill(child); + child + } + + None => { + // The current interval has no uses before the split position, it can + // safely be spilled. + debug!( + "split_and_spill {:?}: spilling it since no uses before split position", + id + ); + state.spill(id); + id + } + }; + + // Split until the next register use. + match next_use(&state.intervals.get(child), split_pos, &state.reg_uses) { + Some(next_use_pos) => { + debug!( + "split spilled interval before next use @ {:?}", + next_use_pos + ); + let child = split(state, child, next_use_pos); + state.insert_unhandled(child); + } + None => { + // Let it be spilled for the rest of its lifetime. + } + } + + // In both cases, the spilled child interval can remain on the stack. + debug!("spilled split child {:?} silently expires", child); +} + +/// Try to find a (use) position where to split the interval until the next point at which it +/// becomes unavailable, and put it back into the queue of intervals to allocate later on. Returns +/// true if it succeeded in finding such a position, false otherwise. +fn try_split_regs<F: Function>( + state: &mut State<F>, + id: IntId, + available_until: InstPoint, +) -> bool { + state.stats.as_mut().map(|stats| stats.num_reg_splits += 1); + + // Find a position for the split: we'll iterate backwards from the point until the register is + // available, down to the previous use of the current interval. + let prev_use = match last_use(&state.intervals.get(id), available_until, &state.reg_uses) { + Some(prev_use) => prev_use, + None => state.intervals.get(id).start, + }; + + let split_pos = if state.opts.partial_split_near_end { + // Split at the position closest to the available_until position. + let pos = match available_until.pt() { + Point::Use => prev_pos(prev_pos(available_until)), + Point::Def => prev_pos(available_until), + _ => unreachable!(), + }; + if pos <= prev_use { + return false; + } + pos + } else { + // Split at the position closest to the prev_use position. If it was a def, we can split + // just thereafter, if it was at a use, go to the next use. + let pos = match prev_use.pt() { + Point::Use => next_pos(next_pos(prev_use)), + Point::Def => next_pos(prev_use), + _ => unreachable!(), + }; + if pos >= available_until { + return false; + } + pos + }; + + let child = split(state, id, split_pos); + state.insert_unhandled(child); + + state + .stats + .as_mut() + .map(|stats| stats.num_reg_splits_success += 1); + + true +} + +/// Splits the interval at the given position. +/// +/// The split position must either be a Def of the current vreg, or it must be +/// at a Use position (otherwise there's no place to put the moves created by +/// the split). +/// +/// The id of the new interval is returned, while the parent interval is mutated +/// in place. The child interval starts after (including) at_pos. 
+#[inline(never)] +fn split<F: Function>(state: &mut State<F>, id: IntId, at_pos: InstPoint) -> IntId { + debug!("split {:?} at {:?}", id, at_pos); + trace!("interval: {}", state.intervals.get(id)); + + let int = state.intervals.get(id); + debug_assert!(int.start <= at_pos, "must split after the start"); + debug_assert!(at_pos <= int.end, "must split before the end"); + + // We're splitting in the middle of a fragment: [L, R]. + // Split it into two fragments: parent [L, pos[ + child [pos, R]. + debug_assert!(int.start < int.end, "trying to split unit fragment"); + debug_assert!(int.start <= at_pos, "no space to split fragment"); + + let parent_start = int.start; + let parent_end = prev_pos(at_pos); + let child_start = at_pos; + let child_end = int.end; + + trace!( + "split fragment [{:?}; {:?}] into two parts: [{:?}; {:?}] to [{:?}; {:?}]", + int.start, + int.end, + parent_start, + parent_end, + child_start, + child_end + ); + + debug_assert!(parent_start <= parent_end); + debug_assert!(parent_end <= child_start); + debug_assert!(child_start <= child_end); + + let vreg = int.vreg; + let ancestor = int.ancestor; + + let parent_mentions = state.intervals.get_mut(id).mentions_mut(); + let index = parent_mentions.binary_search_by(|mention| { + // The comparator function returns the position of the argument compared to the target. + + // Search by index first. + let iix = mention.0; + if iix < at_pos.iix() { + return Ordering::Less; + } + if iix > at_pos.iix() { + return Ordering::Greater; + } + + // The instruction index is the same. Consider the instruction side now, and compare it + // with the set. For the purpose of LSRA, mod means use and def. + let set = mention.1; + if at_pos.pt() == Point::Use { + if set.is_use_or_mod() { + Ordering::Equal + } else { + // It has to be Mod or Def. We need to look more to the right of the seeked array. + // Thus indicate this mention is after the target. + Ordering::Greater + } + } else { + debug_assert!(at_pos.pt() == Point::Def); + if set.is_mod_or_def() { + Ordering::Equal + } else { + // Look to the left. + Ordering::Less + } + } + }); + + let (index, may_need_fixup) = match index { + Ok(index) => (index, true), + Err(index) => (index, false), + }; + + // Emulate split_off for SmallVec here. + let mut child_mentions = MentionMap::with_capacity(parent_mentions.len() - index); + for mention in parent_mentions.iter().skip(index) { + child_mentions.push(mention.clone()); + } + parent_mentions.truncate(index); + + // In the situation where we split at the def point of an instruction, and the mention set + // contains the use point, we need to refine the sets: + // - the parent must still contain the use point (and the modified point if present) + // - the child must only contain the def point (and the modified point if present). + // Note that if we split at the use point of an instruction, and the mention set contains the + // def point, it is fine: we're not splitting between the two of them. 
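The `split_off` emulation above (copy the tail into the child, then truncate the parent) is easy to check in isolation. A standalone version over a plain `Vec`, assuming, as the comment above suggests, that the `SmallVec` version in use offers no built-in `split_off`:

```rust
/// Standalone version of the "copy the tail, then truncate" split applied to
/// the mention list above, written over a plain Vec for clarity.
fn split_off_at<T: Clone>(parent: &mut Vec<T>, index: usize) -> Vec<T> {
    // Copy everything from `index` onwards into the child...
    let child: Vec<T> = parent[index..].to_vec();
    // ...then shrink the parent so the two halves don't overlap.
    parent.truncate(index);
    child
}

fn main() {
    let mut parent = vec![(10, "use"), (12, "def"), (15, "use")];
    // Split before the mention at instruction 15.
    let child = split_off_at(&mut parent, 2);
    assert_eq!(parent, vec![(10, "use"), (12, "def")]);
    assert_eq!(child, vec![(15, "use")]);
}
```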
+ if may_need_fixup && at_pos.pt() == Point::Def && child_mentions.first().unwrap().1.is_use() { + let first_child_mention = child_mentions.first_mut().unwrap(); + first_child_mention.1.remove_use(); + + let last_parent_mention = parent_mentions.last_mut().unwrap(); + last_parent_mention.1.add_use(); + + if first_child_mention.1.is_mod() { + last_parent_mention.1.add_mod(); + } + } + + let child_id = IntId(state.intervals.num_virtual_intervals()); + let mut child_int = + VirtualInterval::new(child_id, vreg, child_start, child_end, child_mentions); + child_int.parent = Some(id); + child_int.ancestor = ancestor; + + state.intervals.push_interval(child_int); + + state.intervals.get_mut(id).end = parent_end; + state.intervals.set_child(id, child_id); + + if log_enabled!(Level::Trace) { + trace!("split results:"); + trace!("- {}", state.intervals.get(id)); + trace!("- {}", state.intervals.get(child_id)); + } + + child_id +} + +fn _build_mention_map(reg_uses: &RegUses) -> HashMap<Reg, MentionMap> { + // Maps reg to its mentions. + let mut reg_mentions: HashMap<Reg, MentionMap> = HashMap::default(); + + // Collect all the mentions. + for i in 0..reg_uses.num_insns() { + let iix = InstIx::new(i as u32); + let regsets = reg_uses.get_reg_sets_for_iix(iix); + debug_assert!(regsets.is_sanitized()); + + for reg in regsets.uses.iter() { + let mentions = reg_mentions.entry(*reg).or_default(); + if mentions.is_empty() || mentions.last().unwrap().0 != iix { + mentions.push((iix, Mention::new())); + } + mentions.last_mut().unwrap().1.add_use(); + } + + for reg in regsets.mods.iter() { + let mentions = reg_mentions.entry(*reg).or_default(); + if mentions.is_empty() || mentions.last().unwrap().0 != iix { + mentions.push((iix, Mention::new())); + } + mentions.last_mut().unwrap().1.add_mod(); + } + + for reg in regsets.defs.iter() { + let mentions = reg_mentions.entry(*reg).or_default(); + if mentions.is_empty() || mentions.last().unwrap().0 != iix { + mentions.push((iix, Mention::new())); + } + mentions.last_mut().unwrap().1.add_def(); + } + } + + reg_mentions +} diff --git a/third_party/rust/regalloc/src/linear_scan/mod.rs b/third_party/rust/regalloc/src/linear_scan/mod.rs new file mode 100644 index 0000000000..44c92e2e7a --- /dev/null +++ b/third_party/rust/regalloc/src/linear_scan/mod.rs @@ -0,0 +1,807 @@ +//! pub(crate) Implementation of the linear scan allocator algorithm. +//! +//! This tries to follow the implementation as suggested by: +//! Optimized Interval Splitting in a Linear Scan Register Allocator, +//! 
by Wimmer et al., 2005 + +use log::{info, log_enabled, trace, Level}; + +use std::default; +use std::env; +use std::fmt; + +use crate::data_structures::{BlockIx, InstIx, InstPoint, Point, RealReg, RegVecsAndBounds}; +use crate::inst_stream::{add_spills_reloads_and_moves, InstToInsertAndExtPoint}; +use crate::{ + checker::CheckerContext, reg_maps::MentionRegUsageMapper, Function, RealRegUniverse, + RegAllocError, RegAllocResult, RegClass, Set, SpillSlot, VirtualReg, NUM_REG_CLASSES, +}; + +use analysis::{AnalysisInfo, RangeFrag}; +use smallvec::SmallVec; + +mod analysis; +mod assign_registers; +mod resolve_moves; + +#[derive(Default)] +pub(crate) struct Statistics { + only_large: bool, + + num_fixed: usize, + num_vregs: usize, + num_virtual_ranges: usize, + + peak_active: usize, + peak_inactive: usize, + + num_try_allocate_reg: usize, + num_try_allocate_reg_success: usize, + + num_reg_splits: usize, + num_reg_splits_success: usize, +} + +impl Drop for Statistics { + fn drop(&mut self) { + if self.only_large && self.num_vregs < 1000 { + return; + } + println!( + "stats: {} fixed; {} vreg; {} vranges; {} peak-active; {} peak-inactive, {} direct-alloc; {} total-alloc; {} partial-splits; {} partial-splits-attempts", + self.num_fixed, + self.num_vregs, + self.num_virtual_ranges, + self.peak_active, + self.peak_inactive, + self.num_try_allocate_reg_success, + self.num_try_allocate_reg, + self.num_reg_splits_success, + self.num_reg_splits, + ); + } +} + +/// Which strategy should we use when trying to find the best split position? +/// TODO Consider loop depth to avoid splitting in the middle of a loop +/// whenever possible. +#[derive(Copy, Clone, Debug)] +enum OptimalSplitStrategy { + From, + To, + NextFrom, + NextNextFrom, + PrevTo, + PrevPrevTo, + Mid, +} + +#[derive(Clone)] +pub struct LinearScanOptions { + split_strategy: OptimalSplitStrategy, + partial_split: bool, + partial_split_near_end: bool, + stats: bool, + large_stats: bool, +} + +impl default::Default for LinearScanOptions { + fn default() -> Self { + // Useful for debugging. + let optimal_split_strategy = match env::var("LSRA_SPLIT") { + Ok(s) => match s.as_str() { + "t" | "to" => OptimalSplitStrategy::To, + "n" => OptimalSplitStrategy::NextFrom, + "nn" => OptimalSplitStrategy::NextNextFrom, + "p" => OptimalSplitStrategy::PrevTo, + "pp" => OptimalSplitStrategy::PrevPrevTo, + "m" | "mid" => OptimalSplitStrategy::Mid, + _ => OptimalSplitStrategy::From, + }, + Err(_) => OptimalSplitStrategy::From, + }; + + let large_stats = env::var("LSRA_LARGE_STATS").is_ok(); + let stats = env::var("LSRA_STATS").is_ok() || large_stats; + + let partial_split = env::var("LSRA_PARTIAL").is_ok(); + let partial_split_near_end = env::var("LSRA_PARTIAL_END").is_ok(); + + Self { + split_strategy: optimal_split_strategy, + partial_split, + partial_split_near_end, + stats, + large_stats, + } + } +} + +impl fmt::Debug for LinearScanOptions { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + writeln!(fmt, "linear scan")?; + write!(fmt, " split: {:?}", self.split_strategy) + } +} + +// Local shorthands. +type RegUses = RegVecsAndBounds; + +/// A unique identifier for an interval. 
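Since `LinearScanOptions::default()` above reads all of its knobs from environment variables (`LSRA_SPLIT`, `LSRA_STATS`, `LSRA_LARGE_STATS`, `LSRA_PARTIAL`, `LSRA_PARTIAL_END`), the split strategy and statistics can be toggled without code changes. A hypothetical usage sketch; in practice these variables would normally be exported by the shell running the embedding compiler rather than set from Rust:

```rust
fn main() {
    // These names match the variables parsed by `LinearScanOptions::default()`
    // above; setting them from Rust is for illustration only and should happen
    // before any other threads are running.
    std::env::set_var("LSRA_SPLIT", "mid"); // split in the middle of the interval
    std::env::set_var("LSRA_STATS", "1");   // print per-run statistics on drop

    // Any later call to `LinearScanOptions::default()` (directly or through the
    // allocator entry point, depending on how the crate is embedded) will now
    // pick up these settings.
}
```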
+#[derive(Clone, Copy, PartialEq, Eq)] +struct IntId(pub(crate) usize); + +impl fmt::Debug for IntId { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "int{}", self.0) + } +} + +#[derive(Clone)] +struct FixedInterval { + reg: RealReg, + frags: Vec<RangeFrag>, +} + +impl fmt::Display for FixedInterval { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "fixed {:?} [", self.reg)?; + for (i, frag) in self.frags.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "({:?}, {:?})", frag.first, frag.last)?; + } + write!(f, "]") + } +} + +#[derive(Clone)] +pub(crate) struct VirtualInterval { + id: IntId, + vreg: VirtualReg, + + /// Parent interval in the split tree. + parent: Option<IntId>, + ancestor: Option<IntId>, + /// Child interval, if it has one, in the split tree. + child: Option<IntId>, + + /// Location assigned to this live interval. + location: Location, + + mentions: MentionMap, + start: InstPoint, + end: InstPoint, +} + +impl fmt::Display for VirtualInterval { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "virtual {:?}", self.id)?; + if let Some(ref p) = self.parent { + write!(fmt, " (parent={:?})", p)?; + } + write!( + fmt, + ": {:?} {} [{:?}; {:?}]", + self.vreg, self.location, self.start, self.end + ) + } +} + +impl VirtualInterval { + fn new( + id: IntId, + vreg: VirtualReg, + start: InstPoint, + end: InstPoint, + mentions: MentionMap, + ) -> Self { + Self { + id, + vreg, + parent: None, + ancestor: None, + child: None, + location: Location::None, + mentions, + start, + end, + } + } + fn mentions(&self) -> &MentionMap { + &self.mentions + } + fn mentions_mut(&mut self) -> &mut MentionMap { + &mut self.mentions + } + fn covers(&self, pos: InstPoint) -> bool { + self.start <= pos && pos <= self.end + } +} + +/// This data structure tracks the mentions of a register (virtual or real) at a precise +/// instruction point. It's a set encoded as three flags, one for each of use/mod/def. +#[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct Mention(u8); + +impl fmt::Debug for Mention { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let mut comma = false; + if self.0 & 1 == 1 { + write!(fmt, "use")?; + comma = true; + } + if (self.0 >> 1) & 1 == 1 { + if comma { + write!(fmt, ",")?; + } + write!(fmt, "mod")?; + comma = true; + } + if (self.0 >> 2) & 1 == 1 { + if comma { + write!(fmt, ",")?; + } + write!(fmt, "def")?; + } + Ok(()) + } +} + +impl Mention { + fn new() -> Self { + Self(0) + } + + // Setters. + fn add_use(&mut self) { + self.0 |= 1 << 0; + } + fn add_mod(&mut self) { + self.0 |= 1 << 1; + } + fn add_def(&mut self) { + self.0 |= 1 << 2; + } + + fn remove_use(&mut self) { + self.0 &= !(1 << 0); + } + + // Getters. 
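The getters that follow test the same three bits set above (bit 0 for use, bit 1 for mod, bit 2 for def). A tiny standalone replica of that flag layout, using a hypothetical `MiniMention` type rather than the crate's `Mention`:

```rust
/// Standalone replica of the three-bit mention encoding used above:
/// bit 0 = use, bit 1 = mod, bit 2 = def.
#[derive(Clone, Copy, Default)]
struct MiniMention(u8);

impl MiniMention {
    fn add_use(&mut self) { self.0 |= 0b001; }
    fn add_mod(&mut self) { self.0 |= 0b010; }
    fn add_def(&mut self) { self.0 |= 0b100; }
    fn is_use_or_mod(self) -> bool { self.0 & 0b011 != 0 }
    fn is_mod_or_def(self) -> bool { self.0 & 0b110 != 0 }
}

fn main() {
    let mut m = MiniMention::default();
    m.add_mod();
    // A "mod" counts both as a read (use-or-mod) and as a write (mod-or-def),
    // which is how the allocator treats modified registers.
    assert!(m.is_use_or_mod() && m.is_mod_or_def());

    let mut d = MiniMention::default();
    d.add_use();
    d.add_def();
    assert!(d.is_use_or_mod() && d.is_mod_or_def());
    assert_eq!(d.0, 0b101); // use and def bits set, mod bit clear
}
```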
+ fn is_use(&self) -> bool { + (self.0 & 0b001) != 0 + } + fn is_mod(&self) -> bool { + (self.0 & 0b010) != 0 + } + fn is_def(&self) -> bool { + (self.0 & 0b100) != 0 + } + fn is_use_or_mod(&self) -> bool { + (self.0 & 0b011) != 0 + } + fn is_mod_or_def(&self) -> bool { + (self.0 & 0b110) != 0 + } +} + +pub type MentionMap = SmallVec<[(InstIx, Mention); 2]>; + +#[derive(Debug, Clone, Copy)] +pub(crate) enum Location { + None, + Reg(RealReg), + Stack(SpillSlot), +} + +impl Location { + pub(crate) fn reg(&self) -> Option<RealReg> { + match self { + Location::Reg(reg) => Some(*reg), + _ => None, + } + } + pub(crate) fn spill(&self) -> Option<SpillSlot> { + match self { + Location::Stack(slot) => Some(*slot), + _ => None, + } + } + pub(crate) fn is_none(&self) -> bool { + match self { + Location::None => true, + _ => false, + } + } +} + +impl fmt::Display for Location { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Location::None => write!(fmt, "none"), + Location::Reg(reg) => write!(fmt, "{:?}", reg), + Location::Stack(slot) => write!(fmt, "{:?}", slot), + } + } +} + +/// A group of live intervals. +pub struct Intervals { + virtuals: Vec<VirtualInterval>, + fixeds: Vec<FixedInterval>, +} + +impl Intervals { + fn get(&self, int_id: IntId) -> &VirtualInterval { + &self.virtuals[int_id.0] + } + fn get_mut(&mut self, int_id: IntId) -> &mut VirtualInterval { + &mut self.virtuals[int_id.0] + } + fn num_virtual_intervals(&self) -> usize { + self.virtuals.len() + } + + // Mutators. + fn set_reg(&mut self, int_id: IntId, reg: RealReg) { + let int = self.get_mut(int_id); + debug_assert!(int.location.is_none()); + int.location = Location::Reg(reg); + } + fn set_spill(&mut self, int_id: IntId, slot: SpillSlot) { + let int = self.get_mut(int_id); + debug_assert!(int.location.spill().is_none()); + int.location = Location::Stack(slot); + } + fn push_interval(&mut self, int: VirtualInterval) { + debug_assert!(int.id.0 == self.virtuals.len()); + self.virtuals.push(int); + } + fn set_child(&mut self, int_id: IntId, child_id: IntId) { + if let Some(prev_child) = self.virtuals[int_id.0].child.clone() { + self.virtuals[child_id.0].child = Some(prev_child); + self.virtuals[prev_child.0].parent = Some(child_id); + } + self.virtuals[int_id.0].child = Some(child_id); + } +} + +/// Finds the first use for the current interval that's located after the given +/// `pos` (included), in a broad sense of use (any of use, def or mod). +/// +/// Extends to the left, that is, "modified" means "used". +#[inline(never)] +fn next_use(interval: &VirtualInterval, pos: InstPoint, _reg_uses: &RegUses) -> Option<InstPoint> { + if log_enabled!(Level::Trace) { + trace!("find next use of {} after {:?}", interval, pos); + } + + let mentions = interval.mentions(); + let target = InstPoint::max(pos, interval.start); + + let ret = match mentions.binary_search_by_key(&target.iix(), |mention| mention.0) { + Ok(index) => { + // Either the selected index is a perfect match, or the next mention is + // the correct answer. 
+ let mention = &mentions[index]; + if target.pt() == Point::Use { + if mention.1.is_use_or_mod() { + Some(InstPoint::new_use(mention.0)) + } else { + Some(InstPoint::new_def(mention.0)) + } + } else if target.pt() == Point::Def && mention.1.is_mod_or_def() { + Some(target) + } else if index == mentions.len() - 1 { + None + } else { + let mention = &mentions[index + 1]; + if mention.1.is_use_or_mod() { + Some(InstPoint::new_use(mention.0)) + } else { + Some(InstPoint::new_def(mention.0)) + } + } + } + + Err(index) => { + if index == mentions.len() { + None + } else { + let mention = &mentions[index]; + if mention.1.is_use_or_mod() { + Some(InstPoint::new_use(mention.0)) + } else { + Some(InstPoint::new_def(mention.0)) + } + } + } + }; + + // TODO once the mentions are properly split, this could be removed, in + // theory. + let ret = match ret { + Some(pos) => { + if pos <= interval.end { + Some(pos) + } else { + None + } + } + None => None, + }; + + ret +} + +/// Finds the last use of a vreg before a given target, including it in possible +/// return values. +/// Extends to the right, that is, modified means "def". +#[inline(never)] +fn last_use(interval: &VirtualInterval, pos: InstPoint, _reg_uses: &RegUses) -> Option<InstPoint> { + if log_enabled!(Level::Trace) { + trace!("searching last use of {} before {:?}", interval, pos,); + } + + let mentions = interval.mentions(); + + let target = InstPoint::min(pos, interval.end); + + let ret = match mentions.binary_search_by_key(&target.iix(), |mention| mention.0) { + Ok(index) => { + // Either the selected index is a perfect match, or the previous mention + // is the correct answer. + let mention = &mentions[index]; + if target.pt() == Point::Def { + if mention.1.is_mod_or_def() { + Some(InstPoint::new_def(mention.0)) + } else { + Some(InstPoint::new_use(mention.0)) + } + } else if target.pt() == Point::Use && mention.1.is_use() { + Some(target) + } else if index == 0 { + None + } else { + let mention = &mentions[index - 1]; + if mention.1.is_mod_or_def() { + Some(InstPoint::new_def(mention.0)) + } else { + Some(InstPoint::new_use(mention.0)) + } + } + } + + Err(index) => { + if index == 0 { + None + } else { + let mention = &mentions[index - 1]; + if mention.1.is_mod_or_def() { + Some(InstPoint::new_def(mention.0)) + } else { + Some(InstPoint::new_use(mention.0)) + } + } + } + }; + + // TODO once the mentions are properly split, this could be removed, in + // theory. + let ret = match ret { + Some(pos) => { + if pos >= interval.start { + Some(pos) + } else { + None + } + } + None => None, + }; + + trace!("mentions: {:?}", mentions); + trace!("found: {:?}", ret); + + ret +} + +/// Checks that each register class has its own scratch register in addition to one available +/// register, and creates a mapping of register class -> scratch register. +fn compute_scratches( + reg_universe: &RealRegUniverse, +) -> Result<Vec<Option<RealReg>>, RegAllocError> { + let mut scratches_by_rc = vec![None; NUM_REG_CLASSES]; + for i in 0..NUM_REG_CLASSES { + if let Some(info) = ®_universe.allocable_by_class[i] { + if info.first == info.last { + return Err(RegAllocError::Other( + "at least 2 registers required for linear scan".into(), + )); + } + let scratch = if let Some(suggested_reg) = info.suggested_scratch { + reg_universe.regs[suggested_reg].0 + } else { + return Err(RegAllocError::MissingSuggestedScratchReg( + RegClass::rc_from_u32(i as u32), + )); + }; + scratches_by_rc[i] = Some(scratch); + } + } + Ok(scratches_by_rc) +} + +/// Allocator top level. 
+/// +/// `func` is modified so that, when this function returns, it will contain no VirtualReg uses. +/// +/// Allocation can fail if there are insufficient registers to even generate spill/reload code, or +/// if the function appears to have any undefined VirtualReg/RealReg uses. +#[inline(never)] +pub(crate) fn run<F: Function>( + func: &mut F, + reg_universe: &RealRegUniverse, + use_checker: bool, + opts: &LinearScanOptions, +) -> Result<RegAllocResult<F>, RegAllocError> { + let AnalysisInfo { + reg_vecs_and_bounds: reg_uses, + intervals, + liveins, + liveouts, + .. + } = analysis::run(func, reg_universe).map_err(|err| RegAllocError::Analysis(err))?; + + let scratches_by_rc = compute_scratches(reg_universe)?; + + let stats = if opts.stats { + let mut stats = Statistics::default(); + stats.num_fixed = intervals.fixeds.len(); + stats.num_virtual_ranges = intervals.virtuals.len(); + stats.num_vregs = intervals + .virtuals + .iter() + .map(|virt| virt.vreg.get_index()) + .fold(0, |a, b| usize::max(a, b)); + stats.only_large = opts.large_stats; + Some(stats) + } else { + None + }; + + if log_enabled!(Level::Trace) { + trace!("fixed intervals:"); + for int in &intervals.fixeds { + trace!("{}", int); + } + trace!(""); + trace!("unassigned intervals:"); + for int in &intervals.virtuals { + trace!("{}", int); + for mention in &int.mentions { + trace!(" mention @ {:?}: {:?}", mention.0, mention.1); + } + } + trace!(""); + } + + let (intervals, mut num_spill_slots) = assign_registers::run( + opts, + func, + ®_uses, + reg_universe, + &scratches_by_rc, + intervals, + stats, + )?; + + let virtuals = &intervals.virtuals; + + let memory_moves = resolve_moves::run( + func, + ®_uses, + virtuals, + &liveins, + &liveouts, + &mut num_spill_slots, + &scratches_by_rc, + ); + + apply_registers( + func, + virtuals, + memory_moves, + reg_universe, + num_spill_slots, + use_checker, + ) +} + +#[inline(never)] +fn set_registers<F: Function>( + func: &mut F, + virtual_intervals: &Vec<VirtualInterval>, + reg_universe: &RealRegUniverse, + use_checker: bool, + memory_moves: &Vec<InstToInsertAndExtPoint>, +) -> Set<RealReg> { + info!("set_registers"); + + // Set up checker state, if indicated by our configuration. + let mut checker: Option<CheckerContext> = None; + let mut insn_blocks: Vec<BlockIx> = vec![]; + if use_checker { + checker = Some(CheckerContext::new( + func, + reg_universe, + memory_moves, + &[], + &[], + &[], + )); + insn_blocks.resize(func.insns().len(), BlockIx::new(0)); + for block_ix in func.blocks() { + for insn_ix in func.block_insns(block_ix) { + insn_blocks[insn_ix.get() as usize] = block_ix; + } + } + } + + let mut clobbered_registers = Set::empty(); + + // Collect all the regs per instruction and mention set. + let capacity = virtual_intervals + .iter() + .map(|int| int.mentions.len()) + .fold(0, |a, b| a + b); + + if capacity == 0 { + // No virtual registers have been allocated, exit early. + return clobbered_registers; + } + + let mut mention_map = Vec::with_capacity(capacity); + + for int in virtual_intervals { + let rreg = match int.location.reg() { + Some(rreg) => rreg, + _ => continue, + }; + trace!("int: {}", int); + trace!(" {:?}", int.mentions); + for &mention in &int.mentions { + mention_map.push((mention.0, mention.1, int.vreg, rreg)); + } + } + + // Sort by instruction index. + mention_map.sort_unstable_by_key(|quad| quad.0); + + // Iterate over all the mentions. 
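The loop that follows walks the instruction-sorted `mention_map` and flushes the mapper every time the instruction index changes, plus once more at the very end. The grouping pattern itself, reduced to a standalone sketch over plain tuples (names here are illustrative only):

```rust
/// Sketch of the "flush when the key changes, then flush the last group"
/// pattern used below: entries are sorted by instruction index, and each
/// run of equal indices is applied (here: printed) in one go.
fn flush_groups(entries: &[(u32, char)]) {
    if entries.is_empty() {
        return;
    }
    let mut pending: Vec<char> = Vec::new();
    let mut prev_key = entries[0].0;
    for &(key, value) in entries {
        if key != prev_key {
            println!("inst {}: apply {:?}", prev_key, pending);
            pending.clear();
            prev_key = key;
        }
        pending.push(value);
    }
    // Don't forget the final group, mirroring the trailing `flush_inst` call.
    println!("inst {}: apply {:?}", prev_key, pending);
}

fn main() {
    // Already sorted by instruction index, like `mention_map`.
    flush_groups(&[(1, 'a'), (1, 'b'), (4, 'c')]);
}
```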
+ let mut mapper = MentionRegUsageMapper::new(); + + let flush_inst = |func: &mut F, + mapper: &mut MentionRegUsageMapper, + iix: InstIx, + checker: Option<&mut CheckerContext>| { + trace!("map_regs for {:?}", iix); + let mut inst = func.get_insn_mut(iix); + F::map_regs(&mut inst, mapper); + + if let Some(checker) = checker { + let block_ix = insn_blocks[iix.get() as usize]; + checker + .handle_insn(reg_universe, func, block_ix, iix, mapper) + .unwrap(); + } + + mapper.clear(); + }; + + let mut prev_iix = mention_map[0].0; + for (iix, mention_set, vreg, rreg) in mention_map { + if prev_iix != iix { + // Flush previous instruction. + flush_inst(func, &mut mapper, prev_iix, checker.as_mut()); + prev_iix = iix; + } + + trace!( + "{:?}: {:?} is in {:?} at {:?}", + iix, + vreg, + rreg, + mention_set + ); + + // Fill in new information at the given index. + if mention_set.is_use() { + if let Some(prev_rreg) = mapper.lookup_use(vreg) { + debug_assert_eq!(prev_rreg, rreg, "different use allocs for {:?}", vreg); + } + mapper.set_use(vreg, rreg); + } + + let included_in_clobbers = func.is_included_in_clobbers(func.get_insn(iix)); + if mention_set.is_mod() { + if let Some(prev_rreg) = mapper.lookup_use(vreg) { + debug_assert_eq!(prev_rreg, rreg, "different use allocs for {:?}", vreg); + } + if let Some(prev_rreg) = mapper.lookup_def(vreg) { + debug_assert_eq!(prev_rreg, rreg, "different def allocs for {:?}", vreg); + } + + mapper.set_use(vreg, rreg); + mapper.set_def(vreg, rreg); + if included_in_clobbers { + clobbered_registers.insert(rreg); + } + } + + if mention_set.is_def() { + if let Some(prev_rreg) = mapper.lookup_def(vreg) { + debug_assert_eq!(prev_rreg, rreg, "different def allocs for {:?}", vreg); + } + + mapper.set_def(vreg, rreg); + if included_in_clobbers { + clobbered_registers.insert(rreg); + } + } + } + + // Flush last instruction. + flush_inst(func, &mut mapper, prev_iix, checker.as_mut()); + + clobbered_registers +} + +/// Fills in the register assignments into instructions. +#[inline(never)] +fn apply_registers<F: Function>( + func: &mut F, + virtual_intervals: &Vec<VirtualInterval>, + memory_moves: Vec<InstToInsertAndExtPoint>, + reg_universe: &RealRegUniverse, + num_spill_slots: u32, + use_checker: bool, +) -> Result<RegAllocResult<F>, RegAllocError> { + info!("apply_registers"); + + let clobbered_registers = set_registers( + func, + virtual_intervals, + reg_universe, + use_checker, + &memory_moves, + ); + + let safepoint_insns = vec![]; + let (final_insns, target_map, new_to_old_insn_map, new_safepoint_insns) = + add_spills_reloads_and_moves(func, &safepoint_insns, memory_moves) + .map_err(|e| RegAllocError::Other(e))?; + assert!(new_safepoint_insns.is_empty()); // because `safepoint_insns` is also empty. + + // And now remove from the clobbered registers set, all those not available to the allocator. + // But not removing the reserved regs, since we might have modified those. 
+ clobbered_registers.filter_map(|®| { + if reg.get_index() >= reg_universe.allocable { + None + } else { + Some(reg) + } + }); + + Ok(RegAllocResult { + insns: final_insns, + target_map, + orig_insn_map: new_to_old_insn_map, + clobbered_registers, + num_spill_slots, + block_annotations: None, + stackmaps: vec![], + new_safepoint_insns, + }) +} diff --git a/third_party/rust/regalloc/src/linear_scan/resolve_moves.rs b/third_party/rust/regalloc/src/linear_scan/resolve_moves.rs new file mode 100644 index 0000000000..8012404a86 --- /dev/null +++ b/third_party/rust/regalloc/src/linear_scan/resolve_moves.rs @@ -0,0 +1,889 @@ +use super::{next_use, IntId, Location, RegUses, VirtualInterval}; +use crate::{ + data_structures::{BlockIx, InstPoint, Point}, + inst_stream::{InstExtPoint, InstToInsert, InstToInsertAndExtPoint}, + sparse_set::SparseSet, + Function, RealReg, Reg, SpillSlot, TypedIxVec, VirtualReg, Writable, +}; + +use log::{debug, info, trace}; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use smallvec::SmallVec; +use std::fmt; + +fn resolve_moves_in_block<F: Function>( + func: &F, + intervals: &Vec<VirtualInterval>, + reg_uses: &RegUses, + scratches_by_rc: &[Option<RealReg>], + spill_slot: &mut u32, + moves_in_blocks: &mut Vec<InstToInsertAndExtPoint>, + tmp_ordered_moves: &mut Vec<MoveOp>, + tmp_stack: &mut Vec<MoveOp>, +) { + let mut block_ends = HashSet::default(); + let mut block_starts = HashSet::default(); + for bix in func.blocks() { + let insts = func.block_insns(bix); + block_ends.insert(insts.last()); + block_starts.insert(insts.first()); + } + + let mut reloads_at_inst = HashMap::default(); + let mut spills_at_inst = Vec::new(); + + for interval in intervals { + let parent_id = match interval.parent { + Some(pid) => pid, + None => { + // In unreachable code, it's possible that a given interval has no + // parents and is assigned to a stack location for its whole lifetime. + // + // In reachable code, the analysis only create intervals for virtual + // registers with at least one register use, so a parentless interval (= + // hasn't ever been split) can't live in a stack slot. + #[cfg(debug_assertions)] + debug_assert!( + interval.location.spill().is_none() + || (next_use(interval, InstPoint::min_value(), reg_uses,).is_none()) + ); + continue; + } + }; + + let parent = &intervals[parent_id.0]; + + // If this is a move between blocks, handle it as such. + if parent.end.pt() == Point::Def + && interval.start.pt() == Point::Use + && block_ends.contains(&parent.end.iix()) + && block_starts.contains(&interval.start.iix()) + { + continue; + } + + let child_start = interval.start; + let vreg = interval.vreg; + + match interval.location { + Location::None => panic!("interval has no location after regalloc!"), + + Location::Reg(rreg) => { + // Reconnect with the parent location, by adding a move if needed. + if let Some(next_use) = next_use(interval, child_start, reg_uses) { + // No need to reload before a new definition. 
+ if next_use.pt() == Point::Def { + continue; + } + }; + + let mut at_inst = child_start; + match at_inst.pt() { + Point::Use => { + at_inst.set_pt(Point::Reload); + } + Point::Def => { + at_inst.set_pt(Point::Spill); + } + _ => unreachable!(), + } + + let entry = reloads_at_inst.entry(at_inst).or_insert_with(|| Vec::new()); + + match parent.location { + Location::None => unreachable!(), + + Location::Reg(from_rreg) => { + if from_rreg != rreg { + debug!( + "inblock fixup: {:?} move {:?} -> {:?} at {:?}", + interval.id, from_rreg, rreg, at_inst + ); + entry.push(MoveOp::new_move(from_rreg, rreg, vreg)); + } + } + + Location::Stack(spill) => { + debug!( + "inblock fixup: {:?} reload {:?} -> {:?} at {:?}", + interval.id, spill, rreg, at_inst + ); + entry.push(MoveOp::new_reload(spill, rreg, vreg)); + } + } + } + + Location::Stack(spill) => { + // This interval has been spilled (i.e. split). Spill after the last def or before + // the last use. + let mut at_inst = parent.end; + at_inst.set_pt(if at_inst.pt() == Point::Use { + Point::Reload + } else { + debug_assert!(at_inst.pt() == Point::Def); + Point::Spill + }); + + match parent.location { + Location::None => unreachable!(), + + Location::Reg(rreg) => { + debug!( + "inblock fixup: {:?} spill {:?} -> {:?} at {:?}", + interval.id, rreg, spill, at_inst + ); + spills_at_inst.push(InstToInsertAndExtPoint::new( + InstToInsert::Spill { + to_slot: spill, + from_reg: rreg, + for_vreg: Some(vreg), + }, + InstExtPoint::from_inst_point(at_inst), + )); + } + + Location::Stack(parent_spill) => { + debug_assert_eq!(parent_spill, spill); + } + } + } + } + } + + // Flush the memory moves caused by in-block fixups. Conceptually, the spills + // must happen after the right locations have been set, that is, after the + // reloads. Reloads may include several moves that must happen in parallel + // (e.g. if two real regs must be swapped), so process them first. Once all + // the parallel assignments have been done, push forward all the spills. 
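The swap case mentioned above is exactly why these reloads must be treated as one parallel assignment: emitting the two copies of a swap sequentially loses a value. A standalone sketch of the classic fix, saving one side to a temporary first, which is what `schedule_moves`/`emit_moves` later in this file do with a spill slot playing the role of the temporary:

```rust
/// Why parallel moves can't be emitted naively: a swap written as two
/// sequential copies destroys one of the values. The fix is to save one side
/// to a temporary before it is overwritten.
fn main() {
    // Registers modelled as plain variables.
    let (mut r0, mut r1) = (10, 20);

    // Naive sequential emission of the parallel moves {r1 := r0, r0 := r1}:
    //   r1 = r0; r0 = r1;   // both registers would end up holding 10.

    // Cycle-aware emission: save r1 before it is overwritten.
    let tmp = r1; // "spill" the about-to-be-clobbered value
    r1 = r0;      // first move of the cycle
    r0 = tmp;     // close the cycle from the saved copy

    assert_eq!((r0, r1), (20, 10));
}
```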
+ for (at_inst, mut pending_moves) in reloads_at_inst { + schedule_moves(&mut pending_moves, tmp_ordered_moves, tmp_stack); + emit_moves( + at_inst, + &tmp_ordered_moves, + spill_slot, + scratches_by_rc, + moves_in_blocks, + ); + } + + moves_in_blocks.append(&mut spills_at_inst); +} + +#[derive(Clone, Copy)] +enum BlockPos { + Start, + End, +} + +#[derive(Default, Clone)] +struct BlockInfo { + start: SmallVec<[(VirtualReg, IntId); 4]>, + end: SmallVec<[(VirtualReg, IntId); 4]>, +} + +static UNSORTED_THRESHOLD: usize = 8; + +impl BlockInfo { + #[inline(never)] + fn insert(&mut self, pos: BlockPos, vreg: VirtualReg, id: IntId) { + match pos { + BlockPos::Start => { + #[cfg(debug_assertions)] + debug_assert!(self.start.iter().find(|prev| prev.0 == vreg).is_none()); + self.start.push((vreg, id)); + } + BlockPos::End => { + #[cfg(debug_assertions)] + debug_assert!(self.end.iter().find(|prev| prev.0 == vreg).is_none()); + self.end.push((vreg, id)); + } + } + } + + #[inline(never)] + fn finish(&mut self) { + if self.start.len() >= UNSORTED_THRESHOLD { + self.start.sort_unstable_by_key(|pair| pair.0); + } + if self.end.len() >= UNSORTED_THRESHOLD { + self.end.sort_unstable_by_key(|pair| pair.0); + } + } + + #[inline(never)] + fn lookup(&self, pos: BlockPos, vreg: &VirtualReg) -> IntId { + let array = match pos { + BlockPos::Start => &self.start, + BlockPos::End => &self.end, + }; + if array.len() >= UNSORTED_THRESHOLD { + array[array.binary_search_by_key(vreg, |pair| pair.0).unwrap()].1 + } else { + array + .iter() + .find(|el| el.0 == *vreg) + .expect("should have found target reg") + .1 + } + } +} + +/// For each block, collect a mapping of block_{start, end} -> actual location, to make the +/// across-blocks fixup phase fast. +#[inline(never)] +fn collect_block_infos<F: Function>( + func: &F, + intervals: &Vec<VirtualInterval>, + liveins: &TypedIxVec<BlockIx, SparseSet<Reg>>, + liveouts: &TypedIxVec<BlockIx, SparseSet<Reg>>, +) -> Vec<BlockInfo> { + // First, collect the first and last instructions of each block. + let mut block_start_and_ends = Vec::with_capacity(2 * func.blocks().len()); + for bix in func.blocks() { + let insts = func.block_insns(bix); + block_start_and_ends.push((InstPoint::new_use(insts.first()), BlockPos::Start, bix)); + block_start_and_ends.push((InstPoint::new_def(insts.last()), BlockPos::End, bix)); + } + + // Sort this array by instruction point, to be able to do binary search later. + block_start_and_ends.sort_unstable_by_key(|pair| pair.0); + + // Preallocate the block information, with the final size of each vector. + let mut infos = Vec::with_capacity(func.blocks().len()); + for bix in func.blocks() { + infos.push(BlockInfo { + start: SmallVec::with_capacity(liveins[bix].card()), + end: SmallVec::with_capacity(liveouts[bix].card()), + }); + } + + // For each interval: + // - find the first block start or end instruction that's in the interval, with a binary search + // on the previous array. + // - add an entry for each livein ou liveout variable in the block info. + for int in intervals { + let mut i = match block_start_and_ends.binary_search_by_key(&int.start, |pair| pair.0) { + Ok(i) => i, + Err(i) => i, + }; + + let vreg = int.vreg; + let id = int.id; + + while let Some(&(inst, pos, bix)) = block_start_and_ends.get(i) { + if inst > int.end { + break; + } + + #[cfg(debug_assertions)] + debug_assert!(int.covers(inst)); + + // Skip virtual registers that are not live-in (at start) or live-out (at end). 
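As an aside, `BlockInfo` above uses a small hybrid lookup: vectors shorter than `UNSORTED_THRESHOLD` stay unsorted and are scanned linearly, while longer ones are sorted once in `finish` and then binary-searched. A standalone sketch of that pattern over plain integer keys (the `TinyMap` type is hypothetical):

```rust
/// Standalone sketch of the hybrid lookup used by `BlockInfo` above: short
/// maps stay unsorted and are scanned linearly, longer ones are sorted once
/// and then binary-searched.
const THRESHOLD: usize = 8;

struct TinyMap {
    entries: Vec<(u32, u32)>, // (key, value)
}

impl TinyMap {
    fn finish(&mut self) {
        if self.entries.len() >= THRESHOLD {
            self.entries.sort_unstable_by_key(|pair| pair.0);
        }
    }

    fn lookup(&self, key: u32) -> Option<u32> {
        if self.entries.len() >= THRESHOLD {
            self.entries
                .binary_search_by_key(&key, |pair| pair.0)
                .ok()
                .map(|idx| self.entries[idx].1)
        } else {
            self.entries.iter().find(|pair| pair.0 == key).map(|pair| pair.1)
        }
    }
}

fn main() {
    let mut map = TinyMap { entries: vec![(7, 70), (3, 30)] };
    map.finish(); // below the threshold: stays unsorted, linear scan is used
    assert_eq!(map.lookup(3), Some(30));
    assert_eq!(map.lookup(9), None);
}
```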
+ match pos { + BlockPos::Start => { + if !liveins[bix].contains(vreg.to_reg()) { + i += 1; + continue; + } + } + BlockPos::End => { + if !liveouts[bix].contains(vreg.to_reg()) { + i += 1; + continue; + } + } + } + + infos[bix.get() as usize].insert(pos, vreg, id); + i += 1; + } + } + + for info in infos.iter_mut() { + info.finish(); + } + + infos +} + +/// Figure the sequence of parallel moves to insert at block boundaries: +/// - for each block +/// - for each liveout vreg in this block +/// - for each successor of this block +/// - if the locations allocated in the block and its successor don't +/// match, insert a pending move from one location to the other. +/// +/// Once that's done: +/// - resolve cycles in the pending moves +/// - generate real moves from the pending moves. +#[inline(never)] +fn resolve_moves_across_blocks<F: Function>( + func: &F, + liveins: &TypedIxVec<BlockIx, SparseSet<Reg>>, + liveouts: &TypedIxVec<BlockIx, SparseSet<Reg>>, + intervals: &Vec<VirtualInterval>, + scratches_by_rc: &[Option<RealReg>], + spill_slot: &mut u32, + moves_at_block_starts: &mut Vec<InstToInsertAndExtPoint>, + moves_at_block_ends: &mut Vec<InstToInsertAndExtPoint>, + tmp_ordered_moves: &mut Vec<MoveOp>, + tmp_stack: &mut Vec<MoveOp>, +) { + let mut parallel_move_map = HashMap::default(); + + let block_info = collect_block_infos(func, intervals, liveins, liveouts); + + let mut seen_successors = HashSet::default(); + for block in func.blocks() { + let successors = func.block_succs(block); + + // Where to insert the fixup move, if needed? If there's more than one + // successor to the current block, inserting in the current block will + // impact all the successors. + // + // We assume critical edges have been split, so + // if the current block has more than one successor, then its successors + // have at most one predecessor. + let cur_has_one_succ = successors.len() == 1; + + for ® in liveouts[block].iter() { + let vreg = if let Some(vreg) = reg.as_virtual_reg() { + vreg + } else { + continue; + }; + + seen_successors.clear(); + + let cur_id = block_info[block.get() as usize].lookup(BlockPos::End, &vreg); + let cur_int = &intervals[cur_id.0]; + let loc_at_cur_end = cur_int.location; + + for &succ in successors.iter() { + if !liveins[succ].contains(reg) { + // This variable isn't live in this block. + continue; + } + if !seen_successors.insert(succ) { + continue; + } + + let succ_id = block_info[succ.get() as usize].lookup(BlockPos::Start, &vreg); + let succ_int = &intervals[succ_id.0]; + + // If the two intervals aren't related to the same virtual range, then the move is + // not required. + if cur_int.ancestor != succ_int.ancestor { + continue; + } + + let loc_at_succ_start = succ_int.location; + + let (at_inst, block_pos) = if cur_has_one_succ { + // Before the control flow instruction. 
+ let pos = InstPoint::new_reload(func.block_insns(block).last()); + (pos, BlockPos::End) + } else { + let pos = InstPoint::new_reload(func.block_insns(succ).first()); + (pos, BlockPos::Start) + }; + + let pending_moves = parallel_move_map + .entry(at_inst) + .or_insert_with(|| (Vec::new(), block_pos)); + + match (loc_at_cur_end, loc_at_succ_start) { + (Location::Reg(cur_rreg), Location::Reg(succ_rreg)) => { + if cur_rreg == succ_rreg { + continue; + } + debug!( + "boundary fixup: move {:?} -> {:?} at {:?} for {:?} between {:?} and {:?}", + cur_rreg, + succ_rreg, + at_inst, + vreg, + block, + succ + ); + pending_moves + .0 + .push(MoveOp::new_move(cur_rreg, succ_rreg, vreg)); + } + + (Location::Reg(cur_rreg), Location::Stack(spillslot)) => { + debug!( + "boundary fixup: spill {:?} -> {:?} at {:?} for {:?} between {:?} and {:?}", + cur_rreg, + spillslot, + at_inst, + vreg, + block, + succ + ); + pending_moves + .0 + .push(MoveOp::new_spill(cur_rreg, spillslot, vreg)); + } + + (Location::Stack(spillslot), Location::Reg(rreg)) => { + debug!( + "boundary fixup: reload {:?} -> {:?} at {:?} for {:?} between {:?} and {:?}", + spillslot, + rreg, + at_inst, + vreg, + block, + succ + ); + pending_moves + .0 + .push(MoveOp::new_reload(spillslot, rreg, vreg)); + } + + (Location::Stack(left_spill_slot), Location::Stack(right_spill_slot)) => { + // Stack to stack should not happen here, since two ranges for the + // same vreg can't be intersecting, so the same stack slot ought to + // be reused in this case. + debug_assert_eq!( + left_spill_slot, right_spill_slot, + "Moves from stack to stack only happen on the same vreg, thus the same stack slot" + ); + continue; + } + + (_, _) => { + panic!("register or stack slots must have been allocated."); + } + }; + } + } + + // Flush the memory moves caused by block fixups for this block. + for (at_inst, (move_insts, block_pos)) in parallel_move_map.iter_mut() { + schedule_moves(move_insts, tmp_ordered_moves, tmp_stack); + + match block_pos { + BlockPos::Start => { + emit_moves( + *at_inst, + &tmp_ordered_moves, + spill_slot, + scratches_by_rc, + moves_at_block_starts, + ); + } + BlockPos::End => { + emit_moves( + *at_inst, + &tmp_ordered_moves, + spill_slot, + scratches_by_rc, + moves_at_block_ends, + ); + } + }; + } + + parallel_move_map.clear(); + } + + debug!(""); +} + +#[inline(never)] +pub(crate) fn run<F: Function>( + func: &F, + reg_uses: &RegUses, + intervals: &Vec<VirtualInterval>, + liveins: &TypedIxVec<BlockIx, SparseSet<Reg>>, + liveouts: &TypedIxVec<BlockIx, SparseSet<Reg>>, + spill_slot: &mut u32, + scratches_by_rc: &[Option<RealReg>], +) -> Vec<InstToInsertAndExtPoint> { + info!("resolve_moves"); + + // Keep three lists of moves to insert: + // - moves across blocks, that must happen at the start of blocks, + // - moves within a given block, + // - moves across blocks, that must happen at the end of blocks. + // + // To maintain the property that these moves are eventually sorted at the end, we'll compute + // the final array of moves by concatenating these three arrays. `inst_stream` uses a stable + // sort, making sure the at-block-start/within-block/at-block-end will be respected. 
+ let mut moves_at_block_starts = Vec::new(); + let mut moves_at_block_ends = Vec::new(); + let mut moves_in_blocks = Vec::new(); + + let mut tmp_stack = Vec::new(); + let mut tmp_ordered_moves = Vec::new(); + resolve_moves_in_block( + func, + intervals, + reg_uses, + scratches_by_rc, + spill_slot, + &mut moves_in_blocks, + &mut tmp_ordered_moves, + &mut tmp_stack, + ); + + resolve_moves_across_blocks( + func, + liveins, + liveouts, + intervals, + scratches_by_rc, + spill_slot, + &mut moves_at_block_starts, + &mut moves_at_block_ends, + &mut tmp_ordered_moves, + &mut tmp_stack, + ); + + let mut insts_and_points = moves_at_block_starts; + insts_and_points.reserve(moves_in_blocks.len() + moves_at_block_ends.len()); + insts_and_points.append(&mut moves_in_blocks); + insts_and_points.append(&mut moves_at_block_ends); + + insts_and_points +} + +#[derive(PartialEq, Debug)] +enum MoveOperand { + Reg(RealReg), + Stack(SpillSlot), +} + +impl MoveOperand { + fn aliases(&self, other: &Self) -> bool { + self == other + } +} + +struct MoveOp { + from: MoveOperand, + to: MoveOperand, + vreg: VirtualReg, + cycle_begin: Option<usize>, + cycle_end: Option<usize>, +} + +impl fmt::Debug for MoveOp { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "{:?}: {:?} -> {:?}", self.vreg, self.from, self.to)?; + if let Some(ref begin) = self.cycle_begin { + write!(fmt, ", start of cycle #{}", begin)?; + } + if let Some(ref end) = self.cycle_end { + write!(fmt, ", end of cycle #{}", end)?; + } + Ok(()) + } +} + +impl MoveOp { + fn new_move(from: RealReg, to: RealReg, vreg: VirtualReg) -> Self { + Self { + from: MoveOperand::Reg(from), + to: MoveOperand::Reg(to), + vreg, + cycle_begin: None, + cycle_end: None, + } + } + + fn new_spill(from: RealReg, to: SpillSlot, vreg: VirtualReg) -> Self { + Self { + from: MoveOperand::Reg(from), + to: MoveOperand::Stack(to), + vreg, + cycle_begin: None, + cycle_end: None, + } + } + + fn new_reload(from: SpillSlot, to: RealReg, vreg: VirtualReg) -> Self { + Self { + from: MoveOperand::Stack(from), + to: MoveOperand::Reg(to), + vreg, + cycle_begin: None, + cycle_end: None, + } + } + + fn gen_inst(&self) -> InstToInsert { + match self.from { + MoveOperand::Reg(from) => match self.to { + MoveOperand::Reg(to) => InstToInsert::Move { + to_reg: Writable::from_reg(to), + from_reg: from, + for_vreg: self.vreg, + }, + MoveOperand::Stack(to) => InstToInsert::Spill { + to_slot: to, + from_reg: from, + for_vreg: Some(self.vreg), + }, + }, + MoveOperand::Stack(from) => match self.to { + MoveOperand::Reg(to) => InstToInsert::Reload { + to_reg: Writable::from_reg(to), + from_slot: from, + for_vreg: Some(self.vreg), + }, + MoveOperand::Stack(_to) => unreachable!("stack to stack move"), + }, + } + } +} + +fn find_blocking_move<'a>( + pending: &'a mut Vec<MoveOp>, + last: &MoveOp, +) -> Option<(usize, &'a mut MoveOp)> { + for (i, other) in pending.iter_mut().enumerate() { + if other.from.aliases(&last.to) { + return Some((i, other)); + } + } + None +} + +fn find_cycled_move<'a>( + stack: &'a mut Vec<MoveOp>, + from: &mut usize, + last: &MoveOp, +) -> Option<&'a mut MoveOp> { + for i in *from..stack.len() { + *from += 1; + let other = &stack[i]; + if other.from.aliases(&last.to) { + return Some(&mut stack[i]); + } + } + None +} + +/// Given a pending list of moves, returns a list of moves ordered in a correct +/// way, i.e., no move clobbers another one. 
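The ordering rule implemented by the function below can be stated simply: a move may be emitted once no still-pending move reads from its destination; when that never becomes true there is a cycle, which the real code breaks with a temporary. A greedy standalone sketch of just the safety test (it reports cycles as `None` instead of breaking them):

```rust
/// Greedy sketch of the ordering rule behind `schedule_moves`: emit a move
/// only when no still-pending move reads its destination. This sketch does
/// not break cycles; the real function resolves them with a spill slot.
fn order_moves(mut pending: Vec<(char, char)>) -> Option<Vec<(char, char)>> {
    let mut ordered = Vec::new();
    while !pending.is_empty() {
        // Find a move whose destination is not read by any pending move.
        let idx = pending
            .iter()
            .position(|&(_, to)| !pending.iter().any(|&(from, _)| from == to))?;
        ordered.push(pending.remove(idx));
    }
    Some(ordered)
}

fn main() {
    // (from, to): B->C must come before A->B, otherwise B is clobbered early.
    let moves = vec![('A', 'B'), ('B', 'C')];
    assert_eq!(order_moves(moves), Some(vec![('B', 'C'), ('A', 'B')]));

    // A pure swap has no safe first move: a temporary is required.
    assert_eq!(order_moves(vec![('A', 'B'), ('B', 'A')]), None);
}
```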
+#[inline(never)] +fn schedule_moves( + pending: &mut Vec<MoveOp>, + ordered_moves: &mut Vec<MoveOp>, + stack: &mut Vec<MoveOp>, +) { + ordered_moves.clear(); + + let mut num_cycles = 0; + let mut cur_cycles = 0; + + trace!("pending moves: {:#?}", pending); + + while let Some(pm) = pending.pop() { + trace!("handling pending move {:?}", pm); + debug_assert!( + pm.from != pm.to, + "spurious moves should not have been inserted" + ); + + stack.clear(); + stack.push(pm); + + while !stack.is_empty() { + let blocking_pair = find_blocking_move(pending, stack.last().unwrap()); + + if let Some((blocking_idx, blocking)) = blocking_pair { + trace!("found blocker: {:?}", blocking); + let mut stack_cur = 0; + + let has_cycles = + if let Some(mut cycled) = find_cycled_move(stack, &mut stack_cur, blocking) { + trace!("found cycle: {:?}", cycled); + debug_assert!(cycled.cycle_end.is_none()); + cycled.cycle_end = Some(cur_cycles); + true + } else { + false + }; + + if has_cycles { + loop { + match find_cycled_move(stack, &mut stack_cur, blocking) { + Some(ref mut cycled) => { + trace!("found more cycles ending on blocker: {:?}", cycled); + debug_assert!(cycled.cycle_end.is_none()); + cycled.cycle_end = Some(cur_cycles); + } + None => break, + } + } + + debug_assert!(blocking.cycle_begin.is_none()); + blocking.cycle_begin = Some(cur_cycles); + cur_cycles += 1; + } + + let blocking = pending.remove(blocking_idx); + stack.push(blocking); + } else { + // There's no blocking move! We can push this in the ordered list of + // moves. + // TODO IonMonkey has more optimizations for this case. + let last = stack.pop().unwrap(); + ordered_moves.push(last); + } + } + + if num_cycles < cur_cycles { + num_cycles = cur_cycles; + } + cur_cycles = 0; + } +} + +#[inline(never)] +fn emit_moves( + at_inst: InstPoint, + ordered_moves: &Vec<MoveOp>, + num_spill_slots: &mut u32, + scratches_by_rc: &[Option<RealReg>], + moves_in_blocks: &mut Vec<InstToInsertAndExtPoint>, +) { + let mut spill_slot = None; + let mut in_cycle = false; + + trace!("emit_moves"); + + for mov in ordered_moves { + if let Some(_) = &mov.cycle_end { + debug_assert!(in_cycle); + + // There is some pattern: + // (A -> B) + // (B -> A) + // This case handles (B -> A), which we reach last. We emit a move from + // the saved value of B, to A. 
+ match mov.to { + MoveOperand::Reg(dst_reg) => { + let inst = InstToInsert::Reload { + to_reg: Writable::from_reg(dst_reg), + from_slot: spill_slot.expect("should have a cycle spill slot"), + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + trace!( + "finishing cycle: {:?} -> {:?}", + spill_slot.unwrap(), + dst_reg + ); + } + MoveOperand::Stack(dst_spill) => { + let scratch = scratches_by_rc[mov.vreg.get_class() as usize] + .expect("missing scratch reg"); + let inst = InstToInsert::Reload { + to_reg: Writable::from_reg(scratch), + from_slot: spill_slot.expect("should have a cycle spill slot"), + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + let inst = InstToInsert::Spill { + to_slot: dst_spill, + from_reg: scratch, + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + trace!( + "finishing cycle: {:?} -> {:?} -> {:?}", + spill_slot.unwrap(), + scratch, + dst_spill + ); + } + }; + + in_cycle = false; + continue; + } + + if let Some(_) = &mov.cycle_begin { + debug_assert!(!in_cycle); + + // There is some pattern: + // (A -> B) + // (B -> A) + // This case handles (A -> B), which we reach first. We save B, then allow + // the original move to continue. + match spill_slot { + Some(_) => {} + None => { + spill_slot = Some(SpillSlot::new(*num_spill_slots)); + *num_spill_slots += 1; + } + } + + match mov.to { + MoveOperand::Reg(src_reg) => { + let inst = InstToInsert::Spill { + to_slot: spill_slot.unwrap(), + from_reg: src_reg, + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + trace!("starting cycle: {:?} -> {:?}", src_reg, spill_slot.unwrap()); + } + MoveOperand::Stack(src_spill) => { + let scratch = scratches_by_rc[mov.vreg.get_class() as usize] + .expect("missing scratch reg"); + let inst = InstToInsert::Reload { + to_reg: Writable::from_reg(scratch), + from_slot: src_spill, + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + let inst = InstToInsert::Spill { + to_slot: spill_slot.expect("should have a cycle spill slot"), + from_reg: scratch, + for_vreg: Some(mov.vreg), + }; + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + trace!( + "starting cycle: {:?} -> {:?} -> {:?}", + src_spill, + scratch, + spill_slot.unwrap() + ); + } + }; + + in_cycle = true; + } + + // A normal move which is not part of a cycle. + let inst = mov.gen_inst(); + moves_in_blocks.push(InstToInsertAndExtPoint::new( + inst, + InstExtPoint::from_inst_point(at_inst), + )); + trace!("moving {:?} -> {:?}", mov.from, mov.to); + } +} diff --git a/third_party/rust/regalloc/src/pretty_print.rs b/third_party/rust/regalloc/src/pretty_print.rs new file mode 100644 index 0000000000..8f01c0766e --- /dev/null +++ b/third_party/rust/regalloc/src/pretty_print.rs @@ -0,0 +1,56 @@ +//! Pretty-printing for the main data structures. + +use crate::data_structures::WritableBase; +use crate::{RealRegUniverse, Reg, Writable}; + +/// A trait for printing instruction bits and pieces, with the the ability to take a +/// contextualising `RealRegUniverse` that is used to give proper names to registers. 
+pub trait PrettyPrint { + /// Return a string that shows the implementing object in context of the given + /// `RealRegUniverse`, if provided. + fn show_rru(&self, maybe_reg_universe: Option<&RealRegUniverse>) -> String; +} + +/// Same as `PrettyPrint`, but can also take a size hint into account to specialize the displayed +/// string. +pub trait PrettyPrintSized: PrettyPrint { + /// The same as |show_rru|, but with an optional hint giving a size in bytes. Its + /// interpretation is object-dependent, and it is intended to pass around enough information to + /// facilitate printing sub-parts of real registers correctly. Objects may ignore size hints + /// that are irrelevant to them. + /// + /// The default implementation ignores the size hint. + fn show_rru_sized(&self, maybe_reg_universe: Option<&RealRegUniverse>, _size: u8) -> String { + self.show_rru(maybe_reg_universe) + } +} + +impl PrettyPrint for Reg { + fn show_rru(&self, maybe_reg_universe: Option<&RealRegUniverse>) -> String { + if self.is_real() { + if let Some(rru) = maybe_reg_universe { + let reg_ix = self.get_index(); + assert!( + reg_ix < rru.regs.len(), + "unknown real register with index {:?}", + reg_ix + ); + return rru.regs[reg_ix].1.to_string(); + } + } + // The reg is virtual, or we have no universe. Be generic. + format!("%{:?}", self) + } +} + +impl<R: PrettyPrint + WritableBase> PrettyPrint for Writable<R> { + fn show_rru(&self, maybe_reg_universe: Option<&RealRegUniverse>) -> String { + self.to_reg().show_rru(maybe_reg_universe) + } +} + +impl<R: PrettyPrintSized + WritableBase> PrettyPrintSized for Writable<R> { + fn show_rru_sized(&self, maybe_reg_universe: Option<&RealRegUniverse>, size: u8) -> String { + self.to_reg().show_rru_sized(maybe_reg_universe, size) + } +} diff --git a/third_party/rust/regalloc/src/reg_maps.rs b/third_party/rust/regalloc/src/reg_maps.rs new file mode 100644 index 0000000000..f65ea372a3 --- /dev/null +++ b/third_party/rust/regalloc/src/reg_maps.rs @@ -0,0 +1,347 @@ +use crate::{RealReg, RegUsageMapper, VirtualReg}; +use smallvec::SmallVec; +use std::mem; + +/// This data structure holds the mappings needed to map an instruction's uses, mods and defs from +/// virtual to real registers. +/// +/// It remembers the sets of mappings (of a virtual register to a real register) over time, based +/// on precise virtual ranges and their allocations. +/// +/// This is the right implementation to use when a register allocation algorithm keeps track of +/// precise virtual ranges, and maintains them over time. +#[derive(Debug)] +pub struct VrangeRegUsageMapper { + /// Dense vector-map indexed by virtual register number. This is consulted + /// directly for use-queries and augmented with the overlay for def-queries. + slots: Vec<RealReg>, + + /// Overlay for def-queries. This is a set of updates that occurs "during" + /// the instruction in question, and will be applied to the slots array + /// once we are done processing this instruction (in preparation for + /// the next one). + overlay: SmallVec<[(VirtualReg, RealReg); 16]>, +} + +impl VrangeRegUsageMapper { + /// Allocate a reg-usage mapper with the given predicted vreg capacity. + pub(crate) fn new(vreg_capacity: usize) -> VrangeRegUsageMapper { + VrangeRegUsageMapper { + slots: Vec::with_capacity(vreg_capacity), + overlay: SmallVec::new(), + } + } + + /// Is the overlay past the sorted-size threshold? 
+ fn is_overlay_large_enough_to_sort(&self) -> bool { + // Use the SmallVec spill-to-heap threshold as a threshold for "large + // enough to sort"; this has the effect of amortizing the cost of + // sorting along with the cost of copying out to heap memory, and also + // ensures that when we access heap (more likely to miss in cache), we + // do it with O(log N) accesses instead of O(N). + self.overlay.spilled() + } + + /// Update the overlay. + pub(crate) fn set_overlay(&mut self, vreg: VirtualReg, rreg: Option<RealReg>) { + let rreg = rreg.unwrap_or(RealReg::invalid()); + self.overlay.push((vreg, rreg)); + } + + /// Finish updates to the overlay, sorting if necessary. + pub(crate) fn finish_overlay(&mut self) { + if self.overlay.len() == 0 || !self.is_overlay_large_enough_to_sort() { + return; + } + + // Sort stably, so that later updates continue to come after earlier + // ones. + self.overlay.sort_by_key(|pair| pair.0); + // Remove duplicates by collapsing runs of same-vreg pairs down to + // the last one. + let mut last_vreg = self.overlay[0].0; + let mut out = 0; + for i in 1..self.overlay.len() { + let this_vreg = self.overlay[i].0; + if this_vreg != last_vreg { + out += 1; + } + if i != out { + self.overlay[out] = self.overlay[i]; + } + last_vreg = this_vreg; + } + let new_len = out + 1; + self.overlay.truncate(new_len); + } + + /// Merge the overlay into the main map. + pub(crate) fn merge_overlay(&mut self) { + // Take the SmallVec and swap with empty to allow `&mut self` method + // call below. + let mappings = mem::replace(&mut self.overlay, SmallVec::new()); + for (vreg, rreg) in mappings.into_iter() { + self.set_direct_internal(vreg, rreg); + } + } + + /// Make a direct update to the mapping. Only usable when the overlay + /// is empty. + pub(crate) fn set_direct(&mut self, vreg: VirtualReg, rreg: Option<RealReg>) { + debug_assert!(self.overlay.is_empty()); + let rreg = rreg.unwrap_or(RealReg::invalid()); + self.set_direct_internal(vreg, rreg); + } + + fn set_direct_internal(&mut self, vreg: VirtualReg, rreg: RealReg) { + let idx = vreg.get_index(); + if idx >= self.slots.len() { + self.slots.resize(idx + 1, RealReg::invalid()); + } + self.slots[idx] = rreg; + } + + /// Perform a lookup directly in the main map. Returns `None` for + /// not-present. + fn lookup_direct(&self, vreg: VirtualReg) -> Option<RealReg> { + let idx = vreg.get_index(); + if idx >= self.slots.len() { + None + } else { + Some(self.slots[idx]) + } + } + + /// Perform a lookup in the overlay. Returns `None` for not-present. No + /// fallback to main map (that happens in callers). Returns `Some` even + /// if mapped to `RealReg::invalid()`, because this is a tombstone + /// (represents deletion) in the overlay. + fn lookup_overlay(&self, vreg: VirtualReg) -> Option<RealReg> { + if self.is_overlay_large_enough_to_sort() { + // Do a binary search; we are guaranteed to have at most one + // matching because duplicates were collapsed after sorting. + if let Ok(idx) = self.overlay.binary_search_by_key(&vreg, |pair| pair.0) { + return Some(self.overlay[idx].1); + } + } else { + // Search in reverse order to find later updates first. + for &(this_vreg, this_rreg) in self.overlay.iter().rev() { + if this_vreg == vreg { + return Some(this_rreg); + } + } + } + None + } + + /// Sanity check: check that all slots are empty. Typically for use at the + /// end of processing as a debug-assert. 
+ pub(crate) fn is_empty(&self) -> bool { + self.overlay.iter().all(|pair| pair.1.is_invalid()) + && self.slots.iter().all(|rreg| rreg.is_invalid()) + } +} + +impl RegUsageMapper for VrangeRegUsageMapper { + /// Return the `RealReg` if mapped, or `None`, for `vreg` occuring as a use + /// on the current instruction. + fn get_use(&self, vreg: VirtualReg) -> Option<RealReg> { + self.lookup_direct(vreg) + // Convert Some(RealReg::invalid()) to None. + .and_then(|reg| reg.maybe_valid()) + } + + /// Return the `RealReg` if mapped, or `None`, for `vreg` occuring as a def + /// on the current instruction. + fn get_def(&self, vreg: VirtualReg) -> Option<RealReg> { + self.lookup_overlay(vreg) + .or_else(|| self.lookup_direct(vreg)) + // Convert Some(RealReg::invalid()) to None. + .and_then(|reg| reg.maybe_valid()) + } + + /// Return the `RealReg` if mapped, or `None`, for a `vreg` occuring as a + /// mod on the current instruction. + fn get_mod(&self, vreg: VirtualReg) -> Option<RealReg> { + let result = self.get_use(vreg); + debug_assert_eq!(result, self.get_def(vreg)); + result + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{Reg, RegClass, VirtualReg}; + + fn vreg(idx: u32) -> VirtualReg { + Reg::new_virtual(RegClass::I64, idx).to_virtual_reg() + } + fn rreg(idx: u8) -> RealReg { + Reg::new_real(RegClass::I64, /* enc = */ 0, /* index = */ idx).to_real_reg() + } + + #[test] + fn test_reg_use_mapper() { + let mut mapper = VrangeRegUsageMapper::new(/* estimated vregs = */ 16); + assert_eq!(None, mapper.get_use(vreg(0))); + assert_eq!(None, mapper.get_def(vreg(0))); + assert_eq!(None, mapper.get_mod(vreg(0))); + + mapper.set_direct(vreg(0), Some(rreg(1))); + mapper.set_direct(vreg(1), Some(rreg(2))); + + assert_eq!(Some(rreg(1)), mapper.get_use(vreg(0))); + assert_eq!(Some(rreg(1)), mapper.get_def(vreg(0))); + assert_eq!(Some(rreg(1)), mapper.get_mod(vreg(0))); + assert_eq!(Some(rreg(2)), mapper.get_use(vreg(1))); + assert_eq!(Some(rreg(2)), mapper.get_def(vreg(1))); + assert_eq!(Some(rreg(2)), mapper.get_mod(vreg(1))); + + mapper.set_overlay(vreg(0), Some(rreg(3))); + mapper.set_overlay(vreg(2), Some(rreg(4))); + mapper.finish_overlay(); + + assert_eq!(Some(rreg(1)), mapper.get_use(vreg(0))); + assert_eq!(Some(rreg(3)), mapper.get_def(vreg(0))); + // vreg 0 not valid for mod (use and def differ). + assert_eq!(Some(rreg(2)), mapper.get_use(vreg(1))); + assert_eq!(Some(rreg(2)), mapper.get_def(vreg(1))); + assert_eq!(Some(rreg(2)), mapper.get_mod(vreg(1))); + assert_eq!(None, mapper.get_use(vreg(2))); + assert_eq!(Some(rreg(4)), mapper.get_def(vreg(2))); + // vreg 2 not valid for mod (use and def differ). + + mapper.merge_overlay(); + assert_eq!(Some(rreg(3)), mapper.get_use(vreg(0))); + assert_eq!(Some(rreg(2)), mapper.get_use(vreg(1))); + assert_eq!(Some(rreg(4)), mapper.get_use(vreg(2))); + assert_eq!(None, mapper.get_use(vreg(3))); + + // Check tombstoning behavior. + mapper.set_overlay(vreg(0), None); + mapper.finish_overlay(); + assert_eq!(Some(rreg(3)), mapper.get_use(vreg(0))); + assert_eq!(None, mapper.get_def(vreg(0))); + mapper.merge_overlay(); + + // Check large (sorted) overlay mode. 
+ for i in (2..50).rev() { + mapper.set_overlay(vreg(i), Some(rreg((i + 100) as u8))); + } + mapper.finish_overlay(); + assert_eq!(None, mapper.get_use(vreg(0))); + assert_eq!(Some(rreg(2)), mapper.get_use(vreg(1))); + assert_eq!(Some(rreg(4)), mapper.get_use(vreg(2))); + for i in 2..50 { + assert_eq!(Some(rreg((i + 100) as u8)), mapper.get_def(vreg(i))); + } + mapper.merge_overlay(); + + for i in (0..100).rev() { + mapper.set_overlay(vreg(i), None); + } + mapper.finish_overlay(); + for i in 0..100 { + assert_eq!(None, mapper.get_def(vreg(i))); + } + assert_eq!(false, mapper.is_empty()); + mapper.merge_overlay(); + assert_eq!(true, mapper.is_empty()); + + // Check multiple-update behavior in small mode. + mapper.set_overlay(vreg(1), Some(rreg(1))); + mapper.set_overlay(vreg(1), Some(rreg(2))); + mapper.finish_overlay(); + assert_eq!(Some(rreg(2)), mapper.get_def(vreg(1))); + mapper.merge_overlay(); + assert_eq!(Some(rreg(2)), mapper.get_use(vreg(1))); + + mapper.set_overlay(vreg(1), Some(rreg(2))); + mapper.set_overlay(vreg(1), None); + mapper.finish_overlay(); + assert_eq!(None, mapper.get_def(vreg(1))); + mapper.merge_overlay(); + assert_eq!(None, mapper.get_use(vreg(1))); + + // Check multiple-update behavior in sorted mode. + for i in 0..100 { + mapper.set_overlay(vreg(2), Some(rreg(i))); + } + for i in 0..100 { + mapper.set_overlay(vreg(2), Some(rreg(2 * i))); + } + mapper.finish_overlay(); + assert_eq!(Some(rreg(198)), mapper.get_def(vreg(2))); + mapper.merge_overlay(); + assert_eq!(Some(rreg(198)), mapper.get_use(vreg(2))); + + for i in 0..100 { + mapper.set_overlay(vreg(2), Some(rreg(i))); + } + for _ in 0..100 { + mapper.set_overlay(vreg(2), None); + } + mapper.finish_overlay(); + assert_eq!(None, mapper.get_def(vreg(50))); + mapper.merge_overlay(); + assert_eq!(None, mapper.get_use(vreg(50))); + } +} + +/// This implementation of RegUsageMapper relies on explicit mentions of vregs in instructions. The +/// caller must keep them, and for each instruction: +/// +/// - clear the previous mappings, using `clear()`, +/// - feed the mappings from vregs to rregs for uses and defs, with `set_use`/`set_def`, +/// - then call the `Function::map_regs` function with this structure. +/// +/// This avoids a lot of resizes, and makes it possible for algorithms that don't have precise live +/// ranges to fill in vreg -> rreg mappings. +#[derive(Debug)] +pub struct MentionRegUsageMapper { + /// Sparse vector-map indexed by virtual register number. This is consulted for use-queries. + uses: SmallVec<[(VirtualReg, RealReg); 8]>, + + /// Sparse vector-map indexed by virtual register number. This is consulted for def-queries. 
+ defs: SmallVec<[(VirtualReg, RealReg); 8]>, +} + +impl MentionRegUsageMapper { + pub(crate) fn new() -> Self { + Self { + uses: SmallVec::new(), + defs: SmallVec::new(), + } + } + pub(crate) fn clear(&mut self) { + self.uses.clear(); + self.defs.clear(); + } + pub(crate) fn lookup_use(&self, vreg: VirtualReg) -> Option<RealReg> { + self.uses.iter().find(|&pair| pair.0 == vreg).map(|x| x.1) + } + pub(crate) fn lookup_def(&self, vreg: VirtualReg) -> Option<RealReg> { + self.defs.iter().find(|&pair| pair.0 == vreg).map(|x| x.1) + } + pub(crate) fn set_use(&mut self, vreg: VirtualReg, rreg: RealReg) { + self.uses.push((vreg, rreg)); + } + pub(crate) fn set_def(&mut self, vreg: VirtualReg, rreg: RealReg) { + self.defs.push((vreg, rreg)); + } +} + +impl RegUsageMapper for MentionRegUsageMapper { + fn get_use(&self, vreg: VirtualReg) -> Option<RealReg> { + return self.lookup_use(vreg); + } + fn get_def(&self, vreg: VirtualReg) -> Option<RealReg> { + return self.lookup_def(vreg); + } + fn get_mod(&self, vreg: VirtualReg) -> Option<RealReg> { + let result = self.lookup_use(vreg); + debug_assert_eq!(result, self.lookup_def(vreg)); + return result; + } +} diff --git a/third_party/rust/regalloc/src/snapshot.rs b/third_party/rust/regalloc/src/snapshot.rs new file mode 100644 index 0000000000..7442805ebb --- /dev/null +++ b/third_party/rust/regalloc/src/snapshot.rs @@ -0,0 +1,320 @@ +//! Snapshotting facilities. +//! +//! This makes it possible to save one entire IR input in a generic form that encapsulates all the +//! constraints, so as to be replayed only in the regalloc.rs environment. The main structure, +//! `GenericFunction`, can be created from any type implementing `Function`, acting as a generic +//! Function wrapper. Its layout is simple enough that it can be optionally serialized and +//! deserialized, making it easy to transfer test cases from regalloc.rs users to the crate's +//! maintainers. 
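+//!
+//! A hedged round-trip sketch (the serde backend is the embedder's choice;
+//! `bincode` and `Options::default()` below are assumptions, not crate
+//! requirements, and the `enable-serde` feature must be on):
+//!
+//! ```ignore
+//! let snap = IRSnapshot::from_function(&func, &reg_universe);
+//! let bytes = bincode::serialize(&snap).unwrap();            // attach this blob to a bug report
+//! let mut snap: IRSnapshot = bincode::deserialize(&bytes).unwrap();
+//! let _result = snap.allocate(Options::default()).unwrap();  // replay inside regalloc.rs
+//! ```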
+ +use crate::data_structures::RegVecs; +use crate::*; +use std::borrow::Cow; + +#[cfg(feature = "enable-serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +enum IRInstKind { + Spill { vreg: Option<VirtualReg> }, + Reload { vreg: Option<VirtualReg> }, + Move { vreg: VirtualReg }, + ZeroLenNop, + UserReturn, + UserMove, + UserOther, +} + +#[derive(Clone, Debug)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct IRInst { + reg_uses: Vec<Reg>, + reg_mods: Vec<Writable<Reg>>, + reg_defs: Vec<Writable<Reg>>, + kind: IRInstKind, +} + +#[derive(Clone)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct IRFunction { + instructions: Vec<IRInst>, + block_ranges: Vec<Range<InstIx>>, + block_succs: Vec<Vec<BlockIx>>, + entry_block: BlockIx, + liveins: Set<RealReg>, + liveouts: Set<RealReg>, + vreg_spill_slot_sizes: Vec<Option<(u32, RegClass)>>, + num_vregs: usize, +} + +#[derive(Clone)] +#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))] +pub struct IRSnapshot { + reg_universe: RealRegUniverse, + func: IRFunction, +} + +impl IRSnapshot { + fn new_inst<F: Function>(func: &F, ix: InstIx, inst: &F::Inst) -> IRInst { + let mut reg_vecs = RegVecs::new(/* sanitized */ false); + + let mut collector = RegUsageCollector::new(&mut reg_vecs); + F::get_regs(inst, &mut collector); + + let kind = if let Some((_wreg, _reg)) = func.is_move(inst) { + IRInstKind::UserMove + } else if func.is_ret(ix) { + IRInstKind::UserReturn + } else { + IRInstKind::UserOther + }; + + IRInst { + reg_uses: reg_vecs.uses, + reg_mods: reg_vecs + .mods + .into_iter() + .map(|reg| Writable::from_reg(reg)) + .collect(), + reg_defs: reg_vecs + .defs + .into_iter() + .map(|reg| Writable::from_reg(reg)) + .collect(), + kind, + } + } + + pub fn from_function<F: Function>(func: &F, reg_universe: &RealRegUniverse) -> Self { + let instructions: Vec<IRInst> = func + .insns() + .iter() + .enumerate() + .map(|(ix, inst)| IRSnapshot::new_inst(func, InstIx::new(ix as u32), inst)) + .collect(); + + let mut block_ranges = Vec::new(); + let mut block_succs = Vec::new(); + for block in func.blocks() { + block_ranges.push(func.block_insns(block)); + block_succs.push(func.block_succs(block).into()); + } + + let vreg_spill_slot_sizes = { + let mut array: Vec<Option<(u32, RegClass)>> = Vec::new(); + + let mut handle_reg = |reg: &Reg| { + if let Some(vreg) = reg.as_virtual_reg() { + let rc = vreg.get_class(); + let spill_slot_size = func.get_spillslot_size(rc, vreg); + let index = vreg.get_index(); + if index >= array.len() { + array.resize(index + 1, None); + } + let entry = &mut array[vreg.get_index()]; + match entry { + None => *entry = Some((spill_slot_size, rc)), + Some((prev_size, prev_rc)) => { + assert_eq!(*prev_rc, rc); + assert_eq!(*prev_size, spill_slot_size); + } + } + } + }; + + for inst in &instructions { + for reg in &inst.reg_uses { + handle_reg(reg); + } + for reg in &inst.reg_mods { + handle_reg(®.to_reg()); + } + for reg in &inst.reg_defs { + handle_reg(®.to_reg()); + } + } + + array + }; + + let entry_block = func.entry_block(); + let liveins = func.func_liveins(); + let liveouts = func.func_liveouts(); + + Self { + reg_universe: reg_universe.clone(), + func: IRFunction { + instructions, + block_ranges, + block_succs, + entry_block, + liveins, + liveouts, + vreg_spill_slot_sizes, + num_vregs: func.get_num_vregs(), + }, + } + } + + pub fn allocate(&mut self, opts: 
Options) -> Result<RegAllocResult<IRFunction>, RegAllocError> { + allocate_registers_with_opts( + &mut self.func, + &self.reg_universe, + None, /*no stackmap request*/ + opts, + ) + } +} + +impl Function for IRFunction { + type Inst = IRInst; + + // Liveins, liveouts. + fn func_liveins(&self) -> Set<RealReg> { + self.liveins.clone() + } + fn func_liveouts(&self) -> Set<RealReg> { + self.liveouts.clone() + } + fn get_num_vregs(&self) -> usize { + self.num_vregs + } + + // Instructions. + fn insns(&self) -> &[Self::Inst] { + &self.instructions + } + fn insns_mut(&mut self) -> &mut [Self::Inst] { + &mut self.instructions + } + fn get_insn(&self, insn: InstIx) -> &Self::Inst { + &self.instructions[insn.get() as usize] + } + fn get_insn_mut(&mut self, insn: InstIx) -> &mut Self::Inst { + &mut self.instructions[insn.get() as usize] + } + + fn is_ret(&self, insn: InstIx) -> bool { + let inst = &self.instructions[insn.get() as usize]; + if let IRInstKind::UserReturn = inst.kind { + true + } else { + false + } + } + + fn is_move(&self, insn: &Self::Inst) -> Option<(Writable<Reg>, Reg)> { + if let IRInstKind::UserMove = insn.kind { + let from = insn.reg_uses[0]; + let to = insn.reg_defs[0]; + Some((to, from)) + } else { + None + } + } + + // Blocks. + fn blocks(&self) -> Range<BlockIx> { + Range::new(BlockIx::new(0), self.block_ranges.len()) + } + fn entry_block(&self) -> BlockIx { + self.entry_block + } + fn block_insns(&self, block: BlockIx) -> Range<InstIx> { + self.block_ranges[block.get() as usize] + } + fn block_succs(&self, block: BlockIx) -> Cow<[BlockIx]> { + Cow::Borrowed(&self.block_succs[block.get() as usize]) + } + + fn get_regs(insn: &Self::Inst, collector: &mut RegUsageCollector) { + collector.add_uses(&insn.reg_uses); + collector.add_mods(&insn.reg_mods); + collector.add_defs(&insn.reg_defs); + } + + fn map_regs<RUM: RegUsageMapper>(insn: &mut Self::Inst, maps: &RUM) { + for reg_use in insn.reg_uses.iter_mut() { + if let Some(vreg) = reg_use.as_virtual_reg() { + *reg_use = maps.get_use(vreg).expect("missing alloc for use").to_reg(); + } + } + for reg_mod in insn.reg_mods.iter_mut() { + if let Some(vreg) = reg_mod.to_reg().as_virtual_reg() { + *reg_mod = + Writable::from_reg(maps.get_mod(vreg).expect("missing alloc for mod").to_reg()); + } + } + for reg_def in insn.reg_defs.iter_mut() { + if let Some(vreg) = reg_def.to_reg().as_virtual_reg() { + *reg_def = + Writable::from_reg(maps.get_def(vreg).expect("missing alloc for def").to_reg()); + } + } + } + + fn gen_spill( + &self, + _to_slot: SpillSlot, + from_reg: RealReg, + for_vreg: Option<VirtualReg>, + ) -> Self::Inst { + IRInst { + reg_uses: vec![from_reg.to_reg()], + reg_mods: vec![], + reg_defs: vec![], + kind: IRInstKind::Spill { vreg: for_vreg }, + } + } + fn gen_reload( + &self, + to_reg: Writable<RealReg>, + _from_slot: SpillSlot, + for_vreg: Option<VirtualReg>, + ) -> Self::Inst { + IRInst { + reg_uses: vec![], + reg_mods: vec![], + reg_defs: vec![Writable::from_reg(to_reg.to_reg().to_reg())], + kind: IRInstKind::Reload { vreg: for_vreg }, + } + } + fn gen_move( + &self, + to_reg: Writable<RealReg>, + from_reg: RealReg, + for_vreg: VirtualReg, + ) -> Self::Inst { + IRInst { + reg_uses: vec![from_reg.to_reg()], + reg_mods: vec![], + reg_defs: vec![Writable::from_reg(to_reg.to_reg().to_reg())], + kind: IRInstKind::Move { vreg: for_vreg }, + } + } + fn gen_zero_len_nop(&self) -> Self::Inst { + IRInst { + reg_uses: vec![], + reg_mods: vec![], + reg_defs: vec![], + kind: IRInstKind::ZeroLenNop, + } + } + + fn 
get_spillslot_size(&self, regclass: RegClass, for_vreg: VirtualReg) -> u32 { + let entry = + self.vreg_spill_slot_sizes[for_vreg.get_index()].expect("missing spillslot info"); + assert_eq!(entry.1, regclass); + return entry.0; + } + + fn maybe_direct_reload( + &self, + _insn: &Self::Inst, + _reg: VirtualReg, + _slot: SpillSlot, + ) -> Option<Self::Inst> { + unimplemented!(); + } +} diff --git a/third_party/rust/regalloc/src/sparse_set.rs b/third_party/rust/regalloc/src/sparse_set.rs new file mode 100644 index 0000000000..f07fab8792 --- /dev/null +++ b/third_party/rust/regalloc/src/sparse_set.rs @@ -0,0 +1,881 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! An implementation of sets which aims to be fast for both large sets and +//! very small sets, even if the elements are sparse relative to the universe. + +use rustc_hash::FxHashSet; +use std::fmt; +use std::hash::Hash; + +//============================================================================= +// SparseSet + +// Handy wrappers around `SparseSetU`, if you don't want to have to guess at an "optimal" +// in-line size. +pub type SparseSet<T> = SparseSetU<[T; 12]>; +//pub type SparseSetIter<'a, T> = SparseSetUIter<'a, [T; 12]>; // No use case yet + +// Implementation: for small, unordered but no dups + +use core::mem::MaybeUninit; +use core::ptr::{read, write}; + +// Types that can be used as the backing store for a SparseSet. +pub trait Array { + // The type of the array's elements. + type Item; + // Returns the number of items the array can hold. + fn size() -> usize; +} +macro_rules! impl_array( + ($($size:expr),+) => { + $( + impl<T> Array for [T; $size] { + type Item = T; + fn size() -> usize { $size } + } + )+ + } +); +impl_array!(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 24, 28, 32); + +// The U here stands for "unordered". It refers to the fact that the elements +// in `Small::arr` are in no particular order, although they are +// duplicate-free. +pub enum SparseSetU<A: Array> { + Large { set: FxHashSet<A::Item> }, + Small { card: usize, arr: MaybeUninit<A> }, +} + +// ================ Admin (private) methods ================ + +impl<A> SparseSetU<A> +where + A: Array + Eq + Ord + Hash + Copy + fmt::Debug, + A::Item: Eq + Ord + Hash + Copy + fmt::Debug, +{ + #[cfg(test)] + fn is_small(&self) -> bool { + match self { + SparseSetU::Large { .. } => false, + SparseSetU::Small { .. } => true, + } + } + #[cfg(test)] + fn is_large(&self) -> bool { + !self.is_small() + } + #[inline(never)] + fn upgrade(&mut self) { + match self { + SparseSetU::Large { .. } => panic!("SparseSetU: upgrade"), + SparseSetU::Small { card, arr } => { + assert!(*card == A::size()); + let mut set = FxHashSet::<A::Item>::default(); + set.reserve(A::size()); + // Could this be done faster? + let arr_p = arr.as_mut_ptr() as *mut A::Item; + for i in 0..*card { + set.insert(unsafe { read(arr_p.add(i)) }); + } + *self = SparseSetU::Large { set } + } + } + } + // A large set is only downgradeable if its card does not exceed this value. 
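+    // For the default inline size of 12 (see `SparseSet<T>` above) the active
+    // two-thirds rule below gives 8, so a Large set is converted back to Small
+    // only once it has shrunk to 8 elements or fewer.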
+ #[inline(always)] + fn small_halfmax_card(&self) -> usize { + let limit = A::size(); + //if limit >= 4 { + // limit / 2 + //} else { + // limit - 1 + //} + if false { + // Set the transition point as roughly half of the inline size + match limit { + 0 | 1 => panic!("SparseSetU: small_halfmax_card"), + 2 => 1, + 3 => 2, + 4 => 2, + 5 => 3, + 6 => 3, + _ => limit / 2, + } + } else { + // Set the transition point as roughly two thirds of the inline size + match limit { + 0 | 1 => panic!("SparseSetU: small_halfmax_card"), + 2 => 1, + 3 => 2, + 4 => 3, + 5 => 4, + 6 => 4, + // FIXME JRS 2020Apr10 avoid possible integer overflow here: + _ => (2 * limit) / 3, + } + } + } + // If we have a large-format set, but the cardinality has fallen below half + // the size of a small format set, convert it to the small format. This + // isn't done at the point when the cardinality falls to the max capacity of + // a small set in order to give some hysteresis -- we don't want to be + // constantly converting back and forth for a set whose size repeatedly + // crosses the border. + #[inline(never)] + fn maybe_downgrade(&mut self) { + let small_halfmax_card = self.small_halfmax_card(); + match self { + SparseSetU::Large { set } => { + if set.len() <= small_halfmax_card { + let mut arr = MaybeUninit::<A>::uninit(); + let arr_p = arr.as_mut_ptr() as *mut A::Item; + let mut i = 0; + for e in set.iter() { + unsafe { write(arr_p.add(i), *e) }; + i += 1; + } + assert!(i <= small_halfmax_card); + *self = SparseSetU::Small { card: i, arr }; + } + } + SparseSetU::Small { .. } => { + panic!("SparseSetU::maybe_downgrade: is already small"); + } + } + } + #[inline(always)] + fn insert_no_dup_check(&mut self, item: A::Item) { + match self { + SparseSetU::Large { set } => { + set.insert(item); + } + SparseSetU::Small { card, arr } => { + assert!(*card <= A::size()); + if *card < A::size() { + // Stay small + let arr_p = arr.as_mut_ptr() as *mut A::Item; + unsafe { + write(arr_p.add(*card), item); + } + *card += 1; + } else { + // Transition up + self.upgrade(); + match self { + SparseSetU::Large { set } => { + let _ = set.insert(item); + } + SparseSetU::Small { .. } => { + // Err, what? Still Small after upgrade? + panic!("SparseSetU::insert_no_dup_check") + } + } + } + } + } + } +} +#[inline(always)] +fn small_contains<A>(card: usize, arr: &MaybeUninit<A>, item: A::Item) -> bool +where + A: Array, + A::Item: Eq, +{ + let arr_p = arr.as_ptr() as *const A::Item; + for i in 0..card { + if unsafe { read(arr_p.add(i)) } == item { + return true; + } + } + false +} + +// ================ Public methods ================ + +impl<A> SparseSetU<A> +where + A: Array + Eq + Ord + Hash + Copy + fmt::Debug, + A::Item: Eq + Ord + Hash + Copy + fmt::Debug, +{ + #[inline(always)] + pub fn empty() -> Self { + SparseSetU::Small { + card: 0, + arr: MaybeUninit::uninit(), + } + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + match self { + SparseSetU::Small { card, .. } => *card == 0, + SparseSetU::Large { set } => { + // This holds because `maybe_downgrade` will always convert a + // zero-sized large variant into a small variant. + assert!(set.len() > 0); + false + } + } + } + + #[inline(never)] + pub fn card(&self) -> usize { + match self { + SparseSetU::Large { set } => set.len(), + SparseSetU::Small { card, .. 
} => *card, + } + } + + #[inline(never)] + pub fn insert(&mut self, item: A::Item) { + match self { + SparseSetU::Large { set } => { + set.insert(item); + } + SparseSetU::Small { card, arr } => { + assert!(*card <= A::size()); + // Do we already have it? + if small_contains(*card, arr, item) { + return; + } + // No. + let arr_p = arr.as_mut_ptr() as *mut A::Item; + if *card < A::size() { + // Stay small + unsafe { + write(arr_p.add(*card), item); + } + *card += 1; + } else { + // Transition up + self.upgrade(); + self.insert(item); + } + } + } + } + + #[inline(always)] + pub fn contains(&self, item: A::Item) -> bool { + match self { + SparseSetU::Large { set } => set.contains(&item), + SparseSetU::Small { card, arr } => small_contains(*card, arr, item), + } + } + + #[inline(never)] + pub fn union(&mut self, other: &Self) { + match self { + SparseSetU::Large { set: set1 } => match other { + SparseSetU::Large { set: set2 } => { + for item in set2.iter() { + set1.insert(*item); + } + } + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + let arr2_p = arr2.as_ptr() as *const A::Item; + for i in 0..*card2 { + let item = unsafe { read(arr2_p.add(i)) }; + set1.insert(item); + } + } + }, + SparseSetU::Small { + card: card1, + arr: arr1, + } => { + let arr1_p = arr1.as_mut_ptr() as *mut A::Item; + match other { + SparseSetU::Large { set: set2 } => { + let mut set2c = set2.clone(); + for i in 0..*card1 { + let item = unsafe { read(arr1_p.add(i)) }; + set2c.insert(item); + } + *self = SparseSetU::Large { set: set2c }; + } + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + let mut extras: MaybeUninit<A> = MaybeUninit::uninit(); + let mut n_extras = 0; + let extras_p = extras.as_mut_ptr() as *mut A::Item; + let arr2_p = arr2.as_ptr() as *const A::Item; + // Iterate through the second set. Add every item not in the + // first set to `extras`. + for i in 0..*card2 { + let item2 = unsafe { read(arr2_p.add(i)) }; + let mut in1 = false; + for j in 0..*card1 { + let item1 = unsafe { read(arr1_p.add(j)) }; + if item1 == item2 { + in1 = true; + break; + } + } + if !in1 { + debug_assert!(n_extras < A::size()); + unsafe { + write(extras_p.add(n_extras), item2); + } + n_extras += 1; + } + } + // The result is the concatenation of arr1 and extras. + for i in 0..n_extras { + let item = unsafe { read(extras_p.add(i)) }; + self.insert_no_dup_check(item); + } + } + } + } + } + } + + #[inline(never)] + pub fn remove(&mut self, other: &Self) { + match self { + SparseSetU::Large { set: set1 } => { + match other { + SparseSetU::Large { set: set2 } => { + for item in set2.iter() { + set1.remove(item); + } + } + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + let arr2_p = arr2.as_ptr() as *const A::Item; + for i in 0..*card2 { + let item = unsafe { read(arr2_p.add(i)) }; + set1.remove(&item); + } + } + } + self.maybe_downgrade(); + } + SparseSetU::Small { + card: card1, + arr: arr1, + } => { + let arr1_p = arr1.as_mut_ptr() as *mut A::Item; + match other { + SparseSetU::Large { set: set2 } => { + let mut w = 0; + for r in 0..*card1 { + let item = unsafe { read(arr1_p.add(r)) }; + let is_in2 = set2.contains(&item); + if !is_in2 { + // Keep it. 
+ if r != w { + unsafe { + write(arr1_p.add(w), item); + } + } + w += 1; + } + } + *card1 = w; + } + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + let arr2_p = arr2.as_ptr() as *const A::Item; + let mut w = 0; + for r in 0..*card1 { + let item = unsafe { read(arr1_p.add(r)) }; + let mut is_in2 = false; + for i in 0..*card2 { + if unsafe { read(arr2_p.add(i)) } == item { + is_in2 = true; + break; + } + } + if !is_in2 { + // Keep it. + if r != w { + unsafe { + write(arr1_p.add(w), item); + } + } + w += 1; + } + } + *card1 = w; + } + } + } + } + } + + // return true if `self` is a subset of `other` + #[inline(never)] + pub fn is_subset_of(&self, other: &Self) -> bool { + if self.card() > other.card() { + return false; + } + // Visit all items in `self`, and see if they are in `other`. If so + // return true. + match self { + SparseSetU::Large { set: set1 } => match other { + SparseSetU::Large { set: set2 } => set1.is_subset(set2), + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + for item in set1.iter() { + if !small_contains(*card2, arr2, *item) { + return false; + } + } + true + } + }, + SparseSetU::Small { + card: card1, + arr: arr1, + } => { + let arr1_p = arr1.as_ptr() as *const A::Item; + match other { + SparseSetU::Large { set: set2 } => { + for i in 0..*card1 { + let item = unsafe { read(arr1_p.add(i)) }; + if !set2.contains(&item) { + return false; + } + } + true + } + SparseSetU::Small { + card: card2, + arr: arr2, + } => { + for i in 0..*card1 { + let item = unsafe { read(arr1_p.add(i)) }; + if !small_contains(*card2, arr2, item) { + return false; + } + } + true + } + } + } + } + } + + #[inline(never)] + pub fn to_vec(&self) -> Vec<A::Item> { + let mut res = Vec::<A::Item>::new(); + match self { + SparseSetU::Large { set } => { + for item in set.iter() { + res.push(*item); + } + } + SparseSetU::Small { card, arr } => { + let arr_p = arr.as_ptr() as *const A::Item; + for i in 0..*card { + res.push(unsafe { read(arr_p.add(i)) }); + } + } + } + // Don't delete this. It is important. + res.sort_unstable(); + res + } + + #[inline(never)] + pub fn from_vec(vec: Vec<A::Item>) -> Self { + let vec_len = vec.len(); + if vec_len <= A::size() { + let mut card = 0; + let mut arr: MaybeUninit<A> = MaybeUninit::uninit(); + for i in 0..vec_len { + let item = vec[i]; + if small_contains(card, &arr, item) { + continue; + } + let arr_p = arr.as_mut_ptr() as *mut A::Item; + unsafe { write(arr_p.add(card), item) } + card += 1; + } + SparseSetU::Small { card, arr } + } else { + let mut set = FxHashSet::<A::Item>::default(); + for i in 0..vec_len { + set.insert(vec[i]); + } + SparseSetU::Large { set } + } + } + + #[inline(never)] + pub fn equals(&self, other: &Self) -> bool { + if self.card() != other.card() { + return false; + } + match (self, other) { + (SparseSetU::Large { set: set1 }, SparseSetU::Large { set: set2 }) => set1 == set2, + ( + SparseSetU::Small { + card: card1, + arr: arr1, + }, + SparseSetU::Small { + card: card2, + arr: arr2, + }, + ) => { + assert!(*card1 == *card2); + // Check to see that all items in arr1 are present in arr2. Since the + // arrays have the same length and are duplicate free, although + // unordered, this is a sufficient equality test. 
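+                // (This check is O(card1 * card2), which is fine: both cards are
+                // bounded by the inline capacity `A::size()`, since anything larger
+                // uses the Large variant.)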
+ let arr1_p = arr1.as_ptr() as *const A::Item; + let arr2_p = arr2.as_ptr() as *const A::Item; + for i1 in 0..*card1 { + let item1 = unsafe { read(arr1_p.add(i1)) }; + let mut found1 = false; + for i2 in 0..*card2 { + let item2 = unsafe { read(arr2_p.add(i2)) }; + if item1 == item2 { + found1 = true; + break; + } + } + if !found1 { + return false; + } + } + true + } + (SparseSetU::Small { card, arr }, SparseSetU::Large { set }) + | (SparseSetU::Large { set }, SparseSetU::Small { card, arr }) => { + // Same rationale as above as to why this is a sufficient test. + let arr_p = arr.as_ptr() as *const A::Item; + for i in 0..*card { + let item = unsafe { read(arr_p.add(i)) }; + if !set.contains(&item) { + return false; + } + } + true + } + } + } +} + +impl<A> fmt::Debug for SparseSetU<A> +where + A: Array + Eq + Ord + Hash + Copy + fmt::Debug, + A::Item: Eq + Ord + Hash + Copy + fmt::Debug, +{ + #[inline(never)] + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + // Print the elements in some way which depends only on what is + // present in the set, and not on any other factor. In particular, + // <Debug for FxHashSet> has been observed to to print the elements + // of a two element set in both orders on different occasions. + let sorted_vec = self.to_vec(); + let mut s = "{".to_string(); + for i in 0..sorted_vec.len() { + if i > 0 { + s = s + &", ".to_string(); + } + s = s + &format!("{:?}", &sorted_vec[i]); + } + s = s + &"}".to_string(); + write!(fmt, "{}", s) + } +} + +impl<A> Clone for SparseSetU<A> +where + A: Array + Eq + Ord + Hash + Copy + Clone + fmt::Debug, + A::Item: Eq + Ord + Hash + Copy + Clone + fmt::Debug, +{ + #[inline(never)] + fn clone(&self) -> Self { + match self { + SparseSetU::Large { set } => SparseSetU::Large { set: set.clone() }, + SparseSetU::Small { card, arr } => { + let arr2 = arr.clone(); + SparseSetU::Small { + card: *card, + arr: arr2, + } + } + } + } +} + +pub enum SparseSetUIter<'a, A: Array> { + Large { + set_iter: std::collections::hash_set::Iter<'a, A::Item>, + }, + Small { + card: usize, + arr: &'a MaybeUninit<A>, + next: usize, + }, +} +impl<A: Array> SparseSetU<A> { + pub fn iter(&self) -> SparseSetUIter<A> { + match self { + SparseSetU::Large { set } => SparseSetUIter::Large { + set_iter: set.iter(), + }, + SparseSetU::Small { card, arr } => SparseSetUIter::Small { + card: *card, + arr, + next: 0, + }, + } + } +} +impl<'a, A: Array> Iterator for SparseSetUIter<'a, A> { + type Item = &'a A::Item; + fn next(&mut self) -> Option<Self::Item> { + match self { + SparseSetUIter::Large { set_iter } => set_iter.next(), + SparseSetUIter::Small { card, arr, next } => { + if next < card { + let arr_p = arr.as_ptr() as *const A::Item; + let item_p = unsafe { arr_p.add(*next) }; + *next += 1; + Some(unsafe { &*item_p }) + } else { + None + } + } + } + } +} + +// ================ Testing machinery for SparseSetU ================ + +#[cfg(test)] +mod sparse_set_test_utils { + // As currently set up, each number (from rand, not rand_base) has a 1-in-4 + // chance of being a dup of the last 8 numbers produced. 
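+    // (`rand_base` below is the classic ANSI-C style LCG, seed = seed * 1103515245
+    // + 12345; `rand` then recycles one of the last 8 outputs about a quarter of
+    // the time so that duplicate handling actually gets exercised.)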
+ pub struct RNGwithDups { + seed: u32, + circ: [u32; 8], + circC: usize, // the cursor for `circ` + } + impl RNGwithDups { + pub fn new() -> Self { + Self { + seed: 0, + circ: [0; 8], + circC: 0, + } + } + fn rand_base(&mut self) -> u32 { + self.seed = self.seed.wrapping_mul(1103515245).wrapping_add(12345); + self.seed + } + pub fn rand(&mut self) -> u32 { + let r = self.rand_base(); + let rlo = r & 0xFFFF; + let rhi = (r >> 16) & 0xFF; + if rhi < 64 { + self.circ[(rhi % 8) as usize] + } else { + self.circ[self.circC as usize] = rlo; + self.circC += 1; + if self.circC == 8 { + self.circC = 0 + }; + rlo + } + } + pub fn rand_vec(&mut self, len: usize) -> Vec<u32> { + let mut res = vec![]; + for _ in 0..len { + res.push(self.rand()); + } + res + } + } +} + +#[test] +fn test_sparse_set() { + use crate::data_structures::Set; + let mut set = SparseSetU::<[u32; 3]>::empty(); + assert!(set.is_small()); + set.insert(3); + assert!(set.is_small()); + set.insert(1); + assert!(set.is_small()); + set.insert(4); + assert!(set.is_small()); + set.insert(7); + assert!(set.is_large()); + + let iters = 20; + let mut rng = sparse_set_test_utils::RNGwithDups::new(); + + // empty + { + let spa = SparseSetU::<[u32; 10]>::empty(); + assert!(spa.card() == 0); + } + + // card, is_empty + for _ in 0..iters * 3 { + for n1 in 0..100 { + let size1 = n1 % 25; + let vec1a = rng.rand_vec(size1); + let vec1b = vec1a.clone(); // This is very stupid. + let spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let std1 = Set::<u32>::from_vec(vec1b); + assert!(spa1.card() == std1.card()); + assert!(spa1.is_empty() == (size1 == 0)); + } + } + + // insert + for _ in 0..iters * 3 { + for n1 in 0..100 { + let size1 = n1 % 25; + let vec1a = rng.rand_vec(size1); + let vec1b = vec1a.clone(); + let tmp = if size1 == 0 { 0 } else { vec1a[0] }; + let mut spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let mut std1 = Set::<u32>::from_vec(vec1b); + // Insert an item which is almost certainly not in the set. + let n = rng.rand(); + spa1.insert(n); + std1.insert(n); + assert!(spa1.card() == std1.card()); + assert!(spa1.to_vec() == std1.to_vec()); + // Insert an item which is already in the set. + if n1 > 0 { + spa1.insert(tmp); + std1.insert(tmp); + assert!(spa1.card() == std1.card()); + assert!(spa1.to_vec() == std1.to_vec()); + } + } + } + + // contains + for _ in 0..iters * 2 { + for n1 in 0..100 { + let size1 = n1 % 25; + let vec1a = rng.rand_vec(size1); + let vec1b = vec1a.clone(); + let tmp = if size1 == 0 { 0 } else { vec1a[0] }; + let spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let std1 = Set::<u32>::from_vec(vec1b); + // Check for an item which is almost certainly not in the set. + let n = rng.rand(); + assert!(spa1.contains(n) == std1.contains(n)); + // Check for an item which is already in the set. 
+ if n1 > 0 { + assert!(spa1.contains(tmp) == std1.contains(tmp)); + } + } + } + + // union + for _ in 0..iters * 2 { + for size1 in 0..25 { + for size2 in 0..25 { + let vec1a = rng.rand_vec(size1); + let vec2a = rng.rand_vec(size2); + let vec1b = vec1a.clone(); + let vec2b = vec2a.clone(); + let mut spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let spa2 = SparseSetU::<[u32; 10]>::from_vec(vec2a); + let mut std1 = Set::<u32>::from_vec(vec1b); + let std2 = Set::<u32>::from_vec(vec2b); + spa1.union(&spa2); + std1.union(&std2); + assert!(spa1.to_vec() == std1.to_vec()); + } + } + } + + // remove + for _ in 0..iters * 2 { + for size1 in 0..25 { + for size2 in 0..25 { + let vec1a = rng.rand_vec(size1); + let vec2a = rng.rand_vec(size2); + let vec1b = vec1a.clone(); + let vec2b = vec2a.clone(); + let mut spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let spa2 = SparseSetU::<[u32; 10]>::from_vec(vec2a); + let mut std1 = Set::<u32>::from_vec(vec1b); + let std2 = Set::<u32>::from_vec(vec2b); + spa1.remove(&spa2); + std1.remove(&std2); + assert!(spa1.to_vec() == std1.to_vec()); + } + } + } + + // is_subset_of + for _ in 0..iters * 2 { + for size1 in 0..25 { + for size2 in 0..25 { + let vec1a = rng.rand_vec(size1); + let vec2a = rng.rand_vec(size2); + let vec1b = vec1a.clone(); + let vec2b = vec2a.clone(); + let spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let spa2 = SparseSetU::<[u32; 10]>::from_vec(vec2a); + let std1 = Set::<u32>::from_vec(vec1b); + let std2 = Set::<u32>::from_vec(vec2b); + assert!(spa1.is_subset_of(&spa2) == std1.is_subset_of(&std2)); + } + } + } + + // to_vec and from_vec are implicitly tested by the above; there's no way + // they could be wrong and still have the above tests succeed. + // (Famous last words!) + + // equals + for _ in 0..iters * 2 { + for size1 in 0..25 { + for size2 in 0..25 { + let vec1a = rng.rand_vec(size1); + let vec2a = rng.rand_vec(size2); + let vec1b = vec1a.clone(); + let vec2b = vec2a.clone(); + let spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let spa2 = SparseSetU::<[u32; 10]>::from_vec(vec2a); + let std1 = Set::<u32>::from_vec(vec1b); + let std2 = Set::<u32>::from_vec(vec2b); + assert!(std1.equals(&std1)); // obviously + assert!(std2.equals(&std2)); // obviously + assert!(spa1.equals(&spa1)); // obviously + assert!(spa2.equals(&spa2)); // obviously + // More seriously + assert!(spa1.equals(&spa2) == std1.equals(&std2)); + } + } + } + + // clone + for _ in 0..iters * 3 { + for n1 in 0..100 { + let size1 = n1 % 25; + let vec1a = rng.rand_vec(size1); + let spa1 = SparseSetU::<[u32; 10]>::from_vec(vec1a); + let spa2 = spa1.clone(); + assert!(spa1.equals(&spa2)); + } + } +} diff --git a/third_party/rust/regalloc/src/union_find.rs b/third_party/rust/regalloc/src/union_find.rs new file mode 100644 index 0000000000..bb5347abe3 --- /dev/null +++ b/third_party/rust/regalloc/src/union_find.rs @@ -0,0 +1,749 @@ +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] + +//! An implementation of a fast union-find implementation for "T: ToFromU32" items +//! in some dense range [0, N-1]. + +use std::marker::PhantomData; + +//============================================================================= +// ToFromU32 + +// First, we need this. You can store anything you like in this union-find +// mechanism, so long as it is really a u32. Reminds me of that old joke +// about the Model T Ford being available in any colour you want, so long as +// it is black. 
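+//
+// A hypothetical dense index newtype (not part of this crate) would implement
+// the trait declared below as:
+//
+//     struct MyIx(u32);
+//     impl ToFromU32 for MyIx {
+//         fn to_u32(x: MyIx) -> u32 { x.0 }
+//         fn from_u32(x: u32) -> MyIx { MyIx(x) }
+//     }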
+pub trait ToFromU32<T: Sized = Self> { + fn to_u32(x: Self) -> u32; + fn from_u32(x: u32) -> Self; +} +//impl ToFromU32 for i32 { +// fn to_u32(x: i32) -> u32 { +// x as u32 +// } +// fn from_u32(x: u32) -> i32 { +// x as i32 +// } +//} +impl ToFromU32 for u32 { + fn to_u32(x: u32) -> u32 { + x + } + fn from_u32(x: u32) -> u32 { + x + } +} + +//============================================================================= +// UnionFind + +// This is a fast union-find implementation for "T: ToFromU32" items in some +// dense range [0, N-1]. The allowed operations are: +// +// (1) create a new `UnionFind`er +// +// (2) mark two elements as being in the same equivalence class +// +// (3) get the equivalence classes wrapped up in an opaque structure +// `UnionFindEquivClasses`, which makes it possible to cheaply find and +// iterate through the equivalence class of any item. +// +// (4) get an iterator over the "equivalence class leaders". Iterating this +// produces one value from each equivalence class. By presenting each of +// these to (3), it is possible to enumerate all the equivalence classes +// exactly once. +// +// `UnionFind` and the operations `union` and `find` are loosely based on the +// discussion in Chapter 8 of "Data Structures and Algorithm Analysis in C" +// (Mark Allen Weiss, 1993). `UnionFindEquivClasses` and the algorithm to +// construct it is home-grown; although I'm sure the same idea has been +// implemented many times before. + +pub struct UnionFind<T: ToFromU32> { + // These are the trees that we are building. A value that is negative means + // that this node is a tree root, and the negation of its value is the size + // of the tree. A value that is positive (which must be in the range [0, + // N-1]) indicates that this tree is a subtree and that its parent has the + // given index. + // + // One consequence of this representation is that at most 2^31-1 values can + // be supported. Doesn't seem like much of a limitation in practice, given + // that all of this allocator's data structures are limited to 2^32 entries. + /*priv*/ + parent_or_size: Vec<i32>, + + // Keep the typechecker happy + /*priv*/ + anchor: PhantomData<T>, +} + +/*priv*/ +const UF_MAX_SIZE: u32 = 0x7FFF_FFF0; + +impl<T: ToFromU32> UnionFind<T> { + pub fn new(size: usize) -> Self { + // Test a slightly conservative limit to avoid any off-by-one errors. + if size > UF_MAX_SIZE as usize { + panic!("UnionFind::new: too many elements; max = 2^31 - 16."); + } + let mut parent_or_size = Vec::<i32>::new(); + parent_or_size.resize(size, -1); + Self { + parent_or_size, + anchor: PhantomData, + } + } + + // Find, with path compression. Returns the index of tree root for the + // given element. This is not for external use. There's no boundary + // checking since Rust will do that anyway. + // + // This was initially implemented using a recursive function. However, + // this function gets called a lot, and the recursion led to significant + // expense. Attempts to replace the recursion with an explicit stack + // didn't give much speedup. Hence the following scheme, which retains + // the recursion but unrolls the function. To avoid performance problems + // caused by the interaction of inlining and recursion, it is split into + // two functions: `find` and `find_slow`. + // + // This is the main function. It is hot, so it is unrolled 4 times. If + // those 4 iterations don't complete the traversal back to the root, it + // calls onwards to `find_slow`, which recurses. 
The idea is that `find` + // handles the majority of the cases and can always be inlined, and we + // hand off the remaining cases to `find_slow` which will never be inlined + // (and hence will not interfere with the inlining of this function). + // + // As a reminder of the comments above: + // + // * A `parent_or_size` value that is negative means that this node is a + // tree root. + // + // * A `parent_or_size` that is non-negative indicates that this tree is a + // subtree, and its parent has the given index in `parent_or_size`. + #[inline(always)] + fn find(&mut self, elem0: u32) -> u32 { + // Handle up to 4 steps up the tree in-line. + let elem0_parent_or_size: i32 = self.parent_or_size[elem0 as usize]; + if elem0_parent_or_size < 0 { + // We're at a tree root. + return elem0; + } + + let elem1 = elem0_parent_or_size as u32; + let elem1_parent_or_size: i32 = self.parent_or_size[elem1 as usize]; + if elem1_parent_or_size < 0 { + self.parent_or_size[elem0 as usize] = elem1 as i32; + return elem1; + } + + let elem2 = elem1_parent_or_size as u32; + let elem2_parent_or_size: i32 = self.parent_or_size[elem2 as usize]; + if elem2_parent_or_size < 0 { + self.parent_or_size[elem1 as usize] = elem2 as i32; + self.parent_or_size[elem0 as usize] = elem2 as i32; + return elem2; + } + + let elem3 = elem2_parent_or_size as u32; + let elem3_parent_or_size: i32 = self.parent_or_size[elem3 as usize]; + if elem3_parent_or_size < 0 { + self.parent_or_size[elem2 as usize] = elem3 as i32; + self.parent_or_size[elem1 as usize] = elem3 as i32; + self.parent_or_size[elem0 as usize] = elem3 as i32; + return elem3; + } + + // Hand off to `find_slow` to deal with all the remaining steps. + let elem4 = elem3_parent_or_size as u32; + let root = self.find_slow(elem4); + assert!(root < UF_MAX_SIZE); + self.parent_or_size[elem3 as usize] = root as i32; + self.parent_or_size[elem2 as usize] = root as i32; + self.parent_or_size[elem1 as usize] = root as i32; + self.parent_or_size[elem0 as usize] = root as i32; + return root; + } + + // This is the same as `find`, except with unroll factor of 2 rather than + // 4, and self-recursive. Don't call it directly. It is intended only as + // a fallback for `find`. + #[inline(never)] + fn find_slow(&mut self, elem0: u32) -> u32 { + // Recurse up to the root. On the way back out, make all nodes point + // directly at the root index. + + let elem0_parent_or_size: i32 = self.parent_or_size[elem0 as usize]; + if elem0_parent_or_size < 0 { + // We're at a tree root. + return elem0; + } + + let elem1 = elem0_parent_or_size as u32; + let elem1_parent_or_size: i32 = self.parent_or_size[elem1 as usize]; + if elem1_parent_or_size < 0 { + self.parent_or_size[elem0 as usize] = elem1 as i32; + return elem1; + } + + let elem2 = elem1_parent_or_size as u32; + let root = self.find_slow(elem2); + assert!(root < UF_MAX_SIZE); + self.parent_or_size[elem1 as usize] = root as i32; + self.parent_or_size[elem0 as usize] = root as i32; + return root; + } + + // Union, by size (weight). This is publicly visible. + pub fn union(&mut self, elem1t: T, elem2t: T) { + let elem1 = ToFromU32::to_u32(elem1t); + let elem2 = ToFromU32::to_u32(elem2t); + if elem1 == elem2 { + // Ideally, we'd alert the callers they're mistakenly do `union` on + // identical values repeatedly, but fuzzing hits this repeatedly. + return; + } + let root1: u32 = self.find(elem1); + let root2: u32 = self.find(elem2); + if root1 == root2 { + // `elem1` and `elem2` are already in the same tree. Do nothing. 
+ return; + } + let size1: i32 = self.parent_or_size[root1 as usize]; + let size2: i32 = self.parent_or_size[root2 as usize]; + // "They are both roots" + assert!(size1 < 0 && size2 < 0); + // Make the root of the smaller tree point at the root of the bigger tree. + // Update the root of the bigger tree to reflect its increased size. That + // only requires adding the two `size` values, since they are both + // negative, so adding them will (correctly) drive it more negative. + if size1 < size2 { + self.parent_or_size[root1 as usize] = root2 as i32; + self.parent_or_size[root2 as usize] += size1; + } else { + self.parent_or_size[root2 as usize] = root1 as i32; + self.parent_or_size[root1 as usize] += size2; + } + } +} + +//============================================================================= +// UnionFindEquivClasses + +// This is a compact representation for all the equivalence classes in a +// `UnionFind`, that can be constructed in more-or-less linear time (meaning, +// O(universe size), and allows iteration over the elements of each +// equivalence class in time linear in the size of the equivalence class (you +// can't ask for better). It doesn't support queries of the form "are these +// two elements in the same equivalence class" in linear time, but we don't +// care about that. What we care about is being able to find and visit the +// equivalence class of an element quickly. +// +// The fields are non-public. What is publically available is the ability to +// get an iterator (for the equivalence class elements), given a starting +// element. + +/*priv*/ +const UFEC_NULL: u32 = 0xFFFF_FFFF; + +/*priv*/ +#[derive(Clone)] +struct LLElem { + // This list element + elem: u32, + // Pointer to the rest of the list (index in `llelems`), or UFEC_NULL. + tail: u32, +} + +pub struct UnionFindEquivClasses<T: ToFromU32> { + // Linked list start "pointers". Has .len() == universe size. Entries must + // not be UFEC_NULL since each element is at least a member of its own + // equivalence class. + /*priv*/ + heads: Vec<u32>, + + // Linked list elements. Has .len() == universe size. + /*priv*/ + lists: Vec<LLElem>, + + // Keep the typechecker happy + /*priv*/ + anchor: PhantomData<T>, + // This struct doesn't have a `new` method since construction is done by a + // carefully designed algorithm, `UnionFind::get_equiv_classes`. +} + +impl<T: ToFromU32> UnionFind<T> { + // This requires mutable `self` because it needs to do a bunch of `find` + // operations, and those modify `self` in order to perform path compression. + // We could avoid this by using a non-path-compressing `find` operation, but + // that could have the serious side effect of making the big-O complexity of + // `get_equiv_classes` worse. Hence we play safe and accept the mutability + // requirement. + pub fn get_equiv_classes(&mut self) -> UnionFindEquivClasses<T> { + let nElemsUSize = self.parent_or_size.len(); + // The construction algorithm requires that all elements have a value + // strictly less than 2^31. The union-find machinery, that builds + // `parent_or_size` that we read here, however relies on a slightly + // tighter bound, which which we reiterate here due to general paranoia: + assert!(nElemsUSize < UF_MAX_SIZE as usize); + let nElems = nElemsUSize as u32; + + // Avoid reallocation; we know how big these need to be. 
+ let mut heads = Vec::<u32>::new(); + heads.resize(nElems as usize, UFEC_NULL); // all invalid + + let mut lists = Vec::<LLElem>::new(); + lists.resize( + nElems as usize, + LLElem { + elem: 0, + tail: UFEC_NULL, + }, + ); + + // As explanation, let there be N elements (`nElems`) which have been + // partitioned into M <= N equivalence classes by calls to `union`. + // + // When we are finished, `lists` will contain M independent linked lists, + // each of which represents one equivalence class, and which is terminated + // by UFEC_NULL. And `heads` is used to point to the starting point of + // each elem's equivalence class, as follows: + // + // * if heads[elem][bit 31] == 1, then heads[i][bits 30:0] contain the + // index in lists[] of the first element in `elem`s equivalence class. + // + // * if heads[elem][bit 31] == 0, then heads[i][bits 30:0] contain tell us + // what `elem`s equivalence class leader is. That is, heads[i][bits + // 30:0] tells us the index in `heads` of the entry that contains the + // first element in `elem`s equivalence class. + // + // With this arrangement, we can: + // + // * detect whether `elem` is an equivalence class leader, by inspecting + // heads[elem][bit 31] + // + // * find the start of `elem`s equivalence class list, either by using + // heads[elem][bits 30:0] directly if heads[elem][bit 31] == 1, or + // using a single indirection if heads[elem][bit 31] == 0. + // + // For a universe of size N, this makes it possible to: + // + // * find the equivalence class list of any elem in O(1) time. + // + // * find and iterate through any single equivalence class in time O(1) + + // O(size of the equivalence class). + // + // * find all the equivalence class headers in O(N) time. + // + // * find all the equivalence class headers, and then iterate through each + // equivalence class exactly once, in time k1.O(N) + k2.O(N). The first + // term is the cost of finding all the headers. The second term is the + // cost of visiting all elements of each equivalence class exactly once. + // + // The construction algorithm requires two forward passes over + // `parent_or_size`. + // + // In the first pass, we visit each element. If a element is a tree root, + // its `heads` entry is left at UFEC_NULL. If a element isn't a tree + // root, we use `find` to find the root element, and set + // `heads[elem][30:0]` to be the tree root, and heads[elem][31] to 0. + // Hence, after the first pass, `heads` maps each non-root element to its + // equivalence class leader. + // + // The second pass builds the lists. We again visit each element. If a + // element is a tree root, it is added as a list element, and its `heads` + // entry is updated to point at the list element. If a element isn't a + // tree root, we find its root in constant time by inspecting its `head` + // entry. The element is added to the the root element's list, and the + // root element's `head` entry is accordingly updated. Hence, after the + // second pass, the `head` entry for root elements points to a linked list + // that contains all elements in that tree. And the `head` entry for + // non-root elements is unchanged from the first pass, that is, it points + // to the `head` entry for that element's root element. + // + // Note that the heads[] entry for any class leader (tree root) can never + // be UFEC_NULL, since all elements must at least be in an equivalence + // class of size 1. Hence there is no confusion possible resulting from + // using the heads bit 31 entries as a direct/indirect flag. 
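+        //
+        // Tiny worked example: for a universe {0, 1, 2} after a single union(0, 2),
+        // the two passes below produce
+        //
+        //   heads = [ 2|0x8000_0000, 1|0x8000_0000, 0 ]  // 0 and 1 are leaders; 2 defers to 0
+        //   lists = [ (0, NULL), (1, NULL), (2, tail = 0) ]
+        //
+        // so iterating 0's class walks lists[2] then lists[0] and yields {2, 0},
+        // while 1's class is just {1}.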
+
+ // First pass
+ for i in 0..nElems {
+ if self.parent_or_size[i as usize] >= 0 {
+ // i is non-root
+ let root_i: u32 = self.find(i);
+ assert!(root_i < 0x8000_0000u32);
+ heads[i as usize] = root_i; // .direct flag == 0
+ }
+ }
+
+ // Second pass
+ let mut list_bump = 0u32;
+ for i in 0..nElems {
+ if self.parent_or_size[i as usize] < 0 {
+ // i is root
+ lists[list_bump as usize] = LLElem {
+ elem: i,
+ tail: if heads[i as usize] == UFEC_NULL {
+ UFEC_NULL
+ } else {
+ heads[i as usize] & 0x7FFF_FFFF
+ },
+ };
+ assert!(list_bump < 0x8000_0000u32);
+ heads[i as usize] = list_bump | 0x8000_0000u32; // .direct flag == 1
+ list_bump += 1;
+ } else {
+ // i is non-root
+ let i_root = heads[i as usize];
+ lists[list_bump as usize] = LLElem {
+ elem: i,
+ tail: if heads[i_root as usize] == UFEC_NULL {
+ UFEC_NULL
+ } else {
+ heads[i_root as usize] & 0x7FFF_FFFF
+ },
+ };
+ assert!(list_bump < 0x8000_0000u32);
+ heads[i_root as usize] = list_bump | 0x8000_0000u32; // .direct flag == 1
+ list_bump += 1;
+ }
+ }
+ assert!(list_bump == nElems);
+
+ // It's a wrap!
+ assert!(heads.len() == nElemsUSize);
+ assert!(lists.len() == nElemsUSize);
+ //{
+ // for i in 0 .. heads.len() {
+ // println!("{}: heads {:x} lists.elem {} .tail {:x}", i,
+ // heads[i], lists[i].elem, lists[i].tail);
+ // }
+ //}
+ UnionFindEquivClasses {
+ heads,
+ lists,
+ anchor: PhantomData,
+ }
+ }
+}
+
+impl<T: ToFromU32> UnionFindEquivClasses<T> {
+ // Indicates whether `item1` and `item2` are in the same equivalence
+ // class. If either falls outside the "universe", returns `None`.
+ pub fn in_same_equivalence_class(&self, item1: T, item2: T) -> Option<bool> {
+ let mut item1num = ToFromU32::to_u32(item1) as usize;
+ let mut item2num = ToFromU32::to_u32(item2) as usize;
+ // If either item is outside our "universe", say we don't know.
+ if item1num >= self.heads.len() || item2num >= self.heads.len() {
+ return None;
+ }
+ // Ensure that `item1num` and `item2num` both point at class leaders.
+ if (self.heads[item1num] & 0x8000_0000) == 0 {
+ item1num = self.heads[item1num] as usize;
+ }
+ if (self.heads[item2num] & 0x8000_0000) == 0 {
+ item2num = self.heads[item2num] as usize;
+ }
+ debug_assert!((self.heads[item1num] & 0x8000_0000) == 0x8000_0000);
+ debug_assert!((self.heads[item2num] & 0x8000_0000) == 0x8000_0000);
+ Some(item1num == item2num)
+ }
+}
+
+//=============================================================================
+// UnionFindEquivClassElemsIter
+
+// We may want to find the equivalence class for some given element, and
+// iterate through its elements. This iterator provides that.
+
+pub struct UnionFindEquivClassElemsIter<'a, T: ToFromU32> {
+ // The equivalence classes
+ /*priv*/
+ ufec: &'a UnionFindEquivClasses<T>,
+ // Index into `ufec.lists`, or UFEC_NULL.
+ /*priv*/
+ next: u32,
+}
+
+impl<T: ToFromU32> UnionFindEquivClasses<T> {
+ pub fn equiv_class_elems_iter<'a>(&'a self, item: T) -> UnionFindEquivClassElemsIter<'a, T> {
+ let mut itemU32 = ToFromU32::to_u32(item);
+ assert!((itemU32 as usize) < self.heads.len());
+ if (self.heads[itemU32 as usize] & 0x8000_0000) == 0 {
+ // .direct flag is not set. This is not a class leader. We must
+ // indirect.
+ itemU32 = self.heads[itemU32 as usize];
+ }
+ // Now `itemU32` must point at a class leader.
+ assert!((self.heads[itemU32 as usize] & 0x8000_0000) == 0x8000_0000);
+ let next = self.heads[itemU32 as usize] & 0x7FFF_FFFF;
+ // Now `next` points at the first element in the list.
+ UnionFindEquivClassElemsIter { ufec: &self, next }
+ }
+}
+
+impl<'a, T: ToFromU32> Iterator for UnionFindEquivClassElemsIter<'a, T> {
+ type Item = T;
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.next == UFEC_NULL {
+ None
+ } else {
+ let res: T = ToFromU32::from_u32(self.ufec.lists[self.next as usize].elem);
+ self.next = self.ufec.lists[self.next as usize].tail;
+ Some(res)
+ }
+ }
+}
+
+// In order to visit all equivalence classes exactly once, we need something
+// else: a way to enumerate their leaders (some value arbitrarily drawn from
+// each one). This provides that.
+
+pub struct UnionFindEquivClassLeadersIter<'a, T: ToFromU32> {
+ // The equivalence classes
+ /*priv*/
+ ufec: &'a UnionFindEquivClasses<T>,
+ // Index into `ufec.heads` of the next unvisited item.
+ /*priv*/
+ next: u32,
+}
+
+impl<T: ToFromU32> UnionFindEquivClasses<T> {
+ pub fn equiv_class_leaders_iter<'a>(&'a self) -> UnionFindEquivClassLeadersIter<'a, T> {
+ UnionFindEquivClassLeadersIter {
+ ufec: &self,
+ next: 0,
+ }
+ }
+}
+
+impl<'a, T: ToFromU32> Iterator for UnionFindEquivClassLeadersIter<'a, T> {
+ type Item = T;
+ fn next(&mut self) -> Option<Self::Item> {
+ // Scan forwards through `ufec.heads` to find the next unvisited one which
+ // is a leader (a tree root).
+ loop {
+ if self.next as usize >= self.ufec.heads.len() {
+ return None;
+ }
+ if (self.ufec.heads[self.next as usize] & 0x8000_0000) == 0x8000_0000 {
+ // This is a leader.
+ let res = ToFromU32::from_u32(self.next);
+ self.next += 1;
+ return Some(res);
+ }
+ // No luck, keep on searching.
+ self.next += 1;
+ }
+ /*NOTREACHED*/
+ }
+}
+
+//=============================================================================
+// Testing machinery for UnionFind
+
+#[cfg(test)]
+mod union_find_test_utils {
+ use super::UnionFindEquivClasses;
+ // Test that the eclass for `elem` is `expected` (modulo ordering).
+ pub fn test_eclass(eclasses: &UnionFindEquivClasses<u32>, elem: u32, expected: &Vec<u32>) {
+ let mut expected_sorted = expected.clone();
+ let mut actual = vec![];
+ for ecm in eclasses.equiv_class_elems_iter(elem) {
+ actual.push(ecm);
+ }
+ expected_sorted.sort();
+ actual.sort();
+ assert!(actual == expected_sorted);
+ }
+ // Test that the eclass leaders are exactly `expected`.
+ pub fn test_leaders(
+ univ_size: u32,
+ eclasses: &UnionFindEquivClasses<u32>,
+ expected: &Vec<u32>,
+ ) {
+ let mut actual = vec![];
+ for leader in eclasses.equiv_class_leaders_iter() {
+ actual.push(leader);
+ }
+ assert!(actual == *expected);
+ // Now use the leaders to enumerate each eclass exactly once, and collect
+ // up the elements. The resulting vector should be some permutation of
+ // [0 .. univ_size-1].
+ let mut univ_actual = vec![];
+ for leader in eclasses.equiv_class_leaders_iter() {
+ for elem in eclasses.equiv_class_elems_iter(leader) {
+ univ_actual.push(elem);
+ }
+ }
+ univ_actual.sort();
+ let mut univ_expected = vec![];
+ for i in 0..univ_size {
+ univ_expected.push(i);
+ }
+ assert!(univ_actual == univ_expected);
+ }
+ // Test that `in_same_equivalence_class` produces the expected results.
+ pub fn test_in_same_eclass(
+ eclasses: &UnionFindEquivClasses<u32>,
+ elem1: u32,
+ elem2: u32,
+ expected: Option<bool>,
+ ) {
+ assert!(eclasses.in_same_equivalence_class(elem1, elem2) == expected);
+ assert!(eclasses.in_same_equivalence_class(elem2, elem1) == expected);
+ }
+}
+
+#[test]
+fn test_union_find() {
+ const UNIV_SIZE: u32 = 8;
+ let mut uf = UnionFind::new(UNIV_SIZE as usize);
+ let mut uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![3]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![4]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![6]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 1, 2, 3, 4, 5, 6, 7]);
+
+ uf.union(2, 4);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![4, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![3]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![4, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![6]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 1, 2, 3, 5, 6, 7]);
+
+ uf.union(5, 3);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![4, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![5, 3]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![4, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5, 3]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![6]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 1, 2, 5, 6, 7]);
+
+ uf.union(2, 5);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![6]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 1, 2, 6, 7]);
+ // At this point, also check the "in same equivalence class?" function.
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 0, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 1, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 2, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 3, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 4, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 5, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 1, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 2, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 3, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 4, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 5, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 1, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 2, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 3, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 4, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 5, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 2, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 3, 3, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 3, 4, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 3, 5, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 3, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 3, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 4, 4, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 4, 5, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 4, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 4, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 5, 5, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 5, 6, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 5, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 6, 6, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 6, 7, Some(false));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 7, 7, Some(true));
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 0, 8, None);
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 8, 0, None);
+ union_find_test_utils::test_in_same_eclass(&uf_eclasses, 8, 8, None);
+
+ uf.union(7, 1);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![7, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![6]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7, 1]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 2, 6, 7]);
+
+ uf.union(6, 7);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![7, 6, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![5, 4, 3, 2]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![7, 6, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7, 6, 1]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 2, 6]);
+
+ uf.union(4, 1);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7, 6, 5, 4, 3, 2, 1]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0, 6]);
+
+ uf.union(0, 3);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0]);
+
+ // Pointless, because the classes are already maximal.
+ uf.union(1, 2);
+ uf_eclasses = uf.get_equiv_classes();
+ union_find_test_utils::test_eclass(&uf_eclasses, 0, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 1, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 2, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 3, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 4, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 5, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 6, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_eclass(&uf_eclasses, 7, &vec![7, 6, 5, 4, 3, 2, 1, 0]);
+ union_find_test_utils::test_leaders(UNIV_SIZE, &uf_eclasses, &vec![0]);
+}
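The test above already exercises the whole public surface of this file. For orientation, here is a minimal usage sketch of the same API, illustrative only and not part of the commit; it assumes it lives in the same module (so no imports are needed), that `u32` implements `ToFromU32` as in the test, and the function name `example_usage` is hypothetical.

// Hypothetical usage sketch of the UnionFind / UnionFindEquivClasses API
// shown above; assumes it sits inside this module of the regalloc crate.
fn example_usage() {
  // A universe of 6 elements, each initially in its own equivalence class.
  let mut uf = UnionFind::<u32>::new(6);
  uf.union(0, 1);
  uf.union(1, 2); // classes are now {0,1,2} {3} {4} {5}
  uf.union(4, 5); // classes are now {0,1,2} {3} {4,5}

  // Snapshot the classes. Needs `uf` to be mutable because `find`
  // path-compresses during construction.
  let classes = uf.get_equiv_classes();

  // Constant-time "same class?" queries; None for out-of-universe items.
  assert!(classes.in_same_equivalence_class(0, 2) == Some(true));
  assert!(classes.in_same_equivalence_class(2, 5) == Some(false));
  assert!(classes.in_same_equivalence_class(0, 99) == None);

  // Visit every equivalence class exactly once, via its leader.
  for leader in classes.equiv_class_leaders_iter() {
    let members: Vec<u32> = classes.equiv_class_elems_iter(leader).collect();
    println!("class led by {}: {:?}", leader, members);
  }
}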