diff options
Diffstat (limited to 'vendor/regex-automata/src/dfa/minimize.rs')
-rw-r--r-- | vendor/regex-automata/src/dfa/minimize.rs | 463 |
1 files changed, 463 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/dfa/minimize.rs b/vendor/regex-automata/src/dfa/minimize.rs new file mode 100644 index 0000000..fea925b --- /dev/null +++ b/vendor/regex-automata/src/dfa/minimize.rs @@ -0,0 +1,463 @@ +use core::{cell::RefCell, fmt, mem}; + +use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; + +use crate::{ + dfa::{automaton::Automaton, dense, DEAD}, + util::{ + alphabet, + primitives::{PatternID, StateID}, + }, +}; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. (This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. A prime example +/// of this might be large Unicode classes, which are generated in way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of MinimalAcyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a> { + dfa: &'a mut dense::OwnedDFA, + in_transitions: Vec<Vec<Vec<StateID>>>, + partitions: Vec<StateSet>, + waiting: Vec<StateSet>, +} + +impl<'a> fmt::Debug for Minimizer<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states. A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast. +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet { + ids: Rc<RefCell<Vec<StateID>>>, +} + +impl<'a> Minimizer<'a> { + pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = partitions.clone(); + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let stride2 = self.dfa.stride2(); + let as_state_id = |index: usize| -> StateID { + StateID::new(index << stride2).unwrap() + }; + let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + // This loop is basically Hopcroft's algorithm. Everything else is just + // shuffling data around to fit our representation. + while let Some(set) = self.waiting.pop() { + for b in self.dfa.byte_classes().iter() { + self.find_incoming_to(b, &set, &mut incoming); + // If incoming is empty, then the intersection with any other + // set must also be empty. So 'newparts' just ends up being + // 'self.partitions'. So there's no need to go through the loop + // below. + // + // This actually turns out to be rather large optimization. On + // the order of making minimization 4-5x faster. It's likely + // that the vast majority of all states have very few incoming + // transitions. + if incoming.is_empty() { + continue; + } + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partitioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class. + let mut state_to_part = vec![DEAD; self.dfa.state_len()]; + for p in &self.partitions { + p.iter(|id| state_to_part[as_index(id)] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimals_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; + let mut new_index = 0; + for state in self.dfa.states() { + if state_to_part[as_index(state.id())] == state.id() { + minimal_ids[as_index(state.id())] = as_state_id(new_index); + new_index += 1; + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_index; + // Convenience function for remapping state IDs. This takes an old ID, + // looks up its Hopcroft partition and then maps that to the new ID + // range. + let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_len()).map(as_state_id) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[as_index(id)] != id { + continue; + } + self.dfa.remap_state(id, remap); + self.dfa.swap_states(id, minimal_ids[as_index(id)]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start states, which is now just the minimal ID of + // whatever state the old start state was collapsed into. Also, we + // collect everything before-hand to work around the borrow checker. + // We're already allocating so much that this is probably fine. If this + // turns out to be costly, then I guess add a `starts_mut` iterator. + let starts: Vec<_> = self.dfa.starts().collect(); + for (old_start_id, anchored, start_type) in starts { + self.dfa.set_start_state( + anchored, + start_type, + remap(old_start_id), + ); + } + + // Update the match state pattern ID list for multi-regexes. All we + // need to do is remap the match state IDs. The pattern ID lists are + // always the same as they were since match states with distinct + // pattern ID lists are always considered distinct states. + let mut pmap = BTreeMap::new(); + for (match_id, pattern_ids) in self.dfa.pattern_map() { + let new_id = remap(match_id); + pmap.insert(new_id, pattern_ids); + } + // This unwrap is OK because minimization never increases the number of + // match states or patterns in those match states. Since minimization + // runs after the pattern map has already been set at least once, we + // know that our match states cannot error. + self.dfa.set_pattern_map(&pmap).unwrap(); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state. Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old = self.dfa.special().clone(); + let new = self.dfa.special_mut(); + // ... but only remap if we had match states. + if old.matches() { + new.min_match = StateID::MAX; + new.max_match = StateID::ZERO; + for i in as_index(old.min_match)..=as_index(old.max_match) { + let new_id = remap(as_state_id(i)); + if new_id < new.min_match { + new.min_match = new_id; + } + if new_id > new.max_match { + new.max_match = new_id; + } + } + } + // ... same, but for start states. + if old.starts() { + new.min_start = StateID::MAX; + new.max_start = StateID::ZERO; + for i in as_index(old.min_start)..=as_index(old.max_start) { + let new_id = remap(as_state_id(i)); + if new_id == DEAD { + continue; + } + if new_id < new.min_start { + new.min_start = new_id; + } + if new_id > new.max_start { + new.max_start = new_id; + } + } + if new.max_start == DEAD { + new.min_start = DEAD; + } + } + new.quit_id = remap(new.quit_id); + new.set_max(); + } + + fn find_waiting(&self, set: &StateSet) -> Option<usize> { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: alphabet::Unit, + set: &StateSet, + incoming: &mut StateSet, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in + &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] + { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { + // For match states, we know that two match states with different + // pattern ID lists will *always* be distinct, so we can partition them + // initially based on that. + let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); + let mut is_quit = StateSet::empty(); + let mut no_match = StateSet::empty(); + for state in dfa.states() { + if dfa.is_match_state(state.id()) { + let mut pids = vec![]; + for i in 0..dfa.match_len(state.id()) { + pids.push(dfa.match_pattern(state.id(), i)); + } + matching + .entry(pids) + .or_insert(StateSet::empty()) + .add(state.id()); + } else if dfa.is_quit_state(state.id()) { + is_quit.add(state.id()); + } else { + no_match.add(state.id()); + } + } + + let mut sets: Vec<StateSet> = + matching.into_iter().map(|(_, set)| set).collect(); + sets.push(no_match); + sets.push(is_quit); + sets + } + + fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for state in dfa.states() { + for (b, next) in state.transitions() { + incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); + } + } + incoming + } +} + +impl StateSet { + fn empty() -> StateSet { + StateSet { ids: Rc::new(RefCell::new(vec![])) } + } + + fn add(&mut self, id: StateID) { + self.ids.borrow_mut().push(id); + } + + fn min(&self) -> StateID { + self.ids.borrow()[0] + } + + fn canonicalize(&mut self) { + self.ids.borrow_mut().sort(); + self.ids.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.ids.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.ids.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet { + let ids = self.ids.borrow().iter().cloned().collect(); + StateSet { ids: Rc::new(RefCell::new(ids)) } + } + + fn iter<F: FnMut(StateID)>(&self, mut f: F) { + for &id in self.ids.borrow().iter() { + f(id); + } + } + + fn intersection(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} |