diff options
Diffstat (limited to 'vendor/regex-automata/src/minimize.rs')
-rw-r--r-- | vendor/regex-automata/src/minimize.rs | 373 |
1 files changed, 373 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/minimize.rs b/vendor/regex-automata/src/minimize.rs new file mode 100644 index 000000000..ededa5f66 --- /dev/null +++ b/vendor/regex-automata/src/minimize.rs @@ -0,0 +1,373 @@ +use std::cell::RefCell; +use std::fmt; +use std::mem; +use std::rc::Rc; + +use dense; +use state_id::{dead_id, StateID}; + +type DFARepr<S> = dense::Repr<Vec<S>, S>; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. (This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. A prime example +/// of this might be large Unicode classes, which are generated in way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of MinimalAcyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a, S: 'a> { + dfa: &'a mut DFARepr<S>, + in_transitions: Vec<Vec<Vec<S>>>, + partitions: Vec<StateSet<S>>, + waiting: Vec<StateSet<S>>, +} + +impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states. A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast. +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet<S>(Rc<RefCell<Vec<S>>>); + +impl<'a, S: StateID> Minimizer<'a, S> { + pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = vec![partitions[0].clone()]; + + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + while let Some(set) = self.waiting.pop() { + for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) { + self.find_incoming_to(b, &set, &mut incoming); + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class. + let mut state_to_part = vec![dead_id(); self.dfa.state_count()]; + for p in &self.partitions { + p.iter(|id| state_to_part[id.to_usize()] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimals_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![dead_id(); self.dfa.state_count()]; + let mut new_id = S::from_usize(0); + for (id, _) in self.dfa.states() { + if state_to_part[id.to_usize()] == id { + minimal_ids[id.to_usize()] = new_id; + new_id = S::from_usize(new_id.to_usize() + 1); + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_id.to_usize(); + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_count()).map(S::from_usize) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[id.to_usize()] != id { + continue; + } + for (_, next) in self.dfa.get_state_mut(id).iter_mut() { + *next = minimal_ids[state_to_part[next.to_usize()].to_usize()]; + } + self.dfa.swap_states(id, minimal_ids[id.to_usize()]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start state, which is now just the minimal ID of + // whatever state the old start state was collapsed into. + let old_start = self.dfa.start_state(); + self.dfa.set_start_state( + minimal_ids[state_to_part[old_start.to_usize()].to_usize()], + ); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state. Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old_max = self.dfa.max_match_state(); + self.dfa.set_max_match_state(dead_id()); + for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) { + let part = state_to_part[id.to_usize()]; + let new_id = minimal_ids[part.to_usize()]; + if new_id > self.dfa.max_match_state() { + self.dfa.set_max_match_state(new_id); + } + } + } + + fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: u8, + set: &StateSet<S>, + incoming: &mut StateSet<S>, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in &self.in_transitions[id.to_usize()][b as usize] { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> { + let mut is_match = StateSet::empty(); + let mut no_match = StateSet::empty(); + for (id, _) in dfa.states() { + if dfa.is_match_state(id) { + is_match.add(id); + } else { + no_match.add(id); + } + } + + let mut sets = vec![is_match]; + if !no_match.is_empty() { + sets.push(no_match); + } + sets.sort_by_key(|s| s.len()); + sets + } + + fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for (id, state) in dfa.states() { + for (b, next) in state.transitions() { + incoming[next.to_usize()][b as usize].push(id); + } + } + incoming + } +} + +impl<S: StateID> StateSet<S> { + fn empty() -> StateSet<S> { + StateSet(Rc::new(RefCell::new(vec![]))) + } + + fn add(&mut self, id: S) { + self.0.borrow_mut().push(id); + } + + fn min(&self) -> S { + self.0.borrow()[0] + } + + fn canonicalize(&mut self) { + self.0.borrow_mut().sort(); + self.0.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.0.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.0.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet<S> { + let ids = self.0.borrow().iter().cloned().collect(); + StateSet(Rc::new(RefCell::new(ids))) + } + + fn iter<F: FnMut(S)>(&self, mut f: F) { + for &id in self.0.borrow().iter() { + f(id); + } + } + + fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.0.borrow(), other.0.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.0.borrow(), other.0.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} |