use core::{cell::RefCell, fmt, mem}; use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; use crate::{ dfa::{automaton::Automaton, dense, DEAD}, util::{ alphabet, primitives::{PatternID, StateID}, }, }; /// An implementation of Hopcroft's algorithm for minimizing DFAs. /// /// The algorithm implemented here is mostly taken from Wikipedia: /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm /// /// This code has had some light optimization attention paid to it, /// particularly in the form of reducing allocation as much as possible. /// However, it is still generally slow. Future optimization work should /// probably focus on the bigger picture rather than micro-optimizations. For /// example: /// /// 1. Figure out how to more intelligently create initial partitions. That is, /// Hopcroft's algorithm starts by creating two partitions of DFA states /// that are known to NOT be equivalent: match states and non-match states. /// The algorithm proceeds by progressively refining these partitions into /// smaller partitions. If we could start with more partitions, then we /// could reduce the amount of work that Hopcroft's algorithm needs to do. /// 2. For every partition that we visit, we find all incoming transitions to /// every state in the partition for *every* element in the alphabet. (This /// is why using byte classes can significantly decrease minimization times, /// since byte classes shrink the alphabet.) This is quite costly and there /// is perhaps some redundant work being performed depending on the specific /// states in the set. For example, we might be able to only visit some /// elements of the alphabet based on the transitions. /// 3. Move parts of minimization into determinization. If minimization has /// fewer states to deal with, then it should run faster. A prime example /// of this might be large Unicode classes, which are generated in way that /// can create a lot of redundant states. (Some work has been done on this /// point during NFA compilation via the algorithm described in the /// "Incremental Construction of MinimalAcyclic Finite-State Automata" /// paper.) pub(crate) struct Minimizer<'a> { dfa: &'a mut dense::OwnedDFA, in_transitions: Vec>>, partitions: Vec, waiting: Vec, } impl<'a> fmt::Debug for Minimizer<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Minimizer") .field("dfa", &self.dfa) .field("in_transitions", &self.in_transitions) .field("partitions", &self.partitions) .field("waiting", &self.waiting) .finish() } } /// A set of states. A state set makes up a single partition in Hopcroft's /// algorithm. /// /// It is represented by an ordered set of state identifiers. We use shared /// ownership so that a single state set can be in both the set of partitions /// and in the set of waiting sets simultaneously without an additional /// allocation. Generally, once a state set is built, it becomes immutable. /// /// We use this representation because it avoids the overhead of more /// traditional set data structures (HashSet/BTreeSet), and also because /// computing intersection/subtraction on this representation is especially /// fast. #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] struct StateSet { ids: Rc>>, } impl<'a> Minimizer<'a> { pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { let in_transitions = Minimizer::incoming_transitions(dfa); let partitions = Minimizer::initial_partitions(dfa); let waiting = partitions.clone(); Minimizer { dfa, in_transitions, partitions, waiting } } pub fn run(mut self) { let stride2 = self.dfa.stride2(); let as_state_id = |index: usize| -> StateID { StateID::new(index << stride2).unwrap() }; let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; let mut incoming = StateSet::empty(); let mut scratch1 = StateSet::empty(); let mut scratch2 = StateSet::empty(); let mut newparts = vec![]; // This loop is basically Hopcroft's algorithm. Everything else is just // shuffling data around to fit our representation. while let Some(set) = self.waiting.pop() { for b in self.dfa.byte_classes().iter() { self.find_incoming_to(b, &set, &mut incoming); // If incoming is empty, then the intersection with any other // set must also be empty. So 'newparts' just ends up being // 'self.partitions'. So there's no need to go through the loop // below. // // This actually turns out to be rather large optimization. On // the order of making minimization 4-5x faster. It's likely // that the vast majority of all states have very few incoming // transitions. if incoming.is_empty() { continue; } for p in 0..self.partitions.len() { self.partitions[p].intersection(&incoming, &mut scratch1); if scratch1.is_empty() { newparts.push(self.partitions[p].clone()); continue; } self.partitions[p].subtract(&incoming, &mut scratch2); if scratch2.is_empty() { newparts.push(self.partitions[p].clone()); continue; } let (x, y) = (scratch1.deep_clone(), scratch2.deep_clone()); newparts.push(x.clone()); newparts.push(y.clone()); match self.find_waiting(&self.partitions[p]) { Some(i) => { self.waiting[i] = x; self.waiting.push(y); } None => { if x.len() <= y.len() { self.waiting.push(x); } else { self.waiting.push(y); } } } } newparts = mem::replace(&mut self.partitions, newparts); newparts.clear(); } } // At this point, we now have a minimal partitioning of states, where // each partition is an equivalence class of DFA states. Now we need to // use this partitioning to update the DFA to only contain one state for // each partition. // Create a map from DFA state ID to the representative ID of the // equivalence class to which it belongs. The representative ID of an // equivalence class of states is the minimum ID in that class. let mut state_to_part = vec![DEAD; self.dfa.state_len()]; for p in &self.partitions { p.iter(|id| state_to_part[as_index(id)] = p.min()); } // Generate a new contiguous sequence of IDs for minimal states, and // create a map from equivalence IDs to the new IDs. Thus, the new // minimal ID of *any* state in the unminimized DFA can be obtained // with minimals_ids[state_to_part[old_id]]. let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; let mut new_index = 0; for state in self.dfa.states() { if state_to_part[as_index(state.id())] == state.id() { minimal_ids[as_index(state.id())] = as_state_id(new_index); new_index += 1; } } // The total number of states in the minimal DFA. let minimal_count = new_index; // Convenience function for remapping state IDs. This takes an old ID, // looks up its Hopcroft partition and then maps that to the new ID // range. let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; // Re-map this DFA in place such that the only states remaining // correspond to the representative states of every equivalence class. for id in (0..self.dfa.state_len()).map(as_state_id) { // If this state isn't a representative for an equivalence class, // then we skip it since it won't appear in the minimal DFA. if state_to_part[as_index(id)] != id { continue; } self.dfa.remap_state(id, remap); self.dfa.swap_states(id, minimal_ids[as_index(id)]); } // Trim off all unused states from the pre-minimized DFA. This // represents all states that were merged into a non-singleton // equivalence class of states, and appeared after the first state // in each such class. (Because the state with the smallest ID in each // equivalence class is its representative ID.) self.dfa.truncate_states(minimal_count); // Update the new start states, which is now just the minimal ID of // whatever state the old start state was collapsed into. Also, we // collect everything before-hand to work around the borrow checker. // We're already allocating so much that this is probably fine. If this // turns out to be costly, then I guess add a `starts_mut` iterator. let starts: Vec<_> = self.dfa.starts().collect(); for (old_start_id, anchored, start_type) in starts { self.dfa.set_start_state( anchored, start_type, remap(old_start_id), ); } // Update the match state pattern ID list for multi-regexes. All we // need to do is remap the match state IDs. The pattern ID lists are // always the same as they were since match states with distinct // pattern ID lists are always considered distinct states. let mut pmap = BTreeMap::new(); for (match_id, pattern_ids) in self.dfa.pattern_map() { let new_id = remap(match_id); pmap.insert(new_id, pattern_ids); } // This unwrap is OK because minimization never increases the number of // match states or patterns in those match states. Since minimization // runs after the pattern map has already been set at least once, we // know that our match states cannot error. self.dfa.set_pattern_map(&pmap).unwrap(); // In order to update the ID of the maximum match state, we need to // find the maximum ID among all of the match states in the minimized // DFA. This is not necessarily the new ID of the unminimized maximum // match state, since that could have been collapsed with a much // earlier match state. Therefore, to find the new max match state, // we iterate over all previous match states, find their corresponding // new minimal ID, and take the maximum of those. let old = self.dfa.special().clone(); let new = self.dfa.special_mut(); // ... but only remap if we had match states. if old.matches() { new.min_match = StateID::MAX; new.max_match = StateID::ZERO; for i in as_index(old.min_match)..=as_index(old.max_match) { let new_id = remap(as_state_id(i)); if new_id < new.min_match { new.min_match = new_id; } if new_id > new.max_match { new.max_match = new_id; } } } // ... same, but for start states. if old.starts() { new.min_start = StateID::MAX; new.max_start = StateID::ZERO; for i in as_index(old.min_start)..=as_index(old.max_start) { let new_id = remap(as_state_id(i)); if new_id == DEAD { continue; } if new_id < new.min_start { new.min_start = new_id; } if new_id > new.max_start { new.max_start = new_id; } } if new.max_start == DEAD { new.min_start = DEAD; } } new.quit_id = remap(new.quit_id); new.set_max(); } fn find_waiting(&self, set: &StateSet) -> Option { self.waiting.iter().position(|s| s == set) } fn find_incoming_to( &self, b: alphabet::Unit, set: &StateSet, incoming: &mut StateSet, ) { incoming.clear(); set.iter(|id| { for &inid in &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] { incoming.add(inid); } }); incoming.canonicalize(); } fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec { // For match states, we know that two match states with different // pattern ID lists will *always* be distinct, so we can partition them // initially based on that. let mut matching: BTreeMap, StateSet> = BTreeMap::new(); let mut is_quit = StateSet::empty(); let mut no_match = StateSet::empty(); for state in dfa.states() { if dfa.is_match_state(state.id()) { let mut pids = vec![]; for i in 0..dfa.match_len(state.id()) { pids.push(dfa.match_pattern(state.id(), i)); } matching .entry(pids) .or_insert(StateSet::empty()) .add(state.id()); } else if dfa.is_quit_state(state.id()) { is_quit.add(state.id()); } else { no_match.add(state.id()); } } let mut sets: Vec = matching.into_iter().map(|(_, set)| set).collect(); sets.push(no_match); sets.push(is_quit); sets } fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec>> { let mut incoming = vec![]; for _ in dfa.states() { incoming.push(vec![vec![]; dfa.alphabet_len()]); } for state in dfa.states() { for (b, next) in state.transitions() { incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); } } incoming } } impl StateSet { fn empty() -> StateSet { StateSet { ids: Rc::new(RefCell::new(vec![])) } } fn add(&mut self, id: StateID) { self.ids.borrow_mut().push(id); } fn min(&self) -> StateID { self.ids.borrow()[0] } fn canonicalize(&mut self) { self.ids.borrow_mut().sort(); self.ids.borrow_mut().dedup(); } fn clear(&mut self) { self.ids.borrow_mut().clear(); } fn len(&self) -> usize { self.ids.borrow().len() } fn is_empty(&self) -> bool { self.len() == 0 } fn deep_clone(&self) -> StateSet { let ids = self.ids.borrow().iter().cloned().collect(); StateSet { ids: Rc::new(RefCell::new(ids)) } } fn iter(&self, mut f: F) { for &id in self.ids.borrow().iter() { f(id); } } fn intersection(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { return; } let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => break, Some(b) => b, }; } else if a < b { a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => break, Some(b) => b, }; } } } fn subtract(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { self.iter(|s| dest.add(s)); return; } let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } else if a < b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } } for a in ita { dest.add(a); } } }