use std::collections::HashMap; use std::mem; use std::rc::Rc; use dense; use error::Result; use nfa::{self, NFA}; use sparse_set::SparseSet; use state_id::{dead_id, StateID}; type DFARepr = dense::Repr, S>; /// A determinizer converts an NFA to a DFA. /// /// This determinizer follows the typical powerset construction, where each /// DFA state is comprised of one or more NFA states. In the worst case, there /// is one DFA state for every possible combination of NFA states. In practice, /// this only happens in certain conditions, typically when there are bounded /// repetitions. /// /// The type variable `S` refers to the chosen state identifier representation /// used for the DFA. /// /// The lifetime variable `'a` refers to the lifetime of the NFA being /// converted to a DFA. #[derive(Debug)] pub(crate) struct Determinizer<'a, S: StateID> { /// The NFA we're converting into a DFA. nfa: &'a NFA, /// The DFA we're building. dfa: DFARepr, /// Each DFA state being built is defined as an *ordered* set of NFA /// states, along with a flag indicating whether the state is a match /// state or not. /// /// This is never empty. The first state is always a dummy state such that /// a state id == 0 corresponds to a dead state. builder_states: Vec>, /// A cache of DFA states that already exist and can be easily looked up /// via ordered sets of NFA states. cache: HashMap, S>, /// Scratch space for a stack of NFA states to visit, for depth first /// visiting without recursion. stack: Vec, /// Scratch space for storing an ordered sequence of NFA states, for /// amortizing allocation. scratch_nfa_states: Vec, /// Whether to build a DFA that finds the longest possible match. longest_match: bool, } /// An intermediate representation for a DFA state during determinization. #[derive(Debug, Eq, Hash, PartialEq)] struct State { /// Whether this state is a match state or not. is_match: bool, /// An ordered sequence of NFA states that make up this DFA state. nfa_states: Vec, } impl<'a, S: StateID> Determinizer<'a, S> { /// Create a new determinizer for converting the given NFA to a DFA. pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { let dead = Rc::new(State::dead()); let mut cache = HashMap::default(); cache.insert(dead.clone(), dead_id()); Determinizer { nfa, dfa: DFARepr::empty().anchored(nfa.is_anchored()), builder_states: vec![dead], cache, stack: vec![], scratch_nfa_states: vec![], longest_match: false, } } /// Instruct the determinizer to use equivalence classes as the transition /// alphabet instead of all possible byte values. pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { let byte_classes = self.nfa.byte_classes().clone(); self.dfa = DFARepr::empty_with_byte_classes(byte_classes) .anchored(self.nfa.is_anchored()); self } /// Instruct the determinizer to build a DFA that recognizes the longest /// possible match instead of the leftmost first match. This is useful when /// constructing reverse DFAs for finding the start of a match. pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { self.longest_match = yes; self } /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. pub fn build(mut self) -> Result> { let representative_bytes: Vec = self.dfa.byte_classes().representatives().collect(); let mut sparse = self.new_sparse_set(); let mut uncompiled = vec![self.add_start(&mut sparse)?]; while let Some(dfa_id) = uncompiled.pop() { for &b in &representative_bytes { let (next_dfa_id, is_new) = self.cached_state(dfa_id, b, &mut sparse)?; self.dfa.add_transition(dfa_id, b, next_dfa_id); if is_new { uncompiled.push(next_dfa_id); } } } // At this point, we shuffle the matching states in the final DFA to // the beginning. This permits a DFA's match loop to detect a match // condition by merely inspecting the current state's identifier, and // avoids the need for any additional auxiliary storage. let is_match: Vec = self.builder_states.iter().map(|s| s.is_match).collect(); self.dfa.shuffle_match_states(&is_match); Ok(self.dfa) } /// Return the identifier for the next DFA state given an existing DFA /// state and an input byte. If the next DFA state already exists, then /// return its identifier from the cache. Otherwise, build the state, cache /// it and return its identifier. /// /// The given sparse set is used for scratch space. It must have a capacity /// equivalent to the total number of NFA states, but its contents are /// otherwise unspecified. /// /// This routine returns a boolean indicating whether a new state was /// built. If a new state is built, then the caller needs to add it to its /// frontier of uncompiled DFA states to compute transitions for. fn cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)> { sparse.clear(); // Compute the set of all reachable NFA states, including epsilons. self.next(dfa_id, b, sparse); // Build a candidate state and check if it has already been built. let state = self.new_state(sparse); if let Some(&cached_id) = self.cache.get(&state) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. let _ = mem::replace(&mut self.scratch_nfa_states, state.nfa_states); return Ok((cached_id, false)); } // Nothing was in the cache, so add this state to the cache. self.add_state(state).map(|s| (s, true)) } /// Compute the set of all eachable NFA states, including the full epsilon /// closure, from a DFA state for a single byte of input. fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { next_nfa_states.clear(); for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; match *self.nfa.state(nfa_id) { nfa::State::Union { .. } | nfa::State::Fail | nfa::State::Match => {} nfa::State::Range { range: ref r } => { if r.start <= b && b <= r.end { self.epsilon_closure(r.next, next_nfa_states); } } nfa::State::Sparse { ref ranges } => { for r in ranges.iter() { if r.start > b { break; } else if r.start <= b && b <= r.end { self.epsilon_closure(r.next, next_nfa_states); break; } } } } } } /// Compute the epsilon closure for the given NFA state. fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { if !self.nfa.state(start).is_epsilon() { set.insert(start); return; } self.stack.push(start); while let Some(mut id) = self.stack.pop() { loop { if set.contains(id) { break; } set.insert(id); match *self.nfa.state(id) { nfa::State::Range { .. } | nfa::State::Sparse { .. } | nfa::State::Fail | nfa::State::Match => break, nfa::State::Union { ref alternates } => { id = match alternates.get(0) { None => break, Some(&id) => id, }; self.stack.extend(alternates[1..].iter().rev()); } } } } } /// Compute the initial DFA state and return its identifier. /// /// The sparse set given is used for scratch space, and must have capacity /// equal to the total number of NFA states. Its contents are unspecified. fn add_start(&mut self, sparse: &mut SparseSet) -> Result { sparse.clear(); self.epsilon_closure(self.nfa.start(), sparse); let state = self.new_state(&sparse); let id = self.add_state(state)?; self.dfa.set_start_state(id); Ok(id) } /// Add the given state to the DFA and make it available in the cache. /// /// The state initially has no transitions. That is, it transitions to the /// dead state for all possible inputs. fn add_state(&mut self, state: State) -> Result { let id = self.dfa.add_empty_state()?; let rstate = Rc::new(state); self.builder_states.push(rstate.clone()); self.cache.insert(rstate, id); Ok(id) } /// Convert the given set of ordered NFA states to a DFA state. fn new_state(&mut self, set: &SparseSet) -> State { let mut state = State { is_match: false, nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), }; state.nfa_states.clear(); for &id in set { match *self.nfa.state(id) { nfa::State::Range { .. } => { state.nfa_states.push(id); } nfa::State::Sparse { .. } => { state.nfa_states.push(id); } nfa::State::Fail => { break; } nfa::State::Match => { state.is_match = true; if !self.longest_match { break; } } nfa::State::Union { .. } => {} } } state } /// Create a new sparse set with enough capacity to hold all NFA states. fn new_sparse_set(&self) -> SparseSet { SparseSet::new(self.nfa.len()) } } impl State { /// Create a new empty dead state. fn dead() -> State { State { nfa_states: vec![], is_match: false } } }