diff options
Diffstat (limited to 'vendor/regex-automata/src/dfa/determinize.rs')
-rw-r--r-- | vendor/regex-automata/src/dfa/determinize.rs | 599 |
1 files changed, 599 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs new file mode 100644 index 0000000..19f99f5 --- /dev/null +++ b/vendor/regex-automata/src/dfa/determinize.rs @@ -0,0 +1,599 @@ +use alloc::{collections::BTreeMap, vec::Vec}; + +use crate::{ + dfa::{ + dense::{self, BuildError}, + DEAD, + }, + nfa::thompson, + util::{ + self, + alphabet::{self, ByteSet}, + determinize::{State, StateBuilderEmpty, StateBuilderNFA}, + primitives::{PatternID, StateID}, + search::{Anchored, MatchKind}, + sparse_set::SparseSets, + start::Start, + }, +}; + +/// A builder for configuring and running a DFA determinizer. +#[derive(Clone, Debug)] +pub(crate) struct Config { + match_kind: MatchKind, + quit: ByteSet, + dfa_size_limit: Option<usize>, + determinize_size_limit: Option<usize>, +} + +impl Config { + /// Create a new default config for a determinizer. The determinizer may be + /// configured before calling `run`. + pub fn new() -> Config { + Config { + match_kind: MatchKind::LeftmostFirst, + quit: ByteSet::empty(), + dfa_size_limit: None, + determinize_size_limit: None, + } + } + + /// Run determinization on the given NFA and write the resulting DFA into + /// the one given. The DFA given should be initialized but otherwise empty. + /// "Initialized" means that it is setup to handle the NFA's byte classes, + /// number of patterns and whether to build start states for each pattern. + pub fn run( + &self, + nfa: &thompson::NFA, + dfa: &mut dense::OwnedDFA, + ) -> Result<(), BuildError> { + let dead = State::dead(); + let quit = State::dead(); + let mut cache = StateMap::default(); + // We only insert the dead state here since its representation is + // identical to the quit state. And we never want anything pointing + // to the quit state other than specific transitions derived from the + // determinizer's configured "quit" bytes. + // + // We do put the quit state into 'builder_states' below. This ensures + // that a proper DFA state ID is allocated for it, and that no other + // DFA state uses the "location after the DEAD state." That is, it + // is assumed that the quit state is always the state immediately + // following the DEAD state. + cache.insert(dead.clone(), DEAD); + + let runner = Runner { + config: self.clone(), + nfa, + dfa, + builder_states: alloc::vec![dead, quit], + cache, + memory_usage_state: 0, + sparses: SparseSets::new(nfa.states().len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + }; + runner.run() + } + + /// The match semantics to use for determinization. + /// + /// MatchKind::All corresponds to the standard textbook construction. + /// All possible match states are represented in the DFA. + /// MatchKind::LeftmostFirst permits greediness and otherwise tries to + /// simulate the match semantics of backtracking regex engines. Namely, + /// only a subset of match states are built, and dead states are used to + /// stop searches with an unanchored prefix. + /// + /// The default is MatchKind::LeftmostFirst. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.match_kind = kind; + self + } + + /// The set of bytes to use that will cause the DFA to enter a quit state, + /// stop searching and return an error. By default, this is empty. + pub fn quit(&mut self, set: ByteSet) -> &mut Config { + self.quit = set; + self + } + + /// The limit, in bytes of the heap, that the DFA is permitted to use. This + /// does not include the auxiliary heap storage used by determinization. + pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config { + self.dfa_size_limit = bytes; + self + } + + /// The limit, in bytes of the heap, that determinization itself is allowed + /// to use. This does not include the size of the DFA being built. + pub fn determinize_size_limit( + &mut self, + bytes: Option<usize>, + ) -> &mut Config { + self.determinize_size_limit = bytes; + self + } +} + +/// The actual implementation of determinization that converts an NFA to a DFA +/// through powerset construction. +/// +/// This determinizer roughly follows the typical powerset construction, where +/// each DFA state is comprised of one or more NFA states. In the worst case, +/// there is one DFA state for every possible combination of NFA states. In +/// practice, this only happens in certain conditions, typically when there are +/// bounded repetitions. +/// +/// The main differences between this implementation and typical deteminization +/// are that this implementation delays matches by one state and hackily makes +/// look-around work. Comments below attempt to explain this. +/// +/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA, +/// whichever is shorter. +#[derive(Debug)] +struct Runner<'a> { + /// The configuration used to initialize determinization. + config: Config, + /// The NFA we're converting into a DFA. + nfa: &'a thompson::NFA, + /// The DFA we're building. + dfa: &'a mut dense::OwnedDFA, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with some meta facts about the ordered set of NFA states. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. The second state is always + /// the quit state. + /// + /// Why do we have states in both a `Vec` and in a cache map below? + /// Well, they serve two different roles based on access patterns. + /// `builder_states` is the canonical home of each state, and provides + /// constant random access by a DFA state's ID. The cache map below, on + /// the other hand, provides a quick way of searching for identical DFA + /// states by using the DFA state as a key in the map. Of course, we use + /// reference counting to avoid actually duplicating the state's data + /// itself. (Although this has never been benchmarked.) Note that the cache + /// map does not give us full minimization; it just lets us avoid some very + /// obvious redundant states. + /// + /// Note that the index into this Vec isn't quite the DFA's state ID. + /// Rather, it's just an index. To get the state ID, you have to multiply + /// it by the DFA's stride. That's done by self.dfa.from_index. And the + /// inverse is self.dfa.to_index. + /// + /// Moreover, DFA states don't usually retain the IDs assigned to them + /// by their position in this Vec. After determinization completes, + /// states are shuffled around to support other optimizations. See the + /// sibling 'special' module for more details on that. (The reason for + /// mentioning this is that if you print out the DFA for debugging during + /// determinization, and then print out the final DFA after it is fully + /// built, then the state IDs likely won't match up.) + builder_states: Vec<State>, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + /// + /// See `builder_states` docs for why we store states in two different + /// ways. + cache: StateMap, + /// The memory usage, in bytes, used by builder_states and cache. We track + /// this as new states are added since states use a variable amount of + /// heap. Tracking this as we add states makes it possible to compute the + /// total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// A pair of sparse sets for tracking ordered sets of NFA state IDs. + /// These are reused throughout determinization. A bounded sparse set + /// gives us constant time insertion, membership testing and clearing. + sparses: SparseSets, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec<StateID>, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. This is principally useful for when we avoid + /// adding a new DFA state since it already exists. In order to detect this + /// case though, we still need an ordered set of NFA state IDs. So we use + /// this space to stage that ordered set before we know whether we need to + /// create a new DFA state or not. + scratch_state_builder: StateBuilderEmpty, +} + +/// A map from states to state identifiers. When using std, we use a standard +/// hashmap, since it's a bit faster for this use case. (Other maps, like +/// one's based on FNV, have not yet been benchmarked.) +/// +/// The main purpose of this map is to reuse states where possible. This won't +/// fully minimize the DFA, but it works well in a lot of cases. +#[cfg(feature = "std")] +type StateMap = std::collections::HashMap<State, StateID>; +#[cfg(not(feature = "std"))] +type StateMap = BTreeMap<State, StateID>; + +impl<'a> Runner<'a> { + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned. + fn run(mut self) -> Result<(), BuildError> { + if self.nfa.look_set_any().contains_word_unicode() + && !self.config.quit.contains_range(0x80, 0xFF) + { + return Err(BuildError::unsupported_dfa_word_boundary_unicode()); + } + + // A sequence of "representative" bytes drawn from each equivalence + // class. These representative bytes are fed to the NFA to compute + // state transitions. This allows us to avoid re-computing state + // transitions for bytes that are guaranteed to produce identical + // results. Since computing the representatives needs to do a little + // work, we do it once here because we'll be iterating over them a lot. + let representatives: Vec<alphabet::Unit> = + self.dfa.byte_classes().representatives(..).collect(); + // The set of all DFA state IDs that still need to have their + // transitions set. We start by seeding this with all starting states. + let mut uncompiled = alloc::vec![]; + self.add_all_starts(&mut uncompiled)?; + while let Some(dfa_id) = uncompiled.pop() { + for &unit in &representatives { + if unit.as_u8().map_or(false, |b| self.config.quit.contains(b)) + { + continue; + } + // In many cases, the state we transition to has already been + // computed. 'cached_state' will do the minimal amount of work + // to check this, and if it exists, immediately return an + // already existing state ID. + let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?; + self.dfa.set_transition(dfa_id, unit, next_dfa_id); + // If the state ID we got back is newly created, then we need + // to compile it, so add it to our uncompiled frontier. + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + debug!( + "determinization complete, memory usage: {}, \ + dense DFA size: {}, \ + is reverse? {}", + self.memory_usage(), + self.dfa.memory_usage(), + self.nfa.is_reverse(), + ); + + // A map from DFA state ID to one or more NFA match IDs. Each NFA match + // ID corresponds to a distinct regex pattern that matches in the state + // corresponding to the key. + let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); + self.cache.clear(); + #[cfg(feature = "logging")] + let mut total_pat_len = 0; + for (i, state) in self.builder_states.into_iter().enumerate() { + if let Some(pat_ids) = state.match_pattern_ids() { + let id = self.dfa.to_state_id(i); + log! { + total_pat_len += pat_ids.len(); + } + matches.insert(id, pat_ids); + } + } + log! { + use core::mem::size_of; + let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); + let pats = total_pat_len * size_of::<PatternID>(); + let mem = (matches.len() * per_elem) + pats; + log::debug!("matches map built, memory usage: {}", mem); + } + // At this point, we shuffle the "special" states in the final DFA. + // This permits a DFA's match loop to detect a match condition (among + // other things) by merely inspecting the current state's identifier, + // and avoids the need for any additional auxiliary storage. + self.dfa.shuffle(matches)?; + Ok(()) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: StateID, + unit: alphabet::Unit, + ) -> Result<(StateID, bool), BuildError> { + // Compute the set of all reachable NFA states, including epsilons. + let empty_builder = self.get_state_builder(); + let builder = util::determinize::next( + self.nfa, + self.config.match_kind, + &mut self.sparses, + &mut self.stack, + &self.builder_states[self.dfa.to_index(dfa_id)], + unit, + empty_builder, + ); + self.maybe_add_state(builder) + } + + /// Compute the set of DFA start states and add their identifiers in + /// 'dfa_state_ids' (no duplicates are added). + fn add_all_starts( + &mut self, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), BuildError> { + // These should be the first states added. + assert!(dfa_state_ids.is_empty()); + // We only want to add (un)anchored starting states that is consistent + // with our DFA's configuration. Unconditionally adding both (although + // it is the default) can make DFAs quite a bit bigger. + if self.dfa.start_kind().has_unanchored() { + self.add_start_group(Anchored::No, dfa_state_ids)?; + } + if self.dfa.start_kind().has_anchored() { + self.add_start_group(Anchored::Yes, dfa_state_ids)?; + } + // I previously has an 'assert' here checking that either + // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it + // turns out this isn't always true. For example, the NFA might have + // one or more patterns but where all such patterns are just 'fail' + // states. These will ultimately just compile down to DFA dead states, + // and since the dead state was added earlier, no new DFA states are + // added. And thus, it is valid and okay for 'dfa_state_ids' to be + // empty even if there are a non-zero number of patterns in the NFA. + + // We only need to compute anchored start states for each pattern if it + // was requested to do so. + if self.dfa.starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; + } + } + Ok(()) + } + + /// Add a group of start states for the given match pattern ID. Any new + /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are + /// pushed.) + /// + /// When pattern_id is None, then this will compile a group of unanchored + /// start states (if the DFA is unanchored). When the pattern_id is + /// present, then this will compile a group of anchored start states that + /// only match the given pattern. + /// + /// This panics if `anchored` corresponds to an invalid pattern ID. + fn add_start_group( + &mut self, + anchored: Anchored, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), BuildError> { + let nfa_start = match anchored { + Anchored::No => self.nfa.start_unanchored(), + Anchored::Yes => self.nfa.start_anchored(), + Anchored::Pattern(pid) => { + self.nfa.start_pattern(pid).expect("valid pattern ID") + } + }; + + // When compiling start states, we're careful not to build additional + // states that aren't necessary. For example, if the NFA has no word + // boundary assertion, then there's no reason to have distinct start + // states for 'NonWordByte' and 'WordByte' starting configurations. + // Instead, the 'WordByte' starting configuration can just point + // directly to the start state for the 'NonWordByte' config. + // + // Note though that we only need to care about assertions in the prefix + // of an NFA since this only concerns the starting states. (Actually, + // the most precisely thing we could do it is look at the prefix + // assertions of each pattern when 'anchored == Anchored::Pattern', + // and then only compile extra states if the prefix is non-empty.) But + // we settle for simplicity here instead of absolute minimalism. It is + // somewhat rare, after all, for multiple patterns in the same regex to + // have different prefix look-arounds. + + let (id, is_new) = + self.add_one_start(nfa_start, Start::NonWordByte)?; + self.dfa.set_start_state(anchored, Start::NonWordByte, id); + if is_new { + dfa_state_ids.push(id); + } + + if !self.nfa.look_set_prefix_any().contains_word() { + self.dfa.set_start_state(anchored, Start::WordByte, id); + } else { + let (id, is_new) = + self.add_one_start(nfa_start, Start::WordByte)?; + self.dfa.set_start_state(anchored, Start::WordByte, id); + if is_new { + dfa_state_ids.push(id); + } + } + if !self.nfa.look_set_prefix_any().contains_anchor() { + self.dfa.set_start_state(anchored, Start::Text, id); + self.dfa.set_start_state(anchored, Start::LineLF, id); + self.dfa.set_start_state(anchored, Start::LineCR, id); + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); + } else { + let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; + self.dfa.set_start_state(anchored, Start::Text, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; + self.dfa.set_start_state(anchored, Start::LineLF, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; + self.dfa.set_start_state(anchored, Start::LineCR, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = + self.add_one_start(nfa_start, Start::CustomLineTerminator)?; + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); + if is_new { + dfa_state_ids.push(id); + } + } + + Ok(()) + } + + /// Add a new DFA start state corresponding to the given starting NFA + /// state, and the starting search configuration. (The starting search + /// configuration essentially tells us which look-behind assertions are + /// true for this particular state.) + /// + /// The boolean returned indicates whether the state ID returned is a newly + /// created state, or a previously cached state. + fn add_one_start( + &mut self, + nfa_start: StateID, + start: Start, + ) -> Result<(StateID, bool), BuildError> { + // Compute the look-behind assertions that are true in this starting + // configuration, and the determine the epsilon closure. While + // computing the epsilon closure, we only follow condiional epsilon + // transitions that satisfy the look-behind assertions in 'look_have'. + let mut builder_matches = self.get_state_builder().into_matches(); + util::determinize::set_lookbehind_from_start( + self.nfa, + &start, + &mut builder_matches, + ); + self.sparses.set1.clear(); + util::determinize::epsilon_closure( + self.nfa, + nfa_start, + builder_matches.look_have(), + &mut self.stack, + &mut self.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + util::determinize::add_nfa_states( + &self.nfa, + &self.sparses.set1, + &mut builder, + ); + self.maybe_add_state(builder) + } + + /// Adds the given state to the DFA being built depending on whether it + /// already exists in this determinizer's cache. + /// + /// If it does exist, then the memory used by 'state' is put back into the + /// determinizer and the previously created state's ID is returned. (Along + /// with 'false', indicating that no new state was added.) + /// + /// If it does not exist, then the state is added to the DFA being built + /// and a fresh ID is allocated (if ID allocation fails, then an error is + /// returned) and returned. (Along with 'true', indicating that a new state + /// was added.) + fn maybe_add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<(StateID, bool), BuildError> { + if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok((cached_id, false)); + } + self.add_state(builder).map(|sid| (sid, true)) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. That is, it transitions to the + /// dead state for all possible inputs, and transitions to the quit state + /// for all quit bytes. + /// + /// If adding the state would exceed the maximum value for StateID, then an + /// error is returned. + fn add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<StateID, BuildError> { + let id = self.dfa.add_empty_state()?; + if !self.config.quit.is_empty() { + for b in self.config.quit.iter() { + self.dfa.set_transition( + id, + alphabet::Unit::u8(b), + self.dfa.quit_id(), + ); + } + } + let state = builder.to_state(); + // States use reference counting internally, so we only need to count + // their memory usage once. + self.memory_usage_state += state.memory_usage(); + self.builder_states.push(state.clone()); + self.cache.insert(state, id); + self.put_state_builder(builder); + if let Some(limit) = self.config.dfa_size_limit { + if self.dfa.memory_usage() > limit { + return Err(BuildError::dfa_exceeded_size_limit(limit)); + } + } + if let Some(limit) = self.config.determinize_size_limit { + if self.memory_usage() > limit { + return Err(BuildError::determinize_exceeded_size_limit( + limit, + )); + } + } + Ok(id) + } + + /// Returns a state builder from this determinizer that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this determinizer for reuse. + /// + /// Note that building a 'State' from a builder always creates a new + /// alloc, so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.scratch_state_builder, + builder.clear(), + ); + } + + /// Return the memory usage, in bytes, of this determinizer at the current + /// point in time. This does not include memory used by the NFA or the + /// dense DFA itself. + fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.builder_states.len() * size_of::<State>() + // Maps likely use more memory than this, but it's probably close. + + self.cache.len() * (size_of::<State>() + size_of::<StateID>()) + + self.memory_usage_state + + self.stack.capacity() * size_of::<StateID>() + + self.scratch_state_builder.capacity() + } +} |