summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/dfa/determinize.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/dfa/determinize.rs')
-rw-r--r--third_party/rust/regex-automata/src/dfa/determinize.rs599
1 files changed, 599 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/dfa/determinize.rs b/third_party/rust/regex-automata/src/dfa/determinize.rs
new file mode 100644
index 0000000000..19f99f5d64
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/determinize.rs
@@ -0,0 +1,599 @@
+use alloc::{collections::BTreeMap, vec::Vec};
+
+use crate::{
+ dfa::{
+ dense::{self, BuildError},
+ DEAD,
+ },
+ nfa::thompson,
+ util::{
+ self,
+ alphabet::{self, ByteSet},
+ determinize::{State, StateBuilderEmpty, StateBuilderNFA},
+ primitives::{PatternID, StateID},
+ search::{Anchored, MatchKind},
+ sparse_set::SparseSets,
+ start::Start,
+ },
+};
+
+/// A builder for configuring and running a DFA determinizer.
+#[derive(Clone, Debug)]
+pub(crate) struct Config {
+ match_kind: MatchKind,
+ quit: ByteSet,
+ dfa_size_limit: Option<usize>,
+ determinize_size_limit: Option<usize>,
+}
+
+impl Config {
+ /// Create a new default config for a determinizer. The determinizer may be
+ /// configured before calling `run`.
+ pub fn new() -> Config {
+ Config {
+ match_kind: MatchKind::LeftmostFirst,
+ quit: ByteSet::empty(),
+ dfa_size_limit: None,
+ determinize_size_limit: None,
+ }
+ }
+
+ /// Run determinization on the given NFA and write the resulting DFA into
+ /// the one given. The DFA given should be initialized but otherwise empty.
+ /// "Initialized" means that it is setup to handle the NFA's byte classes,
+ /// number of patterns and whether to build start states for each pattern.
+ pub fn run(
+ &self,
+ nfa: &thompson::NFA,
+ dfa: &mut dense::OwnedDFA,
+ ) -> Result<(), BuildError> {
+ let dead = State::dead();
+ let quit = State::dead();
+ let mut cache = StateMap::default();
+ // We only insert the dead state here since its representation is
+ // identical to the quit state. And we never want anything pointing
+ // to the quit state other than specific transitions derived from the
+ // determinizer's configured "quit" bytes.
+ //
+ // We do put the quit state into 'builder_states' below. This ensures
+ // that a proper DFA state ID is allocated for it, and that no other
+ // DFA state uses the "location after the DEAD state." That is, it
+ // is assumed that the quit state is always the state immediately
+ // following the DEAD state.
+ cache.insert(dead.clone(), DEAD);
+
+ let runner = Runner {
+ config: self.clone(),
+ nfa,
+ dfa,
+ builder_states: alloc::vec![dead, quit],
+ cache,
+ memory_usage_state: 0,
+ sparses: SparseSets::new(nfa.states().len()),
+ stack: alloc::vec![],
+ scratch_state_builder: StateBuilderEmpty::new(),
+ };
+ runner.run()
+ }
+
+ /// The match semantics to use for determinization.
+ ///
+ /// MatchKind::All corresponds to the standard textbook construction.
+ /// All possible match states are represented in the DFA.
+ /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
+ /// simulate the match semantics of backtracking regex engines. Namely,
+ /// only a subset of match states are built, and dead states are used to
+ /// stop searches with an unanchored prefix.
+ ///
+ /// The default is MatchKind::LeftmostFirst.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.match_kind = kind;
+ self
+ }
+
+ /// The set of bytes to use that will cause the DFA to enter a quit state,
+ /// stop searching and return an error. By default, this is empty.
+ pub fn quit(&mut self, set: ByteSet) -> &mut Config {
+ self.quit = set;
+ self
+ }
+
+ /// The limit, in bytes of the heap, that the DFA is permitted to use. This
+ /// does not include the auxiliary heap storage used by determinization.
+ pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
+ self.dfa_size_limit = bytes;
+ self
+ }
+
+ /// The limit, in bytes of the heap, that determinization itself is allowed
+ /// to use. This does not include the size of the DFA being built.
+ pub fn determinize_size_limit(
+ &mut self,
+ bytes: Option<usize>,
+ ) -> &mut Config {
+ self.determinize_size_limit = bytes;
+ self
+ }
+}
+
+/// The actual implementation of determinization that converts an NFA to a DFA
+/// through powerset construction.
+///
+/// This determinizer roughly follows the typical powerset construction, where
+/// each DFA state is comprised of one or more NFA states. In the worst case,
+/// there is one DFA state for every possible combination of NFA states. In
+/// practice, this only happens in certain conditions, typically when there are
+/// bounded repetitions.
+///
+/// The main differences between this implementation and typical deteminization
+/// are that this implementation delays matches by one state and hackily makes
+/// look-around work. Comments below attempt to explain this.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
+/// whichever is shorter.
+#[derive(Debug)]
+struct Runner<'a> {
+ /// The configuration used to initialize determinization.
+ config: Config,
+ /// The NFA we're converting into a DFA.
+ nfa: &'a thompson::NFA,
+ /// The DFA we're building.
+ dfa: &'a mut dense::OwnedDFA,
+ /// Each DFA state being built is defined as an *ordered* set of NFA
+ /// states, along with some meta facts about the ordered set of NFA states.
+ ///
+ /// This is never empty. The first state is always a dummy state such that
+ /// a state id == 0 corresponds to a dead state. The second state is always
+ /// the quit state.
+ ///
+ /// Why do we have states in both a `Vec` and in a cache map below?
+ /// Well, they serve two different roles based on access patterns.
+ /// `builder_states` is the canonical home of each state, and provides
+ /// constant random access by a DFA state's ID. The cache map below, on
+ /// the other hand, provides a quick way of searching for identical DFA
+ /// states by using the DFA state as a key in the map. Of course, we use
+ /// reference counting to avoid actually duplicating the state's data
+ /// itself. (Although this has never been benchmarked.) Note that the cache
+ /// map does not give us full minimization; it just lets us avoid some very
+ /// obvious redundant states.
+ ///
+ /// Note that the index into this Vec isn't quite the DFA's state ID.
+ /// Rather, it's just an index. To get the state ID, you have to multiply
+ /// it by the DFA's stride. That's done by self.dfa.from_index. And the
+ /// inverse is self.dfa.to_index.
+ ///
+ /// Moreover, DFA states don't usually retain the IDs assigned to them
+ /// by their position in this Vec. After determinization completes,
+ /// states are shuffled around to support other optimizations. See the
+ /// sibling 'special' module for more details on that. (The reason for
+ /// mentioning this is that if you print out the DFA for debugging during
+ /// determinization, and then print out the final DFA after it is fully
+ /// built, then the state IDs likely won't match up.)
+ builder_states: Vec<State>,
+ /// A cache of DFA states that already exist and can be easily looked up
+ /// via ordered sets of NFA states.
+ ///
+ /// See `builder_states` docs for why we store states in two different
+ /// ways.
+ cache: StateMap,
+ /// The memory usage, in bytes, used by builder_states and cache. We track
+ /// this as new states are added since states use a variable amount of
+ /// heap. Tracking this as we add states makes it possible to compute the
+ /// total amount of memory used by the determinizer in constant time.
+ memory_usage_state: usize,
+ /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
+ /// These are reused throughout determinization. A bounded sparse set
+ /// gives us constant time insertion, membership testing and clearing.
+ sparses: SparseSets,
+ /// Scratch space for a stack of NFA states to visit, for depth first
+ /// visiting without recursion.
+ stack: Vec<StateID>,
+ /// Scratch space for storing an ordered sequence of NFA states, for
+ /// amortizing allocation. This is principally useful for when we avoid
+ /// adding a new DFA state since it already exists. In order to detect this
+ /// case though, we still need an ordered set of NFA state IDs. So we use
+ /// this space to stage that ordered set before we know whether we need to
+ /// create a new DFA state or not.
+ scratch_state_builder: StateBuilderEmpty,
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// one's based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, StateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = BTreeMap<State, StateID>;
+
+impl<'a> Runner<'a> {
+ /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+ /// the chosen state identifier representation is too small), then an error
+ /// is returned.
+ fn run(mut self) -> Result<(), BuildError> {
+ if self.nfa.look_set_any().contains_word_unicode()
+ && !self.config.quit.contains_range(0x80, 0xFF)
+ {
+ return Err(BuildError::unsupported_dfa_word_boundary_unicode());
+ }
+
+ // A sequence of "representative" bytes drawn from each equivalence
+ // class. These representative bytes are fed to the NFA to compute
+ // state transitions. This allows us to avoid re-computing state
+ // transitions for bytes that are guaranteed to produce identical
+ // results. Since computing the representatives needs to do a little
+ // work, we do it once here because we'll be iterating over them a lot.
+ let representatives: Vec<alphabet::Unit> =
+ self.dfa.byte_classes().representatives(..).collect();
+ // The set of all DFA state IDs that still need to have their
+ // transitions set. We start by seeding this with all starting states.
+ let mut uncompiled = alloc::vec![];
+ self.add_all_starts(&mut uncompiled)?;
+ while let Some(dfa_id) = uncompiled.pop() {
+ for &unit in &representatives {
+ if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
+ {
+ continue;
+ }
+ // In many cases, the state we transition to has already been
+ // computed. 'cached_state' will do the minimal amount of work
+ // to check this, and if it exists, immediately return an
+ // already existing state ID.
+ let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
+ self.dfa.set_transition(dfa_id, unit, next_dfa_id);
+ // If the state ID we got back is newly created, then we need
+ // to compile it, so add it to our uncompiled frontier.
+ if is_new {
+ uncompiled.push(next_dfa_id);
+ }
+ }
+ }
+ debug!(
+ "determinization complete, memory usage: {}, \
+ dense DFA size: {}, \
+ is reverse? {}",
+ self.memory_usage(),
+ self.dfa.memory_usage(),
+ self.nfa.is_reverse(),
+ );
+
+ // A map from DFA state ID to one or more NFA match IDs. Each NFA match
+ // ID corresponds to a distinct regex pattern that matches in the state
+ // corresponding to the key.
+ let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
+ self.cache.clear();
+ #[cfg(feature = "logging")]
+ let mut total_pat_len = 0;
+ for (i, state) in self.builder_states.into_iter().enumerate() {
+ if let Some(pat_ids) = state.match_pattern_ids() {
+ let id = self.dfa.to_state_id(i);
+ log! {
+ total_pat_len += pat_ids.len();
+ }
+ matches.insert(id, pat_ids);
+ }
+ }
+ log! {
+ use core::mem::size_of;
+ let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
+ let pats = total_pat_len * size_of::<PatternID>();
+ let mem = (matches.len() * per_elem) + pats;
+ log::debug!("matches map built, memory usage: {}", mem);
+ }
+ // At this point, we shuffle the "special" states in the final DFA.
+ // This permits a DFA's match loop to detect a match condition (among
+ // other things) by merely inspecting the current state's identifier,
+ // and avoids the need for any additional auxiliary storage.
+ self.dfa.shuffle(matches)?;
+ Ok(())
+ }
+
+ /// Return the identifier for the next DFA state given an existing DFA
+ /// state and an input byte. If the next DFA state already exists, then
+ /// return its identifier from the cache. Otherwise, build the state, cache
+ /// it and return its identifier.
+ ///
+ /// This routine returns a boolean indicating whether a new state was
+ /// built. If a new state is built, then the caller needs to add it to its
+ /// frontier of uncompiled DFA states to compute transitions for.
+ fn cached_state(
+ &mut self,
+ dfa_id: StateID,
+ unit: alphabet::Unit,
+ ) -> Result<(StateID, bool), BuildError> {
+ // Compute the set of all reachable NFA states, including epsilons.
+ let empty_builder = self.get_state_builder();
+ let builder = util::determinize::next(
+ self.nfa,
+ self.config.match_kind,
+ &mut self.sparses,
+ &mut self.stack,
+ &self.builder_states[self.dfa.to_index(dfa_id)],
+ unit,
+ empty_builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Compute the set of DFA start states and add their identifiers in
+ /// 'dfa_state_ids' (no duplicates are added).
+ fn add_all_starts(
+ &mut self,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), BuildError> {
+ // These should be the first states added.
+ assert!(dfa_state_ids.is_empty());
+ // We only want to add (un)anchored starting states that is consistent
+ // with our DFA's configuration. Unconditionally adding both (although
+ // it is the default) can make DFAs quite a bit bigger.
+ if self.dfa.start_kind().has_unanchored() {
+ self.add_start_group(Anchored::No, dfa_state_ids)?;
+ }
+ if self.dfa.start_kind().has_anchored() {
+ self.add_start_group(Anchored::Yes, dfa_state_ids)?;
+ }
+ // I previously has an 'assert' here checking that either
+ // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
+ // turns out this isn't always true. For example, the NFA might have
+ // one or more patterns but where all such patterns are just 'fail'
+ // states. These will ultimately just compile down to DFA dead states,
+ // and since the dead state was added earlier, no new DFA states are
+ // added. And thus, it is valid and okay for 'dfa_state_ids' to be
+ // empty even if there are a non-zero number of patterns in the NFA.
+
+ // We only need to compute anchored start states for each pattern if it
+ // was requested to do so.
+ if self.dfa.starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Add a group of start states for the given match pattern ID. Any new
+ /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
+ /// pushed.)
+ ///
+ /// When pattern_id is None, then this will compile a group of unanchored
+ /// start states (if the DFA is unanchored). When the pattern_id is
+ /// present, then this will compile a group of anchored start states that
+ /// only match the given pattern.
+ ///
+ /// This panics if `anchored` corresponds to an invalid pattern ID.
+ fn add_start_group(
+ &mut self,
+ anchored: Anchored,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), BuildError> {
+ let nfa_start = match anchored {
+ Anchored::No => self.nfa.start_unanchored(),
+ Anchored::Yes => self.nfa.start_anchored(),
+ Anchored::Pattern(pid) => {
+ self.nfa.start_pattern(pid).expect("valid pattern ID")
+ }
+ };
+
+ // When compiling start states, we're careful not to build additional
+ // states that aren't necessary. For example, if the NFA has no word
+ // boundary assertion, then there's no reason to have distinct start
+ // states for 'NonWordByte' and 'WordByte' starting configurations.
+ // Instead, the 'WordByte' starting configuration can just point
+ // directly to the start state for the 'NonWordByte' config.
+ //
+ // Note though that we only need to care about assertions in the prefix
+ // of an NFA since this only concerns the starting states. (Actually,
+ // the most precisely thing we could do it is look at the prefix
+ // assertions of each pattern when 'anchored == Anchored::Pattern',
+ // and then only compile extra states if the prefix is non-empty.) But
+ // we settle for simplicity here instead of absolute minimalism. It is
+ // somewhat rare, after all, for multiple patterns in the same regex to
+ // have different prefix look-arounds.
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::NonWordByte)?;
+ self.dfa.set_start_state(anchored, Start::NonWordByte, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ if !self.nfa.look_set_prefix_any().contains_word() {
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
+ } else {
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::WordByte)?;
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+ if !self.nfa.look_set_prefix_any().contains_anchor() {
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
+ } else {
+ let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Add a new DFA start state corresponding to the given starting NFA
+ /// state, and the starting search configuration. (The starting search
+ /// configuration essentially tells us which look-behind assertions are
+ /// true for this particular state.)
+ ///
+ /// The boolean returned indicates whether the state ID returned is a newly
+ /// created state, or a previously cached state.
+ fn add_one_start(
+ &mut self,
+ nfa_start: StateID,
+ start: Start,
+ ) -> Result<(StateID, bool), BuildError> {
+ // Compute the look-behind assertions that are true in this starting
+ // configuration, and the determine the epsilon closure. While
+ // computing the epsilon closure, we only follow condiional epsilon
+ // transitions that satisfy the look-behind assertions in 'look_have'.
+ let mut builder_matches = self.get_state_builder().into_matches();
+ util::determinize::set_lookbehind_from_start(
+ self.nfa,
+ &start,
+ &mut builder_matches,
+ );
+ self.sparses.set1.clear();
+ util::determinize::epsilon_closure(
+ self.nfa,
+ nfa_start,
+ builder_matches.look_have(),
+ &mut self.stack,
+ &mut self.sparses.set1,
+ );
+ let mut builder = builder_matches.into_nfa();
+ util::determinize::add_nfa_states(
+ &self.nfa,
+ &self.sparses.set1,
+ &mut builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Adds the given state to the DFA being built depending on whether it
+ /// already exists in this determinizer's cache.
+ ///
+ /// If it does exist, then the memory used by 'state' is put back into the
+ /// determinizer and the previously created state's ID is returned. (Along
+ /// with 'false', indicating that no new state was added.)
+ ///
+ /// If it does not exist, then the state is added to the DFA being built
+ /// and a fresh ID is allocated (if ID allocation fails, then an error is
+ /// returned) and returned. (Along with 'true', indicating that a new state
+ /// was added.)
+ fn maybe_add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<(StateID, bool), BuildError> {
+ if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ self.put_state_builder(builder);
+ return Ok((cached_id, false));
+ }
+ self.add_state(builder).map(|sid| (sid, true))
+ }
+
+ /// Add the given state to the DFA and make it available in the cache.
+ ///
+ /// The state initially has no transitions. That is, it transitions to the
+ /// dead state for all possible inputs, and transitions to the quit state
+ /// for all quit bytes.
+ ///
+ /// If adding the state would exceed the maximum value for StateID, then an
+ /// error is returned.
+ fn add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<StateID, BuildError> {
+ let id = self.dfa.add_empty_state()?;
+ if !self.config.quit.is_empty() {
+ for b in self.config.quit.iter() {
+ self.dfa.set_transition(
+ id,
+ alphabet::Unit::u8(b),
+ self.dfa.quit_id(),
+ );
+ }
+ }
+ let state = builder.to_state();
+ // States use reference counting internally, so we only need to count
+ // their memory usage once.
+ self.memory_usage_state += state.memory_usage();
+ self.builder_states.push(state.clone());
+ self.cache.insert(state, id);
+ self.put_state_builder(builder);
+ if let Some(limit) = self.config.dfa_size_limit {
+ if self.dfa.memory_usage() > limit {
+ return Err(BuildError::dfa_exceeded_size_limit(limit));
+ }
+ }
+ if let Some(limit) = self.config.determinize_size_limit {
+ if self.memory_usage() > limit {
+ return Err(BuildError::determinize_exceeded_size_limit(
+ limit,
+ ));
+ }
+ }
+ Ok(id)
+ }
+
+ /// Returns a state builder from this determinizer that might have existing
+ /// capacity. This helps avoid allocs in cases where a state is built that
+ /// turns out to already be cached.
+ ///
+ /// Callers must put the state builder back with 'put_state_builder',
+ /// otherwise the allocation reuse won't work.
+ fn get_state_builder(&mut self) -> StateBuilderEmpty {
+ core::mem::replace(
+ &mut self.scratch_state_builder,
+ StateBuilderEmpty::new(),
+ )
+ }
+
+ /// Puts the given state builder back into this determinizer for reuse.
+ ///
+ /// Note that building a 'State' from a builder always creates a new
+ /// alloc, so callers should always put the builder back.
+ fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+ let _ = core::mem::replace(
+ &mut self.scratch_state_builder,
+ builder.clear(),
+ );
+ }
+
+ /// Return the memory usage, in bytes, of this determinizer at the current
+ /// point in time. This does not include memory used by the NFA or the
+ /// dense DFA itself.
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ self.builder_states.len() * size_of::<State>()
+ // Maps likely use more memory than this, but it's probably close.
+ + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
+ + self.memory_usage_state
+ + self.stack.capacity() * size_of::<StateID>()
+ + self.scratch_state_builder.capacity()
+ }
+}