diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/dfa/determinize.rs | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/dfa/determinize.rs')
-rw-r--r-- | vendor/regex-automata/src/dfa/determinize.rs | 182 |
1 file changed, 117 insertions, 65 deletions
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs index 61603481b..19f99f5d6 100644 --- a/vendor/regex-automata/src/dfa/determinize.rs +++ b/vendor/regex-automata/src/dfa/determinize.rs @@ -1,18 +1,18 @@ -use alloc::{ - collections::BTreeMap, - vec::{self, Vec}, -}; +use alloc::{collections::BTreeMap, vec::Vec}; use crate::{ - dfa::{dense, Error, DEAD}, + dfa::{ + dense::{self, BuildError}, + DEAD, + }, nfa::thompson, util::{ self, alphabet::{self, ByteSet}, determinize::{State, StateBuilderEmpty, StateBuilderNFA}, - id::{PatternID, StateID}, - matchtypes::MatchKind, - sparse_set::{SparseSet, SparseSets}, + primitives::{PatternID, StateID}, + search::{Anchored, MatchKind}, + sparse_set::SparseSets, start::Start, }, }; @@ -20,7 +20,6 @@ use crate::{ /// A builder for configuring and running a DFA determinizer. #[derive(Clone, Debug)] pub(crate) struct Config { - anchored: bool, match_kind: MatchKind, quit: ByteSet, dfa_size_limit: Option<usize>, @@ -32,7 +31,6 @@ impl Config { /// configured before calling `run`. pub fn new() -> Config { Config { - anchored: false, match_kind: MatchKind::LeftmostFirst, quit: ByteSet::empty(), dfa_size_limit: None, @@ -48,7 +46,7 @@ impl Config { &self, nfa: &thompson::NFA, dfa: &mut dense::OwnedDFA, - ) -> Result<(), Error> { + ) -> Result<(), BuildError> { let dead = State::dead(); let quit = State::dead(); let mut cache = StateMap::default(); @@ -71,21 +69,13 @@ impl Config { builder_states: alloc::vec![dead, quit], cache, memory_usage_state: 0, - sparses: SparseSets::new(nfa.len()), + sparses: SparseSets::new(nfa.states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), }; runner.run() } - /// Whether to build an anchored DFA or not. When disabled (the default), - /// the unanchored prefix from the NFA is used to start the DFA. Otherwise, - /// the anchored start state of the NFA is used to start the DFA. 
- pub fn anchored(&mut self, yes: bool) -> &mut Config { - self.anchored = yes; - self - } - /// The match semantics to use for determinization. /// /// MatchKind::All corresponds to the standard textbook construction. @@ -222,20 +212,21 @@ impl<'a> Runner<'a> { /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. - fn run(mut self) -> Result<(), Error> { - if self.nfa.has_word_boundary_unicode() + fn run(mut self) -> Result<(), BuildError> { + if self.nfa.look_set_any().contains_word_unicode() && !self.config.quit.contains_range(0x80, 0xFF) { - return Err(Error::unsupported_dfa_word_boundary_unicode()); + return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. This allows us to avoid re-computing state // transitions for bytes that are guaranteed to produce identical - // results. + // results. Since computing the representatives needs to do a little + // work, we do it once here because we'll be iterating over them a lot. let representatives: Vec<alphabet::Unit> = - self.dfa.byte_classes().representatives().collect(); + self.dfa.byte_classes().representatives(..).collect(); // The set of all DFA state IDs that still need to have their // transitions set. We start by seeding this with all starting states. let mut uncompiled = alloc::vec![]; @@ -259,10 +250,13 @@ impl<'a> Runner<'a> { } } } - trace!( - "determinization complete, memory usage: {}, dense DFA size: {}", + debug!( + "determinization complete, memory usage: {}, \ + dense DFA size: {}, \ + is reverse? {}", self.memory_usage(), self.dfa.memory_usage(), + self.nfa.is_reverse(), ); // A map from DFA state ID to one or more NFA match IDs. Each NFA match @@ -270,21 +264,23 @@ impl<'a> Runner<'a> { // corresponding to the key. 
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); self.cache.clear(); - #[allow(unused_variables)] - let mut total_pat_count = 0; + #[cfg(feature = "logging")] + let mut total_pat_len = 0; for (i, state) in self.builder_states.into_iter().enumerate() { if let Some(pat_ids) = state.match_pattern_ids() { - let id = self.dfa.from_index(i); - total_pat_count += pat_ids.len(); + let id = self.dfa.to_state_id(i); + log! { + total_pat_len += pat_ids.len(); + } matches.insert(id, pat_ids); } } log! { use core::mem::size_of; let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); - let pats = total_pat_count * size_of::<PatternID>(); + let pats = total_pat_len * size_of::<PatternID>(); let mem = (matches.len() * per_elem) + pats; - log::trace!("matches map built, memory usage: {}", mem); + log::debug!("matches map built, memory usage: {}", mem); } // At this point, we shuffle the "special" states in the final DFA. // This permits a DFA's match loop to detect a match condition (among @@ -306,7 +302,7 @@ impl<'a> Runner<'a> { &mut self, dfa_id: StateID, unit: alphabet::Unit, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the set of all reachable NFA states, including epsilons. let empty_builder = self.get_state_builder(); let builder = util::determinize::next( @@ -326,15 +322,32 @@ impl<'a> Runner<'a> { fn add_all_starts( &mut self, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - // Always add the (possibly unanchored) start states for matching any - // of the patterns in this DFA. - self.add_start_group(None, dfa_state_ids)?; + ) -> Result<(), BuildError> { + // These should be the first states added. + assert!(dfa_state_ids.is_empty()); + // We only want to add (un)anchored starting states that is consistent + // with our DFA's configuration. Unconditionally adding both (although + // it is the default) can make DFAs quite a bit bigger. 
+ if self.dfa.start_kind().has_unanchored() { + self.add_start_group(Anchored::No, dfa_state_ids)?; + } + if self.dfa.start_kind().has_anchored() { + self.add_start_group(Anchored::Yes, dfa_state_ids)?; + } + // I previously has an 'assert' here checking that either + // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it + // turns out this isn't always true. For example, the NFA might have + // one or more patterns but where all such patterns are just 'fail' + // states. These will ultimately just compile down to DFA dead states, + // and since the dead state was added earlier, no new DFA states are + // added. And thus, it is valid and okay for 'dfa_state_ids' to be + // empty even if there are a non-zero number of patterns in the NFA. + // We only need to compute anchored start states for each pattern if it // was requested to do so. - if self.dfa.has_starts_for_each_pattern() { - for pid in PatternID::iter(self.dfa.pattern_count()) { - self.add_start_group(Some(pid), dfa_state_ids)?; + if self.dfa.starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; } } Ok(()) @@ -348,15 +361,19 @@ impl<'a> Runner<'a> { /// start states (if the DFA is unanchored). When the pattern_id is /// present, then this will compile a group of anchored start states that /// only match the given pattern. + /// + /// This panics if `anchored` corresponds to an invalid pattern ID. 
fn add_start_group( &mut self, - pattern_id: Option<PatternID>, + anchored: Anchored, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - let nfa_start = match pattern_id { - Some(pid) => self.nfa.start_pattern(pid), - None if self.config.anchored => self.nfa.start_anchored(), - None => self.nfa.start_unanchored(), + ) -> Result<(), BuildError> { + let nfa_start = match anchored { + Anchored::No => self.nfa.start_unanchored(), + Anchored::Yes => self.nfa.start_anchored(), + Anchored::Pattern(pid) => { + self.nfa.start_pattern(pid).expect("valid pattern ID") + } }; // When compiling start states, we're careful not to build additional @@ -365,36 +382,68 @@ impl<'a> Runner<'a> { // states for 'NonWordByte' and 'WordByte' starting configurations. // Instead, the 'WordByte' starting configuration can just point // directly to the start state for the 'NonWordByte' config. + // + // Note though that we only need to care about assertions in the prefix + // of an NFA since this only concerns the starting states. (Actually, + // the most precisely thing we could do it is look at the prefix + // assertions of each pattern when 'anchored == Anchored::Pattern', + // and then only compile extra states if the prefix is non-empty.) But + // we settle for simplicity here instead of absolute minimalism. It is + // somewhat rare, after all, for multiple patterns in the same regex to + // have different prefix look-arounds. 
let (id, is_new) = self.add_one_start(nfa_start, Start::NonWordByte)?; - self.dfa.set_start_state(Start::NonWordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::NonWordByte, id); if is_new { dfa_state_ids.push(id); } - if !self.nfa.has_word_boundary() { - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_word() { + self.dfa.set_start_state(anchored, Start::WordByte, id); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::WordByte)?; - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::WordByte, id); if is_new { dfa_state_ids.push(id); } } - if !self.nfa.has_any_anchor() { - self.dfa.set_start_state(Start::Text, pattern_id, id); - self.dfa.set_start_state(Start::Line, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_anchor() { + self.dfa.set_start_state(anchored, Start::Text, id); + self.dfa.set_start_state(anchored, Start::LineLF, id); + self.dfa.set_start_state(anchored, Start::LineCR, id); + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; - self.dfa.set_start_state(Start::Text, pattern_id, id); + self.dfa.set_start_state(anchored, Start::Text, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; + self.dfa.set_start_state(anchored, Start::LineLF, id); if is_new { dfa_state_ids.push(id); } - let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?; - self.dfa.set_start_state(Start::Line, pattern_id, id); + let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; + self.dfa.set_start_state(anchored, Start::LineCR, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = + self.add_one_start(nfa_start, Start::CustomLineTerminator)?; + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); if 
is_new { dfa_state_ids.push(id); } @@ -414,13 +463,14 @@ impl<'a> Runner<'a> { &mut self, nfa_start: StateID, start: Start, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the look-behind assertions that are true in this starting // configuration, and the determine the epsilon closure. While // computing the epsilon closure, we only follow condiional epsilon - // transitions that satisfy the look-behind assertions in 'facts'. + // transitions that satisfy the look-behind assertions in 'look_have'. let mut builder_matches = self.get_state_builder().into_matches(); util::determinize::set_lookbehind_from_start( + self.nfa, &start, &mut builder_matches, ); @@ -428,7 +478,7 @@ impl<'a> Runner<'a> { util::determinize::epsilon_closure( self.nfa, nfa_start, - *builder_matches.look_have(), + builder_matches.look_have(), &mut self.stack, &mut self.sparses.set1, ); @@ -455,7 +505,7 @@ impl<'a> Runner<'a> { fn maybe_add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. @@ -476,7 +526,7 @@ impl<'a> Runner<'a> { fn add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<StateID, Error> { + ) -> Result<StateID, BuildError> { let id = self.dfa.add_empty_state()?; if !self.config.quit.is_empty() { for b in self.config.quit.iter() { @@ -489,19 +539,21 @@ impl<'a> Runner<'a> { } let state = builder.to_state(); // States use reference counting internally, so we only need to count - // their memroy usage once. + // their memory usage once. 
self.memory_usage_state += state.memory_usage(); self.builder_states.push(state.clone()); self.cache.insert(state, id); self.put_state_builder(builder); if let Some(limit) = self.config.dfa_size_limit { if self.dfa.memory_usage() > limit { - return Err(Error::dfa_exceeded_size_limit(limit)); + return Err(BuildError::dfa_exceeded_size_limit(limit)); } } if let Some(limit) = self.config.determinize_size_limit { if self.memory_usage() > limit { - return Err(Error::determinize_exceeded_size_limit(limit)); + return Err(BuildError::determinize_exceeded_size_limit( + limit, + )); } } Ok(id) |