diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/dfa/determinize.rs | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/dfa/determinize.rs')
-rw-r--r-- | vendor/regex-automata/src/dfa/determinize.rs | 182 |
1 file changed, 117 insertions, 65 deletions
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs index 61603481b..19f99f5d6 100644 --- a/vendor/regex-automata/src/dfa/determinize.rs +++ b/vendor/regex-automata/src/dfa/determinize.rs @@ -1,18 +1,18 @@ -use alloc::{ - collections::BTreeMap, - vec::{self, Vec}, -}; +use alloc::{collections::BTreeMap, vec::Vec}; use crate::{ - dfa::{dense, Error, DEAD}, + dfa::{ + dense::{self, BuildError}, + DEAD, + }, nfa::thompson, util::{ self, alphabet::{self, ByteSet}, determinize::{State, StateBuilderEmpty, StateBuilderNFA}, - id::{PatternID, StateID}, - matchtypes::MatchKind, - sparse_set::{SparseSet, SparseSets}, + primitives::{PatternID, StateID}, + search::{Anchored, MatchKind}, + sparse_set::SparseSets, start::Start, }, }; @@ -20,7 +20,6 @@ use crate::{ /// A builder for configuring and running a DFA determinizer. #[derive(Clone, Debug)] pub(crate) struct Config { - anchored: bool, match_kind: MatchKind, quit: ByteSet, dfa_size_limit: Option<usize>, @@ -32,7 +31,6 @@ impl Config { /// configured before calling `run`. pub fn new() -> Config { Config { - anchored: false, match_kind: MatchKind::LeftmostFirst, quit: ByteSet::empty(), dfa_size_limit: None, @@ -48,7 +46,7 @@ impl Config { &self, nfa: &thompson::NFA, dfa: &mut dense::OwnedDFA, - ) -> Result<(), Error> { + ) -> Result<(), BuildError> { let dead = State::dead(); let quit = State::dead(); let mut cache = StateMap::default(); @@ -71,21 +69,13 @@ impl Config { builder_states: alloc::vec![dead, quit], cache, memory_usage_state: 0, - sparses: SparseSets::new(nfa.len()), + sparses: SparseSets::new(nfa.states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), }; runner.run() } - /// Whether to build an anchored DFA or not. When disabled (the default), - /// the unanchored prefix from the NFA is used to start the DFA. Otherwise, - /// the anchored start state of the NFA is used to start the DFA. 
- pub fn anchored(&mut self, yes: bool) -> &mut Config { - self.anchored = yes; - self - } - /// The match semantics to use for determinization. /// /// MatchKind::All corresponds to the standard textbook construction. @@ -222,20 +212,21 @@ impl<'a> Runner<'a> { /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. - fn run(mut self) -> Result<(), Error> { - if self.nfa.has_word_boundary_unicode() + fn run(mut self) -> Result<(), BuildError> { + if self.nfa.look_set_any().contains_word_unicode() && !self.config.quit.contains_range(0x80, 0xFF) { - return Err(Error::unsupported_dfa_word_boundary_unicode()); + return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. This allows us to avoid re-computing state // transitions for bytes that are guaranteed to produce identical - // results. + // results. Since computing the representatives needs to do a little + // work, we do it once here because we'll be iterating over them a lot. let representatives: Vec<alphabet::Unit> = - self.dfa.byte_classes().representatives().collect(); + self.dfa.byte_classes().representatives(..).collect(); // The set of all DFA state IDs that still need to have their // transitions set. We start by seeding this with all starting states. let mut uncompiled = alloc::vec![]; @@ -259,10 +250,13 @@ impl<'a> Runner<'a> { } } } - trace!( - "determinization complete, memory usage: {}, dense DFA size: {}", + debug!( + "determinization complete, memory usage: {}, \ + dense DFA size: {}, \ + is reverse? {}", self.memory_usage(), self.dfa.memory_usage(), + self.nfa.is_reverse(), ); // A map from DFA state ID to one or more NFA match IDs. Each NFA match @@ -270,21 +264,23 @@ impl<'a> Runner<'a> { // corresponding to the key. 
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); self.cache.clear(); - #[allow(unused_variables)] - let mut total_pat_count = 0; + #[cfg(feature = "logging")] + let mut total_pat_len = 0; for (i, state) in self.builder_states.into_iter().enumerate() { if let Some(pat_ids) = state.match_pattern_ids() { - let id = self.dfa.from_index(i); - total_pat_count += pat_ids.len(); + let id = self.dfa.to_state_id(i); + log! { + total_pat_len += pat_ids.len(); + } matches.insert(id, pat_ids); } } log! { use core::mem::size_of; let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); - let pats = total_pat_count * size_of::<PatternID>(); + let pats = total_pat_len * size_of::<PatternID>(); let mem = (matches.len() * per_elem) + pats; - log::trace!("matches map built, memory usage: {}", mem); + log::debug!("matches map built, memory usage: {}", mem); } // At this point, we shuffle the "special" states in the final DFA. // This permits a DFA's match loop to detect a match condition (among @@ -306,7 +302,7 @@ impl<'a> Runner<'a> { &mut self, dfa_id: StateID, unit: alphabet::Unit, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the set of all reachable NFA states, including epsilons. let empty_builder = self.get_state_builder(); let builder = util::determinize::next( @@ -326,15 +322,32 @@ impl<'a> Runner<'a> { fn add_all_starts( &mut self, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - // Always add the (possibly unanchored) start states for matching any - // of the patterns in this DFA. - self.add_start_group(None, dfa_state_ids)?; + ) -> Result<(), BuildError> { + // These should be the first states added. + assert!(dfa_state_ids.is_empty()); + // We only want to add (un)anchored starting states that is consistent + // with our DFA's configuration. Unconditionally adding both (although + // it is the default) can make DFAs quite a bit bigger. 
+ if self.dfa.start_kind().has_unanchored() { + self.add_start_group(Anchored::No, dfa_state_ids)?; + } + if self.dfa.start_kind().has_anchored() { + self.add_start_group(Anchored::Yes, dfa_state_ids)?; + } + // I previously has an 'assert' here checking that either + // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it + // turns out this isn't always true. For example, the NFA might have + // one or more patterns but where all such patterns are just 'fail' + // states. These will ultimately just compile down to DFA dead states, + // and since the dead state was added earlier, no new DFA states are + // added. And thus, it is valid and okay for 'dfa_state_ids' to be + // empty even if there are a non-zero number of patterns in the NFA. + // We only need to compute anchored start states for each pattern if it // was requested to do so. - if self.dfa.has_starts_for_each_pattern() { - for pid in PatternID::iter(self.dfa.pattern_count()) { - self.add_start_group(Some(pid), dfa_state_ids)?; + if self.dfa.starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; } } Ok(()) @@ -348,15 +361,19 @@ impl<'a> Runner<'a> { /// start states (if the DFA is unanchored). When the pattern_id is /// present, then this will compile a group of anchored start states that /// only match the given pattern. + /// + /// This panics if `anchored` corresponds to an invalid pattern ID. 
fn add_start_group( &mut self, - pattern_id: Option<PatternID>, + anchored: Anchored, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - let nfa_start = match pattern_id { - Some(pid) => self.nfa.start_pattern(pid), - None if self.config.anchored => self.nfa.start_anchored(), - None => self.nfa.start_unanchored(), + ) -> Result<(), BuildError> { + let nfa_start = match anchored { + Anchored::No => self.nfa.start_unanchored(), + Anchored::Yes => self.nfa.start_anchored(), + Anchored::Pattern(pid) => { + self.nfa.start_pattern(pid).expect("valid pattern ID") + } }; // When compiling start states, we're careful not to build additional @@ -365,36 +382,68 @@ impl<'a> Runner<'a> { // states for 'NonWordByte' and 'WordByte' starting configurations. // Instead, the 'WordByte' starting configuration can just point // directly to the start state for the 'NonWordByte' config. + // + // Note though that we only need to care about assertions in the prefix + // of an NFA since this only concerns the starting states. (Actually, + // the most precisely thing we could do it is look at the prefix + // assertions of each pattern when 'anchored == Anchored::Pattern', + // and then only compile extra states if the prefix is non-empty.) But + // we settle for simplicity here instead of absolute minimalism. It is + // somewhat rare, after all, for multiple patterns in the same regex to + // have different prefix look-arounds. 
let (id, is_new) = self.add_one_start(nfa_start, Start::NonWordByte)?; - self.dfa.set_start_state(Start::NonWordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::NonWordByte, id); if is_new { dfa_state_ids.push(id); } - if !self.nfa.has_word_boundary() { - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_word() { + self.dfa.set_start_state(anchored, Start::WordByte, id); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::WordByte)?; - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::WordByte, id); if is_new { dfa_state_ids.push(id); } } - if !self.nfa.has_any_anchor() { - self.dfa.set_start_state(Start::Text, pattern_id, id); - self.dfa.set_start_state(Start::Line, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_anchor() { + self.dfa.set_start_state(anchored, Start::Text, id); + self.dfa.set_start_state(anchored, Start::LineLF, id); + self.dfa.set_start_state(anchored, Start::LineCR, id); + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; - self.dfa.set_start_state(Start::Text, pattern_id, id); + self.dfa.set_start_state(anchored, Start::Text, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; + self.dfa.set_start_state(anchored, Start::LineLF, id); if is_new { dfa_state_ids.push(id); } - let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?; - self.dfa.set_start_state(Start::Line, pattern_id, id); + let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; + self.dfa.set_start_state(anchored, Start::LineCR, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = + self.add_one_start(nfa_start, Start::CustomLineTerminator)?; + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); if 
is_new { dfa_state_ids.push(id); } @@ -414,13 +463,14 @@ impl<'a> Runner<'a> { &mut self, nfa_start: StateID, start: Start, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the look-behind assertions that are true in this starting // configuration, and the determine the epsilon closure. While // computing the epsilon closure, we only follow condiional epsilon - // transitions that satisfy the look-behind assertions in 'facts'. + // transitions that satisfy the look-behind assertions in 'look_have'. let mut builder_matches = self.get_state_builder().into_matches(); util::determinize::set_lookbehind_from_start( + self.nfa, &start, &mut builder_matches, ); @@ -428,7 +478,7 @@ impl<'a> Runner<'a> { util::determinize::epsilon_closure( self.nfa, nfa_start, - *builder_matches.look_have(), + builder_matches.look_have(), &mut self.stack, &mut self.sparses.set1, ); @@ -455,7 +505,7 @@ impl<'a> Runner<'a> { fn maybe_add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. @@ -476,7 +526,7 @@ impl<'a> Runner<'a> { fn add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<StateID, Error> { + ) -> Result<StateID, BuildError> { let id = self.dfa.add_empty_state()?; if !self.config.quit.is_empty() { for b in self.config.quit.iter() { @@ -489,19 +539,21 @@ impl<'a> Runner<'a> { } let state = builder.to_state(); // States use reference counting internally, so we only need to count - // their memroy usage once. + // their memory usage once. 
self.memory_usage_state += state.memory_usage(); self.builder_states.push(state.clone()); self.cache.insert(state, id); self.put_state_builder(builder); if let Some(limit) = self.config.dfa_size_limit { if self.dfa.memory_usage() > limit { - return Err(Error::dfa_exceeded_size_limit(limit)); + return Err(BuildError::dfa_exceeded_size_limit(limit)); } } if let Some(limit) = self.config.determinize_size_limit { if self.memory_usage() > limit { - return Err(Error::determinize_exceeded_size_limit(limit)); + return Err(BuildError::determinize_exceeded_size_limit( + limit, + )); } } Ok(id) |