Merging upstream version 1.76.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-19 09:26:03 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-06-19 09:26:03 +0000
commit: 9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree: 21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax/src/hir/literal.rs
parent: Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download: rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
1 files changed, 108 insertions, 59 deletions
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
index bd3a2d143..a5a3737f6 100644
--- a/vendor/regex-syntax/src/hir/literal.rs
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -23,7 +23,7 @@ effective literal optimizations:
 to lead to substring search that is only a little faster than a regex search,
 and thus the overhead of using literal optimizations in the first place might
 make things slower overall.
-* The literals in your [`Seq`] shoudn't be too short. In general, longer is
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
 better. A sequence corresponding to single bytes that occur frequently in the
 haystack, for example, is probably a bad literal optimization because it's
 likely to produce many false positive candidates. Longer literals are less
@@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all
 implemented on [`Seq`].
 */
 
-use core::{cmp, mem};
+use core::{cmp, mem, num::NonZeroUsize};
 
 use alloc::{vec, vec::Vec};
 
@@ -477,7 +477,7 @@ impl Extractor {
                 }
                 seq
             }
-            hir::Repetition { min, max: Some(max), .. } if min < max => {
+            hir::Repetition { min, .. } => {
                 assert!(min > 0); // handled above
                 let limit =
                     u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
@@ -491,10 +491,6 @@ impl Extractor {
                 seq.make_inexact();
                 seq
             }
-            hir::Repetition { .. } => {
-                subseq.make_inexact();
-                subseq
-            }
         }
     }
 
@@ -692,7 +688,7 @@ impl Default for ExtractKind {
 /// from making assumptions about what literals are required in order to match
 /// a particular [`Hir`] expression. Generally speaking, when a set is in this
 /// state, literal optimizations are inhibited. A good example of a regex that
-/// will cause this sort of set to apppear is `[A-Za-z]`. The character class
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
 /// is just too big (and also too narrow) to be usefully expanded into 52
 /// different literals. (Note that the decision for when a seq should become
 /// infinite is determined by the caller. A seq itself has no hard-coded
@@ -1571,7 +1567,7 @@ impl Seq {
     /// unioning `self` with `other`. If either set is infinite, then this
     /// returns `None`.
     #[inline]
-    fn max_union_len(&self, other: &Seq) -> Option<usize> {
+    pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
         let len1 = self.len()?;
         let len2 = other.len()?;
         Some(len1.saturating_add(len2))
@@ -1581,7 +1577,7 @@ impl Seq {
     /// cross product of `self` with `other`. If either set is infinite, then
     /// this returns `None`.
     #[inline]
-    fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+    pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
         let len1 = self.len()?;
         let len2 = other.len()?;
         Some(len1.saturating_mul(len2))
@@ -1841,6 +1837,14 @@ impl Seq {
             None => return,
             Some(len) => len,
         };
+        // Just give up now if our sequence contains an empty string.
+        if self.min_literal_len().map_or(false, |len| len == 0) {
+            // We squash the sequence so that nobody else gets any bright
+            // ideas to try and use it. An empty string implies a match at
+            // every position. A prefilter cannot help you here.
+            self.make_infinite();
+            return;
+        }
         // Make sure we start with the smallest sequence possible. We use a
         // special version of preference minimization that retains exactness.
         // This is legal because optimization is only expected to occur once
@@ -1910,34 +1914,41 @@ impl Seq {
                 // longest common prefix to be subject to the poison check.
             }
         }
-        // Everything below this check is more-or-less about trying to
-        // heuristically reduce the false positive rate of a prefilter. But
-        // if our sequence is completely exact, then it's possible the regex
-        // engine can be skipped entirely. In this case, the false positive
-        // rate is zero because every literal match corresponds to a regex
-        // match.
+        // If we have an exact sequence, we *probably* just want to keep it
+        // as-is. But there are some cases where we don't. So we save a copy of
+        // the exact sequence now, and then try to do some more optimizations
+        // below. If those don't work out, we go back to this exact sequence.
         //
-        // This is OK even if the sequence contains a poison literal. Remember,
-        // a literal is only poisononous because of what we assume about its
-        // impact on the false positive rate. However, we do still check for
-        // an empty string. Empty strings are weird and it's best to let the
-        // regex engine handle those.
+        // The specific motivation for this is that we sometimes wind up with
+        // an exact sequence with a hefty number of literals. Say, 100. If we
+        // stuck with that, it would be too big for Teddy and would result in
+        // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
+        // suitable in such cases. The real issue is that we will wind up not
+        // using a fast prefilter at all. So in cases like this, even though
+        // we have an exact sequence, it would be better to try and shrink the
+        // sequence (which we do below) and use it as a prefilter that can
+        // produce false positive matches.
         //
-        // We do currently do this check after the longest common prefix (or
-        // suffix) check, under the theory that single-substring search is so
-        // fast that we want that even if we'd end up turning an exact sequence
-        // into an inexact one. But this might be wrong...
-        if self.is_exact()
-            && self.min_literal_len().map_or(false, |len| len > 0)
-        {
-            return;
-        }
+        // But if the shrinking below results in a sequence that "sucks," then
+        // we don't want to use that because we already have an exact sequence
+        // in hand.
+        let exact: Option<Seq> =
+            if self.is_exact() { Some(self.clone()) } else { None };
         // Now we attempt to shorten the sequence. The idea here is that we
         // don't want to look for too many literals, but we want to shorten
         // our sequence enough to improve our odds of using better algorithms
         // downstream (such as Teddy).
+        //
+        // The pair of numbers in this list corresponds to the maximal prefix
+        // (in bytes) to keep for all literals and the length of the sequence
+        // at which to do it.
+        //
+        // So for example, the pair (3, 500) would mean, "if we have more than
+        // 500 literals in our sequence, then truncate all of our literals
+        // such that they are at most 3 bytes in length and the minimize the
+        // sequence."
         const ATTEMPTS: [(usize, usize); 5] =
-            [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+            [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
         for (keep, limit) in ATTEMPTS {
             let len = match self.len() {
                 None => break,
@@ -1951,7 +1962,11 @@ impl Seq {
             } else {
                 self.keep_last_bytes(keep);
             }
-            self.minimize_by_preference();
+            if prefix {
+                if let Some(ref mut lits) = self.literals {
+                    PreferenceTrie::minimize(lits, true);
+                }
+            }
         }
         // Check for a poison literal. A poison literal is one that is short
         // and is believed to have a very high match count. These poisons
@@ -1968,6 +1983,30 @@ impl Seq {
                 self.make_infinite();
             }
         }
+        // OK, if we had an exact sequence before attempting more optimizations
+        // above and our post-optimized sequence sucks for some reason or
+        // another, then we go back to the exact sequence.
+        if let Some(exact) = exact {
+            // If optimizing resulted in dropping our literals, then certainly
+            // backup and use the exact sequence that we had.
+            if !self.is_finite() {
+                *self = exact;
+                return;
+            }
+            // If our optimized sequence contains a short literal, then it's
+            // *probably* not so great. So throw it away and revert to the
+            // exact sequence.
+            if self.min_literal_len().map_or(true, |len| len <= 2) {
+                *self = exact;
+                return;
+            }
+            // Finally, if our optimized sequence is "big" (i.e., can't use
+            // Teddy), then also don't use it and rely on the exact sequence.
+            if self.len().map_or(true, |len| len > 64) {
+                *self = exact;
+                return;
+            }
+        }
     }
 }
 
@@ -1977,7 +2016,7 @@ impl core::fmt::Debug for Seq {
         if let Some(lits) = self.literals() {
             f.debug_list().entries(lits.iter()).finish()
         } else {
-            write!(f, "[∅]")
+            write!(f, "[∞]")
         }
     }
 }
@@ -2160,12 +2199,19 @@ impl core::fmt::Debug for Literal {
 /// never seen this show up on a profile. Because of the heuristic limits
 /// imposed on literal extractions, the size of the inputs here is usually
 /// very small.)
-#[derive(Debug, Default)]
+#[derive(Debug)]
 struct PreferenceTrie {
     /// The states in this trie. The index of a state in this vector is its ID.
     states: Vec<State>,
+    /// This vec indicates which states are match states. It always has
+    /// the same length as `states` and is indexed by the same state ID.
+    /// A state with identifier `sid` is a match state if and only if
+    /// `matches[sid].is_some()`. The option contains the index of the literal
+    /// corresponding to the match. The index is offset by 1 so that it fits in
+    /// a NonZeroUsize.
+    matches: Vec<Option<NonZeroUsize>>,
     /// The index to allocate to the next literal added to this trie. Starts at
-    /// 0 and increments by 1 for every literal successfully added to the trie.
+    /// 1 and increments by 1 for every literal successfully added to the trie.
     next_literal_index: usize,
 }
 
@@ -2176,9 +2222,6 @@ struct State {
     /// are sorted by byte. There is at most one such transition for any
     /// particular byte.
     trans: Vec<(u8, usize)>,
-    /// Whether this is a matching state or not. If it is, then it contains the
-    /// index to the matching literal.
-    literal_index: Option<usize>,
 }
 
 impl PreferenceTrie {
@@ -2192,20 +2235,19 @@ impl PreferenceTrie {
     /// after them and because any removed literals are guaranteed to never
     /// match.
     fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
-        use core::cell::RefCell;
-
-        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
-        let trie = RefCell::new(PreferenceTrie::default());
+        let mut trie = PreferenceTrie {
+            states: vec![],
+            matches: vec![],
+            next_literal_index: 1,
+        };
         let mut make_inexact = vec![];
-        literals.retain(|lit| {
-            match trie.borrow_mut().insert(lit.as_bytes()) {
-                Ok(_) => true,
-                Err(i) => {
-                    if !keep_exact {
-                        make_inexact.push(i);
-                    }
-                    false
+        literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+            Ok(_) => true,
+            Err(i) => {
+                if !keep_exact {
+                    make_inexact.push(i.checked_sub(1).unwrap());
                 }
+                false
             }
         });
         for i in make_inexact {
@@ -2225,15 +2267,15 @@ impl PreferenceTrie {
     /// search.
     fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
         let mut prev = self.root();
-        if let Some(idx) = self.states[prev].literal_index {
-            return Err(idx);
+        if let Some(idx) = self.matches[prev] {
+            return Err(idx.get());
         }
         for &b in bytes.iter() {
             match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
                 Ok(i) => {
                     prev = self.states[prev].trans[i].1;
-                    if let Some(idx) = self.states[prev].literal_index {
-                        return Err(idx);
+                    if let Some(idx) = self.matches[prev] {
+                        return Err(idx.get());
                     }
                 }
                 Err(i) => {
@@ -2245,7 +2287,7 @@ impl PreferenceTrie {
         }
         let idx = self.next_literal_index;
         self.next_literal_index += 1;
-        self.states[prev].literal_index = Some(idx);
+        self.matches[prev] = NonZeroUsize::new(idx);
         Ok(idx)
     }
 
@@ -2262,6 +2304,7 @@ impl PreferenceTrie {
     fn create_state(&mut self) -> usize {
         let id = self.states.len();
         self.states.push(State::default());
+        self.matches.push(None);
         id
     }
 }
@@ -2603,6 +2646,12 @@ mod tests {
             ]),
             e(r"(ab|cd)(ef|gh)(ij|kl)")
         );
+
+        assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
+
+        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
+
+        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
     }
 
     #[test]
@@ -2815,13 +2864,13 @@ mod tests {
     // repeats.
     #[test]
     fn crazy_repeats() {
-        assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
         assert_eq!(
-            inexact([I("")], [I("")]),
+            inexact([E("")], [E("")]),
             e(r"(?:){64}{64}{64}{64}{64}{64}")
         );
-        assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}"));
-        assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
+        assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
 
         assert_eq!(
             inexact([E("")], [E("")]),
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-06-19 09:26:03 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-06-19 09:26:03 +0000
commit	9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree	21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax/src/hir/literal.rs
parent	Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download	rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip