6 files changed, 42 insertions, 12 deletions
diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
index eba037c1d..df99e456d 100644
--- a/vendor/regex-automata/src/nfa/thompson/backtrack.rs
+++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
@@ -820,8 +820,11 @@ impl BoundedBacktracker {
         // bytes to the capacity in bits.
         let capacity = 8 * self.get_config().get_visited_capacity();
         let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
-        let real_capacity = blocks * Visited::BLOCK_SIZE;
-        (real_capacity / self.nfa.states().len()) - 1
+        let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
+        // It's possible for `real_capacity` to be smaller than the number of
+        // NFA states for particularly large regexes, so we saturate towards
+        // zero.
+        (real_capacity / self.nfa.states().len()).saturating_sub(1)
     }
 }
 
@@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize {
         (lhs / rhs) + 1
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // This is a regression test for the maximum haystack length computation.
+    // Previously, it assumed that the total capacity of the backtracker's
+    // bitset would always be greater than the number of NFA states. But there
+    // is of course no guarantee that this is true. This regression test
+    // ensures that not only does `max_haystack_len` not panic, but that it
+    // should return `0`.
+    #[cfg(feature = "syntax")]
+    #[test]
+    fn max_haystack_len_overflow() {
+        let re = BoundedBacktracker::builder()
+            .configure(BoundedBacktracker::config().visited_capacity(10))
+            .build(r"[0-9A-Za-z]{100}")
+            .unwrap();
+        assert_eq!(0, re.max_haystack_len());
+    }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs
index b57e5bc0f..6b69e8784 100644
--- a/vendor/regex-automata/src/nfa/thompson/builder.rs
+++ b/vendor/regex-automata/src/nfa/thompson/builder.rs
@@ -61,7 +61,7 @@ enum State {
     Look { look: Look, next: StateID },
     /// An empty state that records the start of a capture location. This is an
     /// unconditional epsilon transition like `Empty`, except it can be used to
-    /// record position information for a captue group when using the NFA for
+    /// record position information for a capture group when using the NFA for
     /// search.
     CaptureStart {
         /// The ID of the pattern that this capture was defined.
@@ -77,7 +77,7 @@ enum State {
     },
     /// An empty state that records the end of a capture location. This is an
     /// unconditional epsilon transition like `Empty`, except it can be used to
-    /// record position information for a captue group when using the NFA for
+    /// record position information for a capture group when using the NFA for
     /// search.
     CaptureEnd {
         /// The ID of the pattern that this capture was defined.
@@ -128,7 +128,7 @@ enum State {
 }
 
 impl State {
-    /// If this state is an unconditional espilon transition, then this returns
+    /// If this state is an unconditional epsilon transition, then this returns
     /// the target of the transition.
     fn goto(&self) -> Option<StateID> {
         match *self {
diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs
index 065e9ef27..2d2172957 100644
--- a/vendor/regex-automata/src/nfa/thompson/compiler.rs
+++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs
@@ -1466,7 +1466,7 @@ impl Compiler {
         // compare and contrast performance of the Pike VM when the code below
         // is active vs the code above. Here's an example to try:
         //
-        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
         //
         // With Unicode classes generated below, this search takes about 45s on
         // my machine. But with the compressed version above, the search takes
@@ -1557,6 +1557,14 @@ impl Compiler {
             hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
             hir::Look::WordUnicode => Look::WordUnicode,
             hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+            hir::Look::WordStartAscii => Look::WordStartAscii,
+            hir::Look::WordEndAscii => Look::WordEndAscii,
+            hir::Look::WordStartUnicode => Look::WordStartUnicode,
+            hir::Look::WordEndUnicode => Look::WordEndUnicode,
+            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
+            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
+            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
+            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
         };
         let id = self.add_look(look)?;
         Ok(ThompsonRef { start: id, end: id })
diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs
index c36ce5386..c92d4c0b8 100644
--- a/vendor/regex-automata/src/nfa/thompson/map.rs
+++ b/vendor/regex-automata/src/nfa/thompson/map.rs
@@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037;
 /// Specifically, one could observe the difference with std's hashmap via
 /// something like the following benchmark:
 ///
-///   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+///   hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
 ///
 /// But to observe that difference, you'd have to modify the code to use
 /// std's hashmap.
diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs
index 2108fa338..1f57f8ebd 100644
--- a/vendor/regex-automata/src/nfa/thompson/nfa.rs
+++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs
@@ -1841,14 +1841,12 @@ impl SparseTransitions {
         // This is an alternative implementation that uses binary search. In
         // some ad hoc experiments, like
         //
-        //   smallishru=OpenSubtitles2018.raw.sample.smallish.ru
-        //   regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+        //   regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
         //
         // I could not observe any improvement, and in fact, things seemed to
         // be a bit slower. I can see an improvement in at least one benchmark:
         //
-        //   allcpssmall=all-codepoints-utf8-10x
-        //   regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+        //   regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
         //
         // Where total search time goes from 3.2s to 2.4s when using binary
         // search.
diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
index 2d43a5b6f..75c9b796b 100644
--- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs
+++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
@@ -594,7 +594,7 @@ impl State {
         // Benchmarks suggest that binary search is just a bit faster than
         // straight linear search. Specifically when using the debug tool:
         //
-        //   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+        //   hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
         binary_search(&self.transitions, |t| range.start <= t.range.end)
     }