summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/nfa/thompson
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/nfa/thompson')
-rw-r--r--vendor/regex-automata/src/nfa/thompson/backtrack.rs28
-rw-r--r--vendor/regex-automata/src/nfa/thompson/builder.rs6
-rw-r--r--vendor/regex-automata/src/nfa/thompson/compiler.rs10
-rw-r--r--vendor/regex-automata/src/nfa/thompson/map.rs2
-rw-r--r--vendor/regex-automata/src/nfa/thompson/nfa.rs6
-rw-r--r--vendor/regex-automata/src/nfa/thompson/range_trie.rs2
6 files changed, 42 insertions, 12 deletions
diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
index eba037c1d..df99e456d 100644
--- a/vendor/regex-automata/src/nfa/thompson/backtrack.rs
+++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
@@ -820,8 +820,11 @@ impl BoundedBacktracker {
// bytes to the capacity in bits.
let capacity = 8 * self.get_config().get_visited_capacity();
let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
- let real_capacity = blocks * Visited::BLOCK_SIZE;
- (real_capacity / self.nfa.states().len()) - 1
+ let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
+ // It's possible for `real_capacity` to be smaller than the number of
+ // NFA states for particularly large regexes, so we saturate towards
+ // zero.
+ (real_capacity / self.nfa.states().len()).saturating_sub(1)
}
}
@@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize {
(lhs / rhs) + 1
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // This is a regression test for the maximum haystack length computation.
+ // Previously, it assumed that the total capacity of the backtracker's
+ // bitset would always be greater than the number of NFA states. But there
+ // is of course no guarantee that this is true. This regression test
+ // ensures that not only does `max_haystack_len` not panic, but that it
+ // should return `0`.
+ #[cfg(feature = "syntax")]
+ #[test]
+ fn max_haystack_len_overflow() {
+ let re = BoundedBacktracker::builder()
+ .configure(BoundedBacktracker::config().visited_capacity(10))
+ .build(r"[0-9A-Za-z]{100}")
+ .unwrap();
+ assert_eq!(0, re.max_haystack_len());
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs
index b57e5bc0f..6b69e8784 100644
--- a/vendor/regex-automata/src/nfa/thompson/builder.rs
+++ b/vendor/regex-automata/src/nfa/thompson/builder.rs
@@ -61,7 +61,7 @@ enum State {
Look { look: Look, next: StateID },
/// An empty state that records the start of a capture location. This is an
/// unconditional epsilon transition like `Empty`, except it can be used to
- /// record position information for a captue group when using the NFA for
+ /// record position information for a capture group when using the NFA for
/// search.
CaptureStart {
/// The ID of the pattern that this capture was defined.
@@ -77,7 +77,7 @@ enum State {
},
/// An empty state that records the end of a capture location. This is an
/// unconditional epsilon transition like `Empty`, except it can be used to
- /// record position information for a captue group when using the NFA for
+ /// record position information for a capture group when using the NFA for
/// search.
CaptureEnd {
/// The ID of the pattern that this capture was defined.
@@ -128,7 +128,7 @@ enum State {
}
impl State {
- /// If this state is an unconditional espilon transition, then this returns
+ /// If this state is an unconditional epsilon transition, then this returns
/// the target of the transition.
fn goto(&self) -> Option<StateID> {
match *self {
diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs
index 065e9ef27..2d2172957 100644
--- a/vendor/regex-automata/src/nfa/thompson/compiler.rs
+++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs
@@ -1466,7 +1466,7 @@ impl Compiler {
// compare and contrast performance of the Pike VM when the code below
// is active vs the code above. Here's an example to try:
//
- // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+ // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
//
// With Unicode classes generated below, this search takes about 45s on
// my machine. But with the compressed version above, the search takes
@@ -1557,6 +1557,14 @@ impl Compiler {
hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
hir::Look::WordUnicode => Look::WordUnicode,
hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ hir::Look::WordStartAscii => Look::WordStartAscii,
+ hir::Look::WordEndAscii => Look::WordEndAscii,
+ hir::Look::WordStartUnicode => Look::WordStartUnicode,
+ hir::Look::WordEndUnicode => Look::WordEndUnicode,
+ hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
+ hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
+ hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
+ hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
};
let id = self.add_look(look)?;
Ok(ThompsonRef { start: id, end: id })
diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs
index c36ce5386..c92d4c0b8 100644
--- a/vendor/regex-automata/src/nfa/thompson/map.rs
+++ b/vendor/regex-automata/src/nfa/thompson/map.rs
@@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037;
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
-/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs
index 2108fa338..1f57f8ebd 100644
--- a/vendor/regex-automata/src/nfa/thompson/nfa.rs
+++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs
@@ -1841,14 +1841,12 @@ impl SparseTransitions {
// This is an alternative implementation that uses binary search. In
// some ad hoc experiments, like
//
- // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
- // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+ // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
//
// I could not observe any improvement, and in fact, things seemed to
// be a bit slower. I can see an improvement in at least one benchmark:
//
- // allcpssmall=all-codepoints-utf8-10x
- // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+ // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
//
// Where total search time goes from 3.2s to 2.4s when using binary
// search.
diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
index 2d43a5b6f..75c9b796b 100644
--- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs
+++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
@@ -594,7 +594,7 @@ impl State {
// Benchmarks suggest that binary search is just a bit faster than
// straight linear search. Specifically when using the debug tool:
//
- // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+ // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
binary_search(&self.transitions, |t| range.start <= t.range.end)
}