diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa')
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/backtrack.rs | 28 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/builder.rs | 6 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/compiler.rs | 10 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/map.rs | 2 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/nfa.rs | 6 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/range_trie.rs | 2 |
6 files changed, 42 insertions, 12 deletions
diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs index eba037c1d..df99e456d 100644 --- a/vendor/regex-automata/src/nfa/thompson/backtrack.rs +++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs @@ -820,8 +820,11 @@ impl BoundedBacktracker { // bytes to the capacity in bits. let capacity = 8 * self.get_config().get_visited_capacity(); let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); - let real_capacity = blocks * Visited::BLOCK_SIZE; - (real_capacity / self.nfa.states().len()) - 1 + let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE); + // It's possible for `real_capacity` to be smaller than the number of + // NFA states for particularly large regexes, so we saturate towards + // zero. + (real_capacity / self.nfa.states().len()).saturating_sub(1) } } @@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize { (lhs / rhs) + 1 } } + +#[cfg(test)] +mod tests { + use super::*; + + // This is a regression test for the maximum haystack length computation. + // Previously, it assumed that the total capacity of the backtracker's + // bitset would always be greater than the number of NFA states. But there + // is of course no guarantee that this is true. This regression test + // ensures that not only does `max_haystack_len` not panic, but that it + // should return `0`. + #[cfg(feature = "syntax")] + #[test] + fn max_haystack_len_overflow() { + let re = BoundedBacktracker::builder() + .configure(BoundedBacktracker::config().visited_capacity(10)) + .build(r"[0-9A-Za-z]{100}") + .unwrap(); + assert_eq!(0, re.max_haystack_len()); + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs index b57e5bc0f..6b69e8784 100644 --- a/vendor/regex-automata/src/nfa/thompson/builder.rs +++ b/vendor/regex-automata/src/nfa/thompson/builder.rs @@ -61,7 +61,7 @@ enum State { Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. @@ -77,7 +77,7 @@ enum State { }, /// An empty state that records the end of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. @@ -128,7 +128,7 @@ enum State { } impl State { - /// If this state is an unconditional espilon transition, then this returns + /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. fn goto(&self) -> Option<StateID> { match *self { diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef27..2d2172957 100644 --- a/vendor/regex-automata/src/nfa/thompson/compiler.rs +++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs index c36ce5386..c92d4c0b8 100644 --- a/vendor/regex-automata/src/nfa/thompson/map.rs +++ b/vendor/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs index 2108fa338..1f57f8ebd 100644 --- a/vendor/regex-automata/src/nfa/thompson/nfa.rs +++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f..75c9b796b 100644 --- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs +++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } |