diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/hybrid/search.rs | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/hybrid/search.rs')
-rw-r--r-- | vendor/regex-automata/src/hybrid/search.rs | 1101 |
1 files changed, 620 insertions, 481 deletions
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs index 92760cee2..f23283685 100644 --- a/vendor/regex-automata/src/hybrid/search.rs +++ b/vendor/regex-automata/src/hybrid/search.rs @@ -1,663 +1,802 @@ use crate::{ hybrid::{ - dfa::{Cache, DFA}, - id::{LazyStateID, OverlappingState, StateMatch}, + dfa::{Cache, OverlappingState, DFA}, + id::LazyStateID, }, - nfa::thompson, util::{ - id::PatternID, - matchtypes::{HalfMatch, MatchError}, - prefilter, MATCH_OFFSET, + prefilter::Prefilter, + search::{HalfMatch, Input, MatchError, Span}, }, }; #[inline(never)] -pub(crate) fn find_earliest_fwd( - pre: Option<&mut prefilter::Scanner>, +pub(crate) fn find_fwd( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - // Searching with a pattern ID is always anchored, so we should never use - // a prefilter. - if pre.is_some() && pattern_id.is_none() { - find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end) - } else { - find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end) + if input.is_done() { + return Ok(None); } -} - -#[inline(never)] -pub(crate) fn find_leftmost_fwd( - pre: Option<&mut prefilter::Scanner>, - dfa: &DFA, - cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<Option<HalfMatch>, MatchError> { - // Searching with a pattern ID is always anchored, so we should never use - // a prefilter. - if pre.is_some() && pattern_id.is_none() { - find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end) + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_config().get_prefilter() + }; + // So what we do here is specialize four different versions of 'find_fwd': + // one for each of the combinations for 'has prefilter' and 'is earliest + // search'. The reason for doing this is that both of these things require + // branches and special handling in some code that can be very hot, + // and shaving off as much as we can when we don't need it tends to be + // beneficial in ad hoc benchmarks. To see these differences, you often + // need a query with a high match count. In other words, specializing these + // four routines *tends* to help latency more than throughput. + if pre.is_some() { + if input.get_earliest() { + find_fwd_imp(dfa, cache, input, pre, true) + } else { + find_fwd_imp(dfa, cache, input, pre, false) + } } else { - find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end) + if input.get_earliest() { + find_fwd_imp(dfa, cache, input, None, true) + } else { + find_fwd_imp(dfa, cache, input, None, false) + } } } -#[inline(always)] -fn find_fwd( - mut pre: Option<&mut prefilter::Scanner>, - earliest: bool, +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_fwd_imp( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - haystack: &[u8], - start: usize, - end: usize, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + earliest: bool, ) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= haystack.len()); - assert!(end <= haystack.len()); - - // Why do this? This lets 'bytes[at]' work without bounds checks below. - // It seems the assert on 'end <= haystack.len()' above is otherwise - // not enough. Why not just make 'bytes' scoped this way anyway? Well, - // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' - // for resolving look-ahead. - let bytes = &haystack[..end]; + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); + let mut mat = None; + let mut sid = init_fwd(dfa, cache, input)?; + let mut at = input.start(); + // This could just be a closure, but then I think it would be unsound + // because it would need to be safe to invoke. This way, the lack of safety + // is clearer in the code below. + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_untagged_unchecked(cache, $sid, byte) + }}; + } - let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?; - let mut last_match = None; - let mut at = start; - if let Some(ref mut pre) = pre { - // If a prefilter doesn't report false positives, then we don't need to - // touch the DFA at all. However, since all matches include the pattern - // ID, and the prefilter infrastructure doesn't report pattern IDs, we - // limit this optimization to cases where there is exactly one pattern. - // In that case, any match must be the 0th pattern. - if dfa.pattern_count() == 1 && !pre.reports_false_positives() { - return Ok(pre.next_candidate(bytes, at).into_option().map( - |offset| HalfMatch { pattern: PatternID::ZERO, offset }, - )); - } else if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, cache, &input, at)?; } } } } - while at < end { + cache.search_start(at); + while at < input.end() { if sid.is_tagged() { + cache.search_update(at); sid = dfa - .next_state(cache, sid, bytes[at]) + .next_state(cache, sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; - at += 1; } else { // SAFETY: There are two safety invariants we need to uphold - // here in the loop below: that 'sid' is a valid state ID for - // this DFA, and that 'at' is a valid index into 'bytes'. For - // the former, we rely on the invariant that next_state* and - // start_state_forward always returns a valid state ID (given a - // valid state ID in the former case), and that we are only at this - // place in the code if 'sid' is untagged. Moreover, every call to - // next_state_untagged_unchecked below is guarded by a check that - // sid is untagged. For the latter safety invariant, we always - // guard unchecked access with a check that 'at' is less than - // 'end', where 'end == bytes.len()'. + // here in the loops below: that 'sid' and 'prev_sid' are valid + // state IDs for this DFA, and that 'at' is a valid index into + // 'haystack'. For the former, we rely on the invariant that + // next_state* and start_state_forward always returns a valid state + // ID (given a valid state ID in the former case), and that we are + // only at this place in the code if 'sid' is untagged. Moreover, + // every call to next_state_untagged_unchecked below is guarded by + // a check that sid is untagged. For the latter safety invariant, + // we always guard unchecked access with a check that 'at' is less + // than 'end', where 'end <= haystack.len()'. In the unrolled loop + // below, we ensure that 'at' is always in bounds. + // + // PERF: For justification of omitting bounds checks, it gives us a + // ~10% bump in search time. This was used for a benchmark: + // + // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // + // PERF: For justification for the loop unrolling, we use a few + // different tests: // - // For justification, this gives us a ~10% bump in search time. - // This was used for a benchmark: + // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb + // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb + // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb // - // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb + // And there are three different configurations: // - // With bounds checked: ~881.4ms. Without: ~775ms. For input, I - // used OpenSubtitles2018.raw.sample.medium.en. + // nounroll: this entire 'else' block vanishes and we just + // always use 'dfa.next_state(..)'. + // unroll1: just the outer loop below + // unroll2: just the inner loop below + // unroll3: both the outer and inner loops below + // + // This results in a matrix of timings for each of the above + // regexes with each of the above unrolling configurations: + // + // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ' + // nounroll 1.51s 2.34s 1.51s + // unroll1 1.53s 2.32s 1.56s + // unroll2 2.22s 1.50s 0.61s + // unroll3 1.67s 1.45s 0.61s + // + // Ideally we'd be able to find a configuration that yields the + // best time for all regexes, but alas we settle for unroll3 that + // gives us *almost* the best for '\w{50}' and the best for the + // other two regexes. + // + // So what exactly is going on here? The first unrolling (grouping + // together runs of untagged transitions) specifically targets + // our choice of representation. The second unrolling (grouping + // together runs of self-transitions) specifically targets a common + // DFA topology. Let's dig in a little bit by looking at our + // regexes: + // + // '\w{50}': This regex spends a lot of time outside of the DFA's + // start state matching some part of the '\w' repetition. This + // means that it's a bit of a worst case for loop unrolling that + // targets self-transitions since the self-transitions in '\w{50}' + // are not particularly active for this haystack. However, the + // first unrolling (grouping together untagged transitions) + // does apply quite well here since very few transitions hit + // match/dead/quit/unknown states. It is however worth mentioning + // that if start states are configured to be tagged (which you + // typically want to do if you have a prefilter), then this regex + // actually slows way down because it is constantly ping-ponging + // out of the unrolled loop and into the handling of a tagged start + // state below. But when start states aren't tagged, the unrolled + // loop stays hot. (This is why it's imperative that start state + // tagging be disabled when there isn't a prefilter!) + // + // '(?m)^.+$': There are two important aspects of this regex: 1) + // on this haystack, its match count is very high, much higher + // than the other two regex and 2) it spends the vast majority + // of its time matching '.+'. Since Unicode mode is disabled, + // this corresponds to repeatedly following self transitions for + // the vast majority of the input. This does benefit from the + // untagged unrolling since most of the transitions will be to + // untagged states, but the untagged unrolling does more work than + // what is actually required. Namely, it has to keep track of the + // previous and next state IDs, which I guess requires a bit more + // shuffling. This is supported by the fact that nounroll+unroll1 + // are both slower than unroll2+unroll3, where the latter has a + // loop unrolling that specifically targets self-transitions. + // + // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it + // spends the vast majority of its time in self-transitions for + // the (implicit) unanchored prefix. The main difference with + // '(?m)^.+$' is that it has a much lower match count. So there + // isn't much time spent in the overhead of reporting matches. This + // is the primary explainer in the perf difference here. We include + // this regex and the former to make sure we have comparison points + // with high and low match counts. + // + // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. + // + // NOTE: In a follow-up, it turns out that the "inner" loop + // mentioned above was a pretty big pessimization in some other + // cases. Namely, it resulted in too much ping-ponging into and out + // of the loop, which resulted in nearly ~2x regressions in search + // time when compared to the originally lazy DFA in the regex crate. + // So I've removed the second loop unrolling that targets the + // self-transition case. let mut prev_sid = sid; - while at < end { - prev_sid = sid; - sid = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; + while at < input.end() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() || at + 3 >= input.end() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; if sid.is_tagged() { break; } - // SAFETY: we make four unguarded accesses to 'bytes[at]' - // below, and each are safe because we know that 'at + 4' is - // in bounds. Moreover, while we don't check whether 'sid' is - // untagged directly, we know it is because of the check above. - // And the unrolled loop below quits when the next state is not - // equal to the previous state. - // - // PERF: For justification for eliminating bounds checks, - // see above. For justification for the unrolling, we use - // two tests. The one above with regex '(?m)^.+$', and also - // '(?m)^.{40}$'. The former is kinda the best case for - // unrolling, and gives a 1.67 boost primarily because the DFA - // spends most of its time munching through the input in the - // same state. But the latter pattern rarely spends time in the - // same state through subsequent transitions, so unrolling is - // pretty much always ineffective in that it craps out on the - // first 'sid != next' check below. However, without unrolling, - // search is only 1.03 times faster than with unrolling on the - // latter pattern, which we deem to be an acceptable loss in - // favor of optimizing the more common case of having a "hot" - // state somewhere in the DFA. - while at + 4 < end { - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at += 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at += 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at += 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at += 1; + at += 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() { + core::mem::swap(&mut prev_sid, &mut sid); + break; } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at += 1; } + // If we quit out of the code above with an unknown state ID at + // any point, then we need to re-compute that transition using + // 'next_state', which will do NFA powerset construction for us. if sid.is_unknown() { + cache.search_update(at); sid = dfa - .next_state(cache, prev_sid, bytes[at - 1]) - .map_err(|_| gave_up(at - 1))?; + .next_state(cache, prev_sid, input.haystack()[at]) + .map_err(|_| gave_up(at))?; } } if sid.is_tagged() { if sid.is_start() { - if let Some(ref mut pre) = pre { - if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => { + cache.search_finish(span.end); + return Ok(mat); + } + Some(ref span) => { + // We want to skip any update to 'at' below + // at the end of this iteration and just + // jump immediately back to the next state + // transition at the leading position of the + // candidate match. + // + // ... but only if we actually made progress + // with our prefilter, otherwise if the start + // state has a self-loop, we can get stuck. + if span.start > at { + at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, cache, &input, at, + )?; + } + continue; } } } } } else if sid.is_match() { - last_match = Some(HalfMatch { - pattern: dfa.match_pattern(cache, sid, 0), - offset: at - MATCH_OFFSET, - }); + let pattern = dfa.match_pattern(cache, sid, 0); + // Since slice ranges are inclusive at the beginning and + // exclusive at the end, and since forward searches report + // the end, we can return 'at' as-is. This only works because + // matches are delayed by 1 byte. So by the time we observe a + // match, 'at' has already been set to 1 byte past the actual + // match location, which is precisely the exclusive ending + // bound of the match. + mat = Some(HalfMatch::new(pattern, at)); if earliest { - return Ok(last_match); + cache.search_finish(at); + return Ok(mat); } } else if sid.is_dead() { - return Ok(last_match); + cache.search_finish(at); + return Ok(mat); } else if sid.is_quit() { - if last_match.is_some() { - return Ok(last_match); - } - let offset = at - 1; - return Err(MatchError::Quit { byte: bytes[offset], offset }); + cache.search_finish(at); + return Err(MatchError::quit(input.haystack()[at], at)); } else { debug_assert!(sid.is_unknown()); unreachable!("sid being unknown is a bug"); } } + at += 1; } - // We are careful to use 'haystack' here, which contains the full context - // that we might want to inspect. - Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match)) + eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; + cache.search_finish(input.end()); + Ok(mat) } #[inline(never)] -pub(crate) fn find_earliest_rev( +pub(crate) fn find_rev( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - find_rev(true, dfa, cache, pattern_id, bytes, start, end) + if input.is_done() { + return Ok(None); + } + if input.get_earliest() { + find_rev_imp(dfa, cache, input, true) + } else { + find_rev_imp(dfa, cache, input, false) + } } -#[inline(never)] -pub(crate) fn find_leftmost_rev( +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_rev_imp( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<Option<HalfMatch>, MatchError> { - find_rev(false, dfa, cache, pattern_id, bytes, start, end) -} - -#[inline(always)] -fn find_rev( + input: &Input<'_>, earliest: bool, - dfa: &DFA, - cache: &mut Cache, - pattern_id: Option<PatternID>, - haystack: &[u8], - start: usize, - end: usize, ) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= haystack.len()); - assert!(end <= haystack.len()); - - // Why do this? This lets 'bytes[at]' work without bounds checks below. - // It seems the assert on 'end <= haystack.len()' above is otherwise - // not enough. Why not just make 'bytes' scoped this way anyway? Well, - // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' - // for resolving look-ahead. - let bytes = &haystack[start..]; + let mut mat = None; + let mut sid = init_rev(dfa, cache, input)?; + // In reverse search, the loop below can't handle the case of searching an + // empty slice. Ideally we could write something congruent to the forward + // search, i.e., 'while at >= start', but 'start' might be 0. Since we use + // an unsigned offset, 'at >= 0' is trivially always true. We could avoid + // this extra case handling by using a signed offset, but Rust makes it + // annoying to do. So... We just handle the empty case separately. + if input.start() == input.end() { + eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + return Ok(mat); + } - let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?; - let mut last_match = None; - let mut at = end - start; - while at > 0 { + let mut at = input.end() - 1; + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_untagged_unchecked(cache, $sid, byte) + }}; + } + cache.search_start(at); + loop { if sid.is_tagged() { - at -= 1; + cache.search_update(at); sid = dfa - .next_state(cache, sid, bytes[at]) + .next_state(cache, sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } else { - // SAFETY: See comments in 'find_fwd' for both a safety argument - // and a justification from a performance perspective as to 1) why - // we elide bounds checks and 2) why we do a specialized version of - // unrolling below. + // SAFETY: See comments in 'find_fwd' for a safety argument. + // + // PERF: The comments in 'find_fwd' also provide a justification + // from a performance perspective as to 1) why we elide bounds + // checks and 2) why we do a specialized version of unrolling + // below. The reverse search does have a slightly different + // consideration in that most reverse searches tend to be + // anchored and on shorter haystacks. However, this still makes a + // difference. Take this command for example: + // + // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // + // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' + // like in the justification for the forward direction. The 'regex' + // sub-command will find start-of-match and thus run the reverse + // direction.) + // + // Without unrolling below, the above command takes around 3.76s. + // But with the unrolling below, we get down to 2.55s. If we keep + // the unrolling but add in bounds checks, then we get 2.86s. + // + // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. let mut prev_sid = sid; - while at > 0 && !sid.is_tagged() { - prev_sid = sid; + while at >= input.start() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() + || at <= input.start().saturating_add(3) + { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } at -= 1; - while at > 3 { - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at -= 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at -= 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at -= 1; - let next = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; - if sid != next { - break; - } - at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at -= 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; } - sid = unsafe { - dfa.next_state_untagged_unchecked( - cache, - sid, - *bytes.get_unchecked(at), - ) - }; + at -= 1; } + // If we quit out of the code above with an unknown state ID at + // any point, then we need to re-compute that transition using + // 'next_state', which will do NFA powerset construction for us. if sid.is_unknown() { + cache.search_update(at); sid = dfa - .next_state(cache, prev_sid, bytes[at]) + .next_state(cache, prev_sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } } if sid.is_tagged() { if sid.is_start() { - continue; + // do nothing } else if sid.is_match() { - last_match = Some(HalfMatch { - pattern: dfa.match_pattern(cache, sid, 0), - offset: start + at + MATCH_OFFSET, - }); + let pattern = dfa.match_pattern(cache, sid, 0); + // Since reverse searches report the beginning of a match + // and the beginning is inclusive (not exclusive like the + // end of a match), we add 1 to make it inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); if earliest { - return Ok(last_match); + cache.search_finish(at); + return Ok(mat); } } else if sid.is_dead() { - return Ok(last_match); + cache.search_finish(at); + return Ok(mat); + } else if sid.is_quit() { + cache.search_finish(at); + return Err(MatchError::quit(input.haystack()[at], at)); } else { - debug_assert!(sid.is_quit()); - if last_match.is_some() { - return Ok(last_match); - } - return Err(MatchError::Quit { byte: bytes[at], offset: at }); + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); } } + if at == input.start() { + break; + } + at -= 1; } - Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match)) + cache.search_finish(input.start()); + eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + Ok(mat) } #[inline(never)] pub(crate) fn find_overlapping_fwd( - pre: Option<&mut prefilter::Scanner>, dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - caller_state: &mut OverlappingState, -) -> Result<Option<HalfMatch>, MatchError> { - // Searching with a pattern ID is always anchored, so we should only ever - // use a prefilter when no pattern ID is given. - if pre.is_some() && pattern_id.is_none() { - find_overlapping_fwd_imp( - pre, - dfa, - cache, - pattern_id, - bytes, - start, - end, - caller_state, - ) + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let pre = if input.get_anchored().is_anchored() { + None } else { - find_overlapping_fwd_imp( - None, - dfa, - cache, - pattern_id, - bytes, - start, - end, - caller_state, - ) + dfa.get_config().get_prefilter() + }; + if pre.is_some() { + find_overlapping_fwd_imp(dfa, cache, input, pre, state) + } else { + find_overlapping_fwd_imp(dfa, cache, input, None, state) } } -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn find_overlapping_fwd_imp( - mut pre: Option<&mut prefilter::Scanner>, dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - mut start: usize, - end: usize, - caller_state: &mut OverlappingState, -) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); - - let mut sid = match caller_state.id() { - None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); + let mut sid = match state.id { + None => { + state.at = input.start(); + init_fwd(dfa, cache, input)? + } Some(sid) => { - if let Some(last) = caller_state.last_match() { - let match_count = dfa.match_count(cache, sid); - if last.match_index < match_count { - let m = HalfMatch { - pattern: dfa.match_pattern( - cache, - sid, - last.match_index, - ), - offset: last.offset, - }; - last.match_index += 1; - return Ok(Some(m)); + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(cache, sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(cache, sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); } } - - // This is a subtle but critical detail. If the caller provides a - // non-None state ID, then it must be the case that the state ID - // corresponds to one set by this function. The state ID therefore - // corresponds to a match state, a dead state or some other state. - // However, "some other" state _only_ occurs when the input has - // been exhausted because the only way to stop before then is to - // see a match or a dead/quit state. - // - // If the input is exhausted or if it's a dead state, then - // incrementing the starting position has no relevance on - // correctness, since the loop below will either not execute - // at all or will immediately stop due to being in a dead state. - // (Once in a dead state it is impossible to leave it.) - // - // Therefore, the only case we need to consider is when - // caller_state is a match state. In this case, since our machines - // support the ability to delay a match by a certain number of - // bytes (to support look-around), it follows that we actually - // consumed that many additional bytes on our previous search. When - // the caller resumes their search to find subsequent matches, they - // will use the ending location from the previous match as the next - // starting point, which is `match_offset` bytes PRIOR to where - // we scanned to on the previous search. Therefore, we need to - // compensate by bumping `start` up by `MATCH_OFFSET` bytes. - // - // Incidentally, since MATCH_OFFSET is non-zero, this also makes - // dealing with empty matches convenient. Namely, callers needn't - // special case them when implementing an iterator. Instead, this - // ensures that forward progress is always made. - start += MATCH_OFFSET; + // Once we've reported all matches at a given position, we need to + // advance the search to the next position. + state.at += 1; + if state.at > input.end() { + return Ok(()); + } sid } }; - let mut at = start; - while at < end { - let byte = bytes[at]; - sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?; - at += 1; + // NOTE: We don't optimize the crap out of this routine primarily because + // it seems like most overlapping searches will have higher match counts, + // and thus, throughput is perhaps not as important. But if you have a use + // case for something faster, feel free to file an issue. + cache.search_start(state.at); + while state.at < input.end() { + sid = dfa + .next_state(cache, sid, input.haystack()[state.at]) + .map_err(|_| gave_up(state.at))?; if sid.is_tagged() { - caller_state.set_id(sid); + state.id = Some(sid); if sid.is_start() { - if let Some(ref mut pre) = pre { - if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + if let Some(ref pre) = pre { + let span = Span::from(state.at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(()), + Some(ref span) => { + if span.start > state.at { + state.at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, cache, &input, state.at, + )?; + } + continue; } } } } } else if sid.is_match() { - let offset = at - MATCH_OFFSET; - caller_state - .set_last_match(StateMatch { match_index: 1, offset }); - return Ok(Some(HalfMatch { - pattern: dfa.match_pattern(cache, sid, 0), - offset, - })); + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(cache, sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at)); + cache.search_finish(state.at); + return Ok(()); } else if sid.is_dead() { - return Ok(None); + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_quit() { + cache.search_finish(state.at); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); } else { - debug_assert!(sid.is_quit()); - return Err(MatchError::Quit { byte, offset: at - 1 }); + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); } } + state.at += 1; + cache.search_update(state.at); } - let result = eoi_fwd(dfa, cache, bytes, end, &mut sid); - caller_state.set_id(sid); - if let Ok(Some(ref last_match)) = result { - caller_state.set_last_match(StateMatch { - // '1' is always correct here since if we get to this point, this - // always corresponds to the first (index '0') match discovered at - // this position. So the next match to report at this position (if - // it exists) is at index '1'. - match_index: 1, - offset: last_match.offset(), - }); + let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat); + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); } + cache.search_finish(input.end()); result } -#[inline(always)] +#[inline(never)] +pub(crate) fn find_overlapping_rev( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let mut sid = match state.id { + None => { + let sid = init_rev(dfa, cache, input)?; + state.id = Some(sid); + if input.start() == input.end() { + state.rev_eoi = true; + } else { + state.at = input.end() - 1; + } + sid + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(cache, sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(cache, sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need + // to advance the search to the next position. However, if we've + // already followed the EOI transition, then we know we're done + // with the search and there cannot be any more matches to report. + if state.rev_eoi { + return Ok(()); + } else if state.at == input.start() { + // At this point, we should follow the EOI transition. This + // will cause us the skip the main loop below and fall through + // to the final 'eoi_rev' transition. + state.rev_eoi = true; + } else { + // We haven't hit the end of the search yet, so move on. + state.at -= 1; + } + sid + } + }; + cache.search_start(state.at); + while !state.rev_eoi { + sid = dfa + .next_state(cache, sid, input.haystack()[state.at]) + .map_err(|_| gave_up(state.at))?; + if sid.is_tagged() { + state.id = Some(sid); + if sid.is_start() { + // do nothing + } else if sid.is_match() { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(cache, sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at + 1)); + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_dead() { + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_quit() { + cache.search_finish(state.at); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + if state.at == input.start() { + break; + } + state.at -= 1; + cache.search_update(state.at); + } + + let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat); + state.rev_eoi = true; + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + cache.search_finish(input.start()); + result +} + +#[cfg_attr(feature = "perf-inline", inline(always))] fn init_fwd( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<LazyStateID, MatchError> { - let sid = dfa - .start_state_forward(cache, pattern_id, bytes, start, end) - .map_err(|_| gave_up(start))?; + let sid = dfa.start_state_forward(cache, input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. - assert!(!sid.is_match()); + debug_assert!(!sid.is_match()); Ok(sid) } -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn init_rev( dfa: &DFA, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<LazyStateID, MatchError> { - let sid = dfa - .start_state_reverse(cache, pattern_id, bytes, start, end) - .map_err(|_| gave_up(end))?; + let sid = dfa.start_state_reverse(cache, input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. - assert!(!sid.is_match()); + debug_assert!(!sid.is_match()); Ok(sid) } -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_fwd( dfa: &DFA, cache: &mut Cache, - bytes: &[u8], - end: usize, + input: &Input<'_>, sid: &mut LazyStateID, -) -> Result<Option<HalfMatch>, MatchError> { - match bytes.get(end) { + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + match input.haystack().get(sp.end) { Some(&b) => { - *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?; + *sid = + dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?; if sid.is_match() { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(cache, *sid, 0), - offset: end, - })) - } else { - Ok(None) + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if sid.is_quit() { + return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa .next_eoi_state(cache, *sid) - .map_err(|_| gave_up(bytes.len()))?; + .map_err(|_| gave_up(input.haystack().len()))?; if sid.is_match() { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(cache, *sid, 0), - offset: bytes.len(), - })) - } else { - Ok(None) + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); } } + Ok(()) } -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev( dfa: &DFA, cache: &mut Cache, - bytes: &[u8], - start: usize, - state: LazyStateID, -) -> Result<Option<HalfMatch>, MatchError> { - if start > 0 { - let sid = dfa - .next_state(cache, state, bytes[start - 1]) - .map_err(|_| gave_up(start))?; + input: &Input<'_>, + sid: &mut LazyStateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa + .next_state(cache, *sid, byte) + .map_err(|_| gave_up(sp.start))?; if sid.is_match() { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(cache, sid, 0), - offset: start, - })) - } else { - Ok(None) + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if sid.is_quit() { + return Err(MatchError::quit(byte, sp.start - 1)); } } else { - let sid = - dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?; + *sid = + dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?; if sid.is_match() { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(cache, sid, 0), - offset: 0, - })) - } else { - Ok(None) + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); } + Ok(()) +} + +/// Re-compute the starting state that a DFA should be in after finding a +/// prefilter candidate match at the position `at`. +/// +/// It is always correct to call this, but not always necessary. Namely, +/// whenever the DFA has a universal start state, the DFA can remain in the +/// start state that it was in when it ran the prefilter. Why? Because in that +/// case, there is only one start state. +/// +/// When does a DFA have a universal start state? In precisely cases where +/// it has no look-around assertions in its prefix. So for example, `\bfoo` +/// does not have a universal start state because the start state depends on +/// whether the byte immediately before the start position is a word byte or +/// not. However, `foo\b` does have a universal start state because the word +/// boundary does not appear in the pattern's prefix. +/// +/// So... most cases don't need this, but when a pattern doesn't have a +/// universal start state, then after a prefilter candidate has been found, the +/// current state *must* be re-litigated as if computing the start state at the +/// beginning of the search because it might change. That is, not all start +/// states are created equal. +/// +/// Why avoid it? Because while it's not super expensive, it isn't a trivial +/// operation to compute the start state. It is much better to avoid it and +/// just state in the current state if you know it to be correct. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn prefilter_restart( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + at: usize, +) -> Result<LazyStateID, MatchError> { + let mut input = input.clone(); + input.set_start(at); + init_fwd(dfa, cache, &input) } /// A convenience routine for constructing a "gave up" match error. -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn gave_up(offset: usize) -> MatchError { - MatchError::GaveUp { offset } + MatchError::gave_up(offset) } |