summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/hybrid/search.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:44 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:44 +0000
commitc23a457e72abe608715ac76f076f47dc42af07a5 (patch)
tree2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/hybrid/search.rs
parentReleasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz
rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/hybrid/search.rs')
-rw-r--r--vendor/regex-automata/src/hybrid/search.rs1101
1 files changed, 620 insertions, 481 deletions
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
index 92760cee2..f23283685 100644
--- a/vendor/regex-automata/src/hybrid/search.rs
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -1,663 +1,802 @@
use crate::{
hybrid::{
- dfa::{Cache, DFA},
- id::{LazyStateID, OverlappingState, StateMatch},
+ dfa::{Cache, OverlappingState, DFA},
+ id::LazyStateID,
},
- nfa::thompson,
util::{
- id::PatternID,
- matchtypes::{HalfMatch, MatchError},
- prefilter, MATCH_OFFSET,
+ prefilter::Prefilter,
+ search::{HalfMatch, Input, MatchError, Span},
},
};
#[inline(never)]
-pub(crate) fn find_earliest_fwd(
- pre: Option<&mut prefilter::Scanner>,
+pub(crate) fn find_fwd(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
- } else {
- find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
}
-}
-
-#[inline(never)]
-pub(crate) fn find_leftmost_fwd(
- pre: Option<&mut prefilter::Scanner>,
- dfa: &DFA,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_config().get_prefilter()
+ };
+ // So what we do here is specialize four different versions of 'find_fwd':
+ // one for each of the combinations for 'has prefilter' and 'is earliest
+ // search'. The reason for doing this is that both of these things require
+ // branches and special handling in some code that can be very hot,
+ // and shaving off as much as we can when we don't need it tends to be
+ // beneficial in ad hoc benchmarks. To see these differences, you often
+ // need a query with a high match count. In other words, specializing these
+ // four routines *tends* to help latency more than throughput.
+ if pre.is_some() {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, cache, input, pre, true)
+ } else {
+ find_fwd_imp(dfa, cache, input, pre, false)
+ }
} else {
- find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
+ if input.get_earliest() {
+ find_fwd_imp(dfa, cache, input, None, true)
+ } else {
+ find_fwd_imp(dfa, cache, input, None, false)
+ }
}
}
-#[inline(always)]
-fn find_fwd(
- mut pre: Option<&mut prefilter::Scanner>,
- earliest: bool,
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_fwd_imp(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[..end];
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
+ let mut mat = None;
+ let mut sid = init_fwd(dfa, cache, input)?;
+ let mut at = input.start();
+ // This could just be a closure, but then I think it would be unsound
+ // because it would need to be safe to invoke. This way, the lack of safety
+ // is clearer in the code below.
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_untagged_unchecked(cache, $sid, byte)
+ }};
+ }
- let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = start;
- if let Some(ref mut pre) = pre {
- // If a prefilter doesn't report false positives, then we don't need to
- // touch the DFA at all. However, since all matches include the pattern
- // ID, and the prefilter infrastructure doesn't report pattern IDs, we
- // limit this optimization to cases where there is exactly one pattern.
- // In that case, any match must be the 0th pattern.
- if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
- return Ok(pre.next_candidate(bytes, at).into_option().map(
- |offset| HalfMatch { pattern: PatternID::ZERO, offset },
- ));
- } else if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, cache, &input, at)?;
}
}
}
}
- while at < end {
+ cache.search_start(at);
+ while at < input.end() {
if sid.is_tagged() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, sid, bytes[at])
+ .next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
- at += 1;
} else {
// SAFETY: There are two safety invariants we need to uphold
- // here in the loop below: that 'sid' is a valid state ID for
- // this DFA, and that 'at' is a valid index into 'bytes'. For
- // the former, we rely on the invariant that next_state* and
- // start_state_forward always returns a valid state ID (given a
- // valid state ID in the former case), and that we are only at this
- // place in the code if 'sid' is untagged. Moreover, every call to
- // next_state_untagged_unchecked below is guarded by a check that
- // sid is untagged. For the latter safety invariant, we always
- // guard unchecked access with a check that 'at' is less than
- // 'end', where 'end == bytes.len()'.
+ // here in the loops below: that 'sid' and 'prev_sid' are valid
+ // state IDs for this DFA, and that 'at' is a valid index into
+ // 'haystack'. For the former, we rely on the invariant that
+ // next_state* and start_state_forward always returns a valid state
+ // ID (given a valid state ID in the former case), and that we are
+ // only at this place in the code if 'sid' is untagged. Moreover,
+ // every call to next_state_untagged_unchecked below is guarded by
+ // a check that sid is untagged. For the latter safety invariant,
+ // we always guard unchecked access with a check that 'at' is less
+ // than 'end', where 'end <= haystack.len()'. In the unrolled loop
+ // below, we ensure that 'at' is always in bounds.
+ //
+ // PERF: For justification of omitting bounds checks, it gives us a
+ // ~10% bump in search time. This was used for a benchmark:
+ //
+ // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
+ //
+ // PERF: For justification for the loop unrolling, we use a few
+ // different tests:
//
- // For justification, this gives us a ~10% bump in search time.
- // This was used for a benchmark:
+ // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
+ // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
+ // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
//
- // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+ // And there are three different configurations:
//
- // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
- // used OpenSubtitles2018.raw.sample.medium.en.
+ // nounroll: this entire 'else' block vanishes and we just
+ // always use 'dfa.next_state(..)'.
+ // unroll1: just the outer loop below
+ // unroll2: just the inner loop below
+ // unroll3: both the outer and inner loops below
+ //
+ // This results in a matrix of timings for each of the above
+ // regexes with each of the above unrolling configurations:
+ //
+ // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ'
+ // nounroll 1.51s 2.34s 1.51s
+ // unroll1 1.53s 2.32s 1.56s
+ // unroll2 2.22s 1.50s 0.61s
+ // unroll3 1.67s 1.45s 0.61s
+ //
+ // Ideally we'd be able to find a configuration that yields the
+ // best time for all regexes, but alas we settle for unroll3 that
+ // gives us *almost* the best for '\w{50}' and the best for the
+ // other two regexes.
+ //
+ // So what exactly is going on here? The first unrolling (grouping
+ // together runs of untagged transitions) specifically targets
+ // our choice of representation. The second unrolling (grouping
+ // together runs of self-transitions) specifically targets a common
+ // DFA topology. Let's dig in a little bit by looking at our
+ // regexes:
+ //
+ // '\w{50}': This regex spends a lot of time outside of the DFA's
+ // start state matching some part of the '\w' repetition. This
+ // means that it's a bit of a worst case for loop unrolling that
+ // targets self-transitions since the self-transitions in '\w{50}'
+ // are not particularly active for this haystack. However, the
+ // first unrolling (grouping together untagged transitions)
+ // does apply quite well here since very few transitions hit
+ // match/dead/quit/unknown states. It is however worth mentioning
+ // that if start states are configured to be tagged (which you
+ // typically want to do if you have a prefilter), then this regex
+ // actually slows way down because it is constantly ping-ponging
+ // out of the unrolled loop and into the handling of a tagged start
+ // state below. But when start states aren't tagged, the unrolled
+ // loop stays hot. (This is why it's imperative that start state
+ // tagging be disabled when there isn't a prefilter!)
+ //
+ // '(?m)^.+$': There are two important aspects of this regex: 1)
+ // on this haystack, its match count is very high, much higher
+ // than the other two regex and 2) it spends the vast majority
+ // of its time matching '.+'. Since Unicode mode is disabled,
+ // this corresponds to repeatedly following self transitions for
+ // the vast majority of the input. This does benefit from the
+ // untagged unrolling since most of the transitions will be to
+ // untagged states, but the untagged unrolling does more work than
+ // what is actually required. Namely, it has to keep track of the
+ // previous and next state IDs, which I guess requires a bit more
+ // shuffling. This is supported by the fact that nounroll+unroll1
+ // are both slower than unroll2+unroll3, where the latter has a
+ // loop unrolling that specifically targets self-transitions.
+ //
+ // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it
+ // spends the vast majority of its time in self-transitions for
+ // the (implicit) unanchored prefix. The main difference with
+ // '(?m)^.+$' is that it has a much lower match count. So there
+ // isn't much time spent in the overhead of reporting matches. This
+ // is the primary explainer in the perf difference here. We include
+ // this regex and the former to make sure we have comparison points
+ // with high and low match counts.
+ //
+ // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
+ //
+ // NOTE: In a follow-up, it turns out that the "inner" loop
+ // mentioned above was a pretty big pessimization in some other
+ // cases. Namely, it resulted in too much ping-ponging into and out
+ // of the loop, which resulted in nearly ~2x regressions in search
+ // time when compared to the originally lazy DFA in the regex crate.
+ // So I've removed the second loop unrolling that targets the
+ // self-transition case.
let mut prev_sid = sid;
- while at < end {
- prev_sid = sid;
- sid = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
+ while at < input.end() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() || at + 3 >= input.end() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
- // SAFETY: we make four unguarded accesses to 'bytes[at]'
- // below, and each are safe because we know that 'at + 4' is
- // in bounds. Moreover, while we don't check whether 'sid' is
- // untagged directly, we know it is because of the check above.
- // And the unrolled loop below quits when the next state is not
- // equal to the previous state.
- //
- // PERF: For justification for eliminating bounds checks,
- // see above. For justification for the unrolling, we use
- // two tests. The one above with regex '(?m)^.+$', and also
- // '(?m)^.{40}$'. The former is kinda the best case for
- // unrolling, and gives a 1.67 boost primarily because the DFA
- // spends most of its time munching through the input in the
- // same state. But the latter pattern rarely spends time in the
- // same state through subsequent transitions, so unrolling is
- // pretty much always ineffective in that it craps out on the
- // first 'sid != next' check below. However, without unrolling,
- // search is only 1.03 times faster than with unrolling on the
- // latter pattern, which we deem to be an acceptable loss in
- // favor of optimizing the more common case of having a "hot"
- // state somewhere in the DFA.
- while at + 4 < end {
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
+ at += 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
}
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
+ }
+ at += 1;
}
+ // If we quit out of the code above with an unknown state ID at
+ // any point, then we need to re-compute that transition using
+ // 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, prev_sid, bytes[at - 1])
- .map_err(|_| gave_up(at - 1))?;
+ .next_state(cache, prev_sid, input.haystack()[at])
+ .map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => {
+ cache.search_finish(span.end);
+ return Ok(mat);
+ }
+ Some(ref span) => {
+ // We want to skip any update to 'at' below
+ // at the end of this iteration and just
+ // jump immediately back to the next state
+ // transition at the leading position of the
+ // candidate match.
+ //
+ // ... but only if we actually made progress
+ // with our prefilter, otherwise if the start
+ // state has a self-loop, we can get stuck.
+ if span.start > at {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, cache, &input, at,
+ )?;
+ }
+ continue;
}
}
}
}
} else if sid.is_match() {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: at - MATCH_OFFSET,
- });
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ // Since slice ranges are inclusive at the beginning and
+ // exclusive at the end, and since forward searches report
+ // the end, we can return 'at' as-is. This only works because
+ // matches are delayed by 1 byte. So by the time we observe a
+ // match, 'at' has already been set to 1 byte past the actual
+ // match location, which is precisely the exclusive ending
+ // bound of the match.
+ mat = Some(HalfMatch::new(pattern, at));
if earliest {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
}
} else if sid.is_dead() {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
} else if sid.is_quit() {
- if last_match.is_some() {
- return Ok(last_match);
- }
- let offset = at - 1;
- return Err(MatchError::Quit { byte: bytes[offset], offset });
+ cache.search_finish(at);
+ return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
+ at += 1;
}
- // We are careful to use 'haystack' here, which contains the full context
- // that we might want to inspect.
- Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+ eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
+ cache.search_finish(input.end());
+ Ok(mat)
}
#[inline(never)]
-pub(crate) fn find_earliest_rev(
+pub(crate) fn find_rev(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(true, dfa, cache, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
+ }
+ if input.get_earliest() {
+ find_rev_imp(dfa, cache, input, true)
+ } else {
+ find_rev_imp(dfa, cache, input, false)
+ }
}
-#[inline(never)]
-pub(crate) fn find_leftmost_rev(
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_rev_imp(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(false, dfa, cache, pattern_id, bytes, start, end)
-}
-
-#[inline(always)]
-fn find_rev(
+ input: &Input<'_>,
earliest: bool,
- dfa: &DFA,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[start..];
+ let mut mat = None;
+ let mut sid = init_rev(dfa, cache, input)?;
+ // In reverse search, the loop below can't handle the case of searching an
+ // empty slice. Ideally we could write something congruent to the forward
+ // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
+ // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
+ // this extra case handling by using a signed offset, but Rust makes it
+ // annoying to do. So... We just handle the empty case separately.
+ if input.start() == input.end() {
+ eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
- let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = end - start;
- while at > 0 {
+ let mut at = input.end() - 1;
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_untagged_unchecked(cache, $sid, byte)
+ }};
+ }
+ cache.search_start(at);
+ loop {
if sid.is_tagged() {
- at -= 1;
+ cache.search_update(at);
sid = dfa
- .next_state(cache, sid, bytes[at])
+ .next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
} else {
- // SAFETY: See comments in 'find_fwd' for both a safety argument
- // and a justification from a performance perspective as to 1) why
- // we elide bounds checks and 2) why we do a specialized version of
- // unrolling below.
+ // SAFETY: See comments in 'find_fwd' for a safety argument.
+ //
+ // PERF: The comments in 'find_fwd' also provide a justification
+ // from a performance perspective as to 1) why we elide bounds
+ // checks and 2) why we do a specialized version of unrolling
+ // below. The reverse search does have a slightly different
+ // consideration in that most reverse searches tend to be
+ // anchored and on shorter haystacks. However, this still makes a
+ // difference. Take this command for example:
+ //
+ // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
+ //
+ // (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
+ // like in the justification for the forward direction. The 'regex'
+ // sub-command will find start-of-match and thus run the reverse
+ // direction.)
+ //
+ // Without unrolling below, the above command takes around 3.76s.
+ // But with the unrolling below, we get down to 2.55s. If we keep
+ // the unrolling but add in bounds checks, then we get 2.86s.
+ //
+ // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
let mut prev_sid = sid;
- while at > 0 && !sid.is_tagged() {
- prev_sid = sid;
+ while at >= input.start() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged()
+ || at <= input.start().saturating_add(3)
+ {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
at -= 1;
- while at > 3 {
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
+ }
+ at -= 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
}
- sid = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
+ at -= 1;
}
+ // If we quit out of the code above with an unknown state ID at
+ // any point, then we need to re-compute that transition using
+ // 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, prev_sid, bytes[at])
+ .next_state(cache, prev_sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
- continue;
+ // do nothing
} else if sid.is_match() {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: start + at + MATCH_OFFSET,
- });
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ // Since reverse searches report the beginning of a match
+ // and the beginning is inclusive (not exclusive like the
+ // end of a match), we add 1 to make it inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
}
} else if sid.is_dead() {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
+ } else if sid.is_quit() {
+ cache.search_finish(at);
+ return Err(MatchError::quit(input.haystack()[at], at));
} else {
- debug_assert!(sid.is_quit());
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte: bytes[at], offset: at });
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
}
}
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
}
- Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
+ cache.search_finish(input.start());
+ eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
- pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should only ever
- // use a prefilter when no pattern ID is given.
- if pre.is_some() && pattern_id.is_none() {
- find_overlapping_fwd_imp(
- pre,
- dfa,
- cache,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
} else {
- find_overlapping_fwd_imp(
- None,
- dfa,
- cache,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ dfa.get_config().get_prefilter()
+ };
+ if pre.is_some() {
+ find_overlapping_fwd_imp(dfa, cache, input, pre, state)
+ } else {
+ find_overlapping_fwd_imp(dfa, cache, input, None, state)
}
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp(
- mut pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- mut start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let mut sid = match caller_state.id() {
- None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
+ let mut sid = match state.id {
+ None => {
+ state.at = input.start();
+ init_fwd(dfa, cache, input)?
+ }
Some(sid) => {
- if let Some(last) = caller_state.last_match() {
- let match_count = dfa.match_count(cache, sid);
- if last.match_index < match_count {
- let m = HalfMatch {
- pattern: dfa.match_pattern(
- cache,
- sid,
- last.match_index,
- ),
- offset: last.offset,
- };
- last.match_index += 1;
- return Ok(Some(m));
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(cache, sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(cache, sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
}
}
-
- // This is a subtle but critical detail. If the caller provides a
- // non-None state ID, then it must be the case that the state ID
- // corresponds to one set by this function. The state ID therefore
- // corresponds to a match state, a dead state or some other state.
- // However, "some other" state _only_ occurs when the input has
- // been exhausted because the only way to stop before then is to
- // see a match or a dead/quit state.
- //
- // If the input is exhausted or if it's a dead state, then
- // incrementing the starting position has no relevance on
- // correctness, since the loop below will either not execute
- // at all or will immediately stop due to being in a dead state.
- // (Once in a dead state it is impossible to leave it.)
- //
- // Therefore, the only case we need to consider is when
- // caller_state is a match state. In this case, since our machines
- // support the ability to delay a match by a certain number of
- // bytes (to support look-around), it follows that we actually
- // consumed that many additional bytes on our previous search. When
- // the caller resumes their search to find subsequent matches, they
- // will use the ending location from the previous match as the next
- // starting point, which is `match_offset` bytes PRIOR to where
- // we scanned to on the previous search. Therefore, we need to
- // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
- //
- // Incidentally, since MATCH_OFFSET is non-zero, this also makes
- // dealing with empty matches convenient. Namely, callers needn't
- // special case them when implementing an iterator. Instead, this
- // ensures that forward progress is always made.
- start += MATCH_OFFSET;
+ // Once we've reported all matches at a given position, we need to
+ // advance the search to the next position.
+ state.at += 1;
+ if state.at > input.end() {
+ return Ok(());
+ }
sid
}
};
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
- at += 1;
+ // NOTE: We don't optimize the crap out of this routine primarily because
+ // it seems like most overlapping searches will have higher match counts,
+ // and thus, throughput is perhaps not as important. But if you have a use
+ // case for something faster, feel free to file an issue.
+ cache.search_start(state.at);
+ while state.at < input.end() {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[state.at])
+ .map_err(|_| gave_up(state.at))?;
if sid.is_tagged() {
- caller_state.set_id(sid);
+ state.id = Some(sid);
if sid.is_start() {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(state.at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(()),
+ Some(ref span) => {
+ if span.start > state.at {
+ state.at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, cache, &input, state.at,
+ )?;
+ }
+ continue;
}
}
}
}
} else if sid.is_match() {
- let offset = at - MATCH_OFFSET;
- caller_state
- .set_last_match(StateMatch { match_index: 1, offset });
- return Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset,
- }));
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ cache.search_finish(state.at);
+ return Ok(());
} else if sid.is_dead() {
- return Ok(None);
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_quit() {
+ cache.search_finish(state.at);
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
} else {
- debug_assert!(sid.is_quit());
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
}
}
+ state.at += 1;
+ cache.search_update(state.at);
}
- let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
- caller_state.set_id(sid);
- if let Ok(Some(ref last_match)) = result {
- caller_state.set_last_match(StateMatch {
- // '1' is always correct here since if we get to this point, this
- // always corresponds to the first (index '0') match discovered at
- // this position. So the next match to report at this position (if
- // it exists) is at index '1'.
- match_index: 1,
- offset: last_match.offset(),
- });
+ let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat);
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
}
+ cache.search_finish(input.end());
result
}
-#[inline(always)]
+#[inline(never)]
+pub(crate) fn find_overlapping_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let mut sid = match state.id {
+ None => {
+ let sid = init_rev(dfa, cache, input)?;
+ state.id = Some(sid);
+ if input.start() == input.end() {
+ state.rev_eoi = true;
+ } else {
+ state.at = input.end() - 1;
+ }
+ sid
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(cache, sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(cache, sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position. However, if we've
+ // already followed the EOI transition, then we know we're done
+ // with the search and there cannot be any more matches to report.
+ if state.rev_eoi {
+ return Ok(());
+ } else if state.at == input.start() {
+ // At this point, we should follow the EOI transition. This
+ // will cause us the skip the main loop below and fall through
+ // to the final 'eoi_rev' transition.
+ state.rev_eoi = true;
+ } else {
+ // We haven't hit the end of the search yet, so move on.
+ state.at -= 1;
+ }
+ sid
+ }
+ };
+ cache.search_start(state.at);
+ while !state.rev_eoi {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[state.at])
+ .map_err(|_| gave_up(state.at))?;
+ if sid.is_tagged() {
+ state.id = Some(sid);
+ if sid.is_start() {
+ // do nothing
+ } else if sid.is_match() {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at + 1));
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_dead() {
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_quit() {
+ cache.search_finish(state.at);
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ } else {
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
+ }
+ }
+ if state.at == input.start() {
+ break;
+ }
+ state.at -= 1;
+ cache.search_update(state.at);
+ }
+
+ let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat);
+ state.rev_eoi = true;
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ cache.search_finish(input.start());
+ result
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- let sid = dfa
- .start_state_forward(cache, pattern_id, bytes, start, end)
- .map_err(|_| gave_up(start))?;
+ let sid = dfa.start_state_forward(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!sid.is_match());
+ debug_assert!(!sid.is_match());
Ok(sid)
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- let sid = dfa
- .start_state_reverse(cache, pattern_id, bytes, start, end)
- .map_err(|_| gave_up(end))?;
+ let sid = dfa.start_state_reverse(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!sid.is_match());
+ debug_assert!(!sid.is_match());
Ok(sid)
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd(
dfa: &DFA,
cache: &mut Cache,
- bytes: &[u8],
- end: usize,
+ input: &Input<'_>,
sid: &mut LazyStateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- match bytes.get(end) {
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
Some(&b) => {
- *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+ *sid =
+ dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, *sid, 0),
- offset: end,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if sid.is_quit() {
+ return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa
.next_eoi_state(cache, *sid)
- .map_err(|_| gave_up(bytes.len()))?;
+ .map_err(|_| gave_up(input.haystack().len()))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, *sid, 0),
- offset: bytes.len(),
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
}
}
+ Ok(())
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev(
dfa: &DFA,
cache: &mut Cache,
- bytes: &[u8],
- start: usize,
- state: LazyStateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- if start > 0 {
- let sid = dfa
- .next_state(cache, state, bytes[start - 1])
- .map_err(|_| gave_up(start))?;
+ input: &Input<'_>,
+ sid: &mut LazyStateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa
+ .next_state(cache, *sid, byte)
+ .map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: start,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if sid.is_quit() {
+ return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
- let sid =
- dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
+ *sid =
+ dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: 0,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
}
+ Ok(())
+}
+
+/// Re-compute the starting state that a DFA should be in after finding a
+/// prefilter candidate match at the position `at`.
+///
+/// It is always correct to call this, but not always necessary. Namely,
+/// whenever the DFA has a universal start state, the DFA can remain in the
+/// start state that it was in when it ran the prefilter. Why? Because in that
+/// case, there is only one start state.
+///
+/// When does a DFA have a universal start state? In precisely cases where
+/// it has no look-around assertions in its prefix. So for example, `\bfoo`
+/// does not have a universal start state because the start state depends on
+/// whether the byte immediately before the start position is a word byte or
+/// not. However, `foo\b` does have a universal start state because the word
+/// boundary does not appear in the pattern's prefix.
+///
+/// So... most cases don't need this, but when a pattern doesn't have a
+/// universal start state, then after a prefilter candidate has been found, the
+/// current state *must* be re-litigated as if computing the start state at the
+/// beginning of the search because it might change. That is, not all start
+/// states are created equal.
+///
+/// Why avoid it? Because while it's not super expensive, it isn't a trivial
+/// operation to compute the start state. It is much better to avoid it and
+/// just stay in the current state if you know it to be correct.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn prefilter_restart(
+ dfa: &DFA,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ at: usize,
+) -> Result<LazyStateID, MatchError> {
+ let mut input = input.clone();
+ input.set_start(at);
+ init_fwd(dfa, cache, &input)
}
/// A convenience routine for constructing a "gave up" match error.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn gave_up(offset: usize) -> MatchError {
- MatchError::GaveUp { offset }
+ MatchError::gave_up(offset)
}