summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/dfa/search.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/dfa/search.rs')
-rw-r--r--vendor/regex-automata/src/dfa/search.rs891
1 files changed, 526 insertions, 365 deletions
diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs
index 492414981..8c012a594 100644
--- a/vendor/regex-automata/src/dfa/search.rs
+++ b/vendor/regex-automata/src/dfa/search.rs
@@ -1,493 +1,654 @@
use crate::{
dfa::{
accel,
- automaton::{Automaton, OverlappingState, StateMatch},
+ automaton::{Automaton, OverlappingState},
},
util::{
- id::{PatternID, StateID},
- matchtypes::HalfMatch,
- prefilter, MATCH_OFFSET,
+ prefilter::Prefilter,
+ primitives::StateID,
+ search::{Anchored, HalfMatch, Input, Span},
},
MatchError,
};
#[inline(never)]
-pub fn find_earliest_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
+pub fn find_fwd<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
- } else {
- find_fwd(None, true, dfa, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
}
-}
-
-#[inline(never)]
-pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
- dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
+ if pre.is_some() {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, pre, true)
+ } else {
+ find_fwd_imp(dfa, input, pre, false)
+ }
} else {
- find_fwd(None, false, dfa, pattern_id, bytes, start, end)
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, None, true)
+ } else {
+ find_fwd_imp(dfa, input, None, false)
+ }
}
}
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
-/// getting inlined eliminate some critical branches. To avoid bloating binary
-/// size, we only call this function in a fixed number of places.
-#[inline(always)]
-fn find_fwd<A: Automaton + ?Sized>(
- mut pre: Option<&mut prefilter::Scanner>,
- earliest: bool,
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_fwd_imp<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[..end];
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut mat = None;
+ let mut sid = init_fwd(dfa, input)?;
+ let mut at = input.start();
+ // This could just be a closure, but then I think it would be unsound
+ // because it would need to be safe to invoke. This way, the lack of safety
+ // is clearer in the code below.
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
- let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = start;
- if let Some(ref mut pre) = pre {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
// If a prefilter doesn't report false positives, then we don't need to
// touch the DFA at all. However, since all matches include the pattern
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
// limit this optimization to cases where there is exactly one pattern.
// In that case, any match must be the 0th pattern.
- if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
- return Ok(pre.next_candidate(bytes, at).into_option().map(
- |offset| HalfMatch { pattern: PatternID::ZERO, offset },
- ));
- } else if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
}
}
}
}
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- if dfa.is_start_state(state) {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ while at < input.end() {
+ // SAFETY: There are two safety invariants we need to uphold here in
+ // the loops below: that 'sid' and 'prev_sid' are valid state IDs
+ // for this DFA, and that 'at' is a valid index into 'haystack'.
+ // For the former, we rely on the invariant that next_state* and
+ // start_state_forward always returns a valid state ID (given a valid
+ // state ID in the former case). For the latter safety invariant, we
+ // always guard unchecked access with a check that 'at' is less than
+ // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
+ // ensure that 'at' is always in bounds.
+ //
+ // PERF: See a similar comment in src/hybrid/search.rs that justifies
+ // this extra work to make the search loop fast. The same reasoning and
+ // benchmarks apply here.
+ let mut prev_sid;
+ while at < input.end() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ // We want to skip any update to 'at' below
+ // at the end of this iteration and just
+ // jump immediately back to the next state
+ // transition at the leading position of the
+ // candidate match.
+ //
+ // ... but only if we actually made progress
+ // with our prefilter, otherwise if the start
+ // state has a self-loop, we can get stuck.
+ if span.start > at {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
+ }
+ continue;
}
}
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
}
- } else if dfa.is_match_state(state) {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: at - MATCH_OFFSET,
- });
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ mat = Some(HalfMatch::new(pattern, at));
if earliest {
- return Ok(last_match);
+ return Ok(mat);
}
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
}
- } else if dfa.is_accel_state(state) {
- let needs = dfa.accelerator(state);
- at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
- } else if dfa.is_dead_state(state) {
- return Ok(last_match);
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ at = accel::find_fwd(needs, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
} else {
- debug_assert!(dfa.is_quit_state(state));
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ // It's important that this is a debug_assert, since this can
+ // actually be tripped even if DFA::from_bytes succeeds and
+ // returns a supposedly valid DFA.
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
}
}
- while at < end && dfa.next_state(state, bytes[at]) == state {
- at += 1;
- }
+ at += 1;
}
- Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
+ eoi_fwd(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
-pub fn find_earliest_rev<A: Automaton + ?Sized>(
+pub fn find_rev<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(true, dfa, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
+ }
+ if input.get_earliest() {
+ find_rev_imp(dfa, input, true)
+ } else {
+ find_rev_imp(dfa, input, false)
+ }
}
-#[inline(never)]
-pub fn find_leftmost_rev<A: Automaton + ?Sized>(
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_rev_imp<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(false, dfa, pattern_id, bytes, start, end)
-}
-
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
-/// permits eliminating a few crucial branches.
-#[inline(always)]
-fn find_rev<A: Automaton + ?Sized>(
+ input: &Input<'_>,
earliest: bool,
- dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
+ let mut mat = None;
+ let mut sid = init_rev(dfa, input)?;
+ // In reverse search, the loop below can't handle the case of searching an
+ // empty slice. Ideally we could write something congruent to the forward
+ // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
+ // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
+ // this extra case handling by using a signed offset, but Rust makes it
+ // annoying to do. So... We just handle the empty case separately.
+ if input.start() == input.end() {
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
- let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
- let mut last_match = None;
- let mut at = end;
- while at > start {
- at -= 1;
- while at > start && dfa.next_state(state, bytes[at]) == state {
+ let mut at = input.end() - 1;
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
+ loop {
+ // SAFETY: See comments in 'find_fwd' for a safety argument.
+ let mut prev_sid;
+ while at >= input.start() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid)
+ || at <= input.start().saturating_add(3)
+ {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
at -= 1;
- }
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- if dfa.is_special_state(state) {
- if dfa.is_start_state(state) {
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at -= 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
+ .unwrap_or(input.start());
}
- } else if dfa.is_match_state(state) {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: at + MATCH_OFFSET,
- });
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ // Since reverse searches report the beginning of a match
+ // and the beginning is inclusive (not exclusive like the
+ // end of a match), we add 1 to make it inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
- return Ok(last_match);
+ return Ok(mat);
}
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
+ .unwrap_or(input.start());
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that we
+ // can't leave the current state given one of the 255 possible
+ // byte values. However, there might be an EOI transition. So
+ // we set 'at' to the end of the haystack, which will cause
+ // this loop to stop and fall down into the EOI transition.
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
- } else if dfa.is_dead_state(state) {
- return Ok(last_match);
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
} else {
- debug_assert!(dfa.is_quit_state(state));
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte, offset: at });
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
}
}
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
}
- Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should only ever
- // use a prefilter when no pattern ID is given.
- if pre.is_some() && pattern_id.is_none() {
- find_overlapping_fwd_imp(
- pre,
- dfa,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
+ if pre.is_some() {
+ find_overlapping_fwd_imp(dfa, input, pre, state)
} else {
- find_overlapping_fwd_imp(
- None,
- dfa,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ find_overlapping_fwd_imp(dfa, input, None, state)
}
}
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
-/// permits eliminating a few crucial branches and reduces code size when it is
-/// not used.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
- mut pre: Option<&mut prefilter::Scanner>,
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- mut start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let mut state = match caller_state.id() {
- None => init_fwd(dfa, pattern_id, bytes, start, end)?,
- Some(id) => {
- if let Some(last) = caller_state.last_match() {
- let match_count = dfa.match_count(id);
- if last.match_index < match_count {
- let m = HalfMatch {
- pattern: dfa.match_pattern(id, last.match_index),
- offset: last.offset,
- };
- last.match_index += 1;
- return Ok(Some(m));
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut sid = match state.id {
+ None => {
+ state.at = input.start();
+ init_fwd(dfa, input)?
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
}
}
-
- // This is a subtle but critical detail. If the caller provides a
- // non-None state ID, then it must be the case that the state ID
- // corresponds to one set by this function. The state ID therefore
- // corresponds to a match state, a dead state or some other state.
- // However, "some other" state _only_ occurs when the input has
- // been exhausted because the only way to stop before then is to
- // see a match or a dead/quit state.
- //
- // If the input is exhausted or if it's a dead state, then
- // incrementing the starting position has no relevance on
- // correctness, since the loop below will either not execute
- // at all or will immediately stop due to being in a dead state.
- // (Once in a dead state it is impossible to leave it.)
- //
- // Therefore, the only case we need to consider is when
- // caller_state is a match state. In this case, since our machines
- // support the ability to delay a match by a certain number of
- // bytes (to support look-around), it follows that we actually
- // consumed that many additional bytes on our previous search. When
- // the caller resumes their search to find subsequent matches, they
- // will use the ending location from the previous match as the next
- // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
- // we scanned to on the previous search. Therefore, we need to
- // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
- //
- // Incidentally, since MATCH_OFFSET is non-zero, this also makes
- // dealing with empty matches convenient. Namely, callers needn't
- // special case them when implementing an iterator. Instead, this
- // ensures that forward progress is always made.
- start += MATCH_OFFSET;
- id
+ // Once we've reported all matches at a given position, we need to
+ // advance the search to the next position.
+ state.at += 1;
+ if state.at > input.end() {
+ return Ok(());
+ }
+ sid
}
};
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- caller_state.set_id(state);
- if dfa.is_start_state(state) {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ // NOTE: We don't optimize the crap out of this routine primarily because
+ // it seems like most find_overlapping searches will have higher match
+ // counts, and thus, throughput is perhaps not as important. But if you
+ // have a use case for something faster, feel free to file an issue.
+ while state.at < input.end() {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(state.at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(()),
+ Some(ref span) => {
+ if span.start > state.at {
+ state.at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, &input, state.at,
+ )?;
+ }
+ continue;
}
}
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at = accel::find_fwd(
+ needles,
+ input.haystack(),
+ state.at + 1,
+ )
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that we
+ // can't leave the current state given one of the 255 possible
+ // byte values. However, there might be an EOI transition. So
+ // we set 'at' to the end of the haystack, which will cause
+ // this loop to stop and fall down into the EOI transition.
+ state.at =
+ accel::find_fwd(needs, input.haystack(), state.at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
+ } else {
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ }
+ }
+ state.at += 1;
+ }
+
+ let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ result
+}
+
+#[inline(never)]
+pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let mut sid = match state.id {
+ None => {
+ let sid = init_rev(dfa, input)?;
+ state.id = Some(sid);
+ if input.start() == input.end() {
+ state.rev_eoi = true;
+ } else {
+ state.at = input.end() - 1;
+ }
+ sid
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position. However, if we've
+ // already followed the EOI transition, then we know we're done
+ // with the search and there cannot be any more matches to report.
+ if state.rev_eoi {
+ return Ok(());
+ } else if state.at == input.start() {
+ // At this point, we should follow the EOI transition. This
+ // will cause us the skip the main loop below and fall through
+ // to the final 'eoi_rev' transition.
+ state.rev_eoi = true;
+ } else {
+ // We haven't hit the end of the search yet, so move on.
+ state.at -= 1;
+ }
+ sid
+ }
+ };
+ while !state.rev_eoi {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
}
- } else if dfa.is_match_state(state) {
- let offset = at - MATCH_OFFSET;
- caller_state
- .set_last_match(StateMatch { match_index: 1, offset });
- return Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset,
- }));
- } else if dfa.is_accel_state(state) {
- let needs = dfa.accelerator(state);
- at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
- } else if dfa.is_dead_state(state) {
- return Ok(None);
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at + 1));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that we
+ // can't leave the current state given one of the 255 possible
+ // byte values. However, there might be an EOI transition. So
+ // we set 'at' to the end of the haystack, which will cause
+ // this loop to stop and fall down into the EOI transition.
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
} else {
- debug_assert!(dfa.is_quit_state(state));
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
}
}
+ if state.at == input.start() {
+ break;
+ }
+ state.at -= 1;
}
- let result = eoi_fwd(dfa, bytes, end, &mut state);
- caller_state.set_id(state);
- if let Ok(Some(ref last_match)) = result {
- caller_state.set_last_match(StateMatch {
- match_index: 1,
- offset: last_match.offset(),
- });
+ let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
+ state.rev_eoi = true;
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
}
result
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<StateID, MatchError> {
- let state = dfa.start_state_forward(pattern_id, bytes, start, end);
+ let sid = dfa.start_state_forward(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!dfa.is_match_state(state));
- Ok(state)
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<StateID, MatchError> {
- let state = dfa.start_state_reverse(pattern_id, bytes, start, end);
+ let sid = dfa.start_state_reverse(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!dfa.is_match_state(state));
- Ok(state)
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd<A: Automaton + ?Sized>(
dfa: &A,
- bytes: &[u8],
- end: usize,
- state: &mut StateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- match bytes.get(end) {
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
Some(&b) => {
- *state = dfa.next_state(*state, b);
- if dfa.is_match_state(*state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(*state, 0),
- offset: end,
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_state(*sid, b);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(b, sp.end));
}
}
None => {
- *state = dfa.next_eoi_state(*state);
- if dfa.is_match_state(*state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(*state, 0),
- offset: bytes.len(),
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
}
}
+ Ok(())
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev<A: Automaton + ?Sized>(
dfa: &A,
- bytes: &[u8],
- start: usize,
- state: StateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- if start > 0 {
- let state = dfa.next_state(state, bytes[start - 1]);
- if dfa.is_match_state(state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: start,
- }))
- } else {
- Ok(None)
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa.next_state(*sid, byte);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
- let state = dfa.next_eoi_state(state);
- if dfa.is_match_state(state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: 0,
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
}
+ Ok(())
}
-// Currently unused, but is useful to keep around. This was originally used
-// when the code above used raw pointers for its main loop.
-// /// Returns the distance between the given pointer and the start of `bytes`.
-// /// This assumes that the given pointer points to somewhere in the `bytes`
-// /// slice given.
-// fn offset(bytes: &[u8], p: *const u8) -> usize {
-// debug_assert!(bytes.as_ptr() <= p);
-// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
-// ((p as isize) - (bytes.as_ptr() as isize)) as usize
-// }
+/// Re-compute the starting state that a DFA should be in after finding a
+/// prefilter candidate match at the position `at`.
+///
+/// The function with the same name has a bit more docs in hybrid/search.rs.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn prefilter_restart<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ at: usize,
+) -> Result<StateID, MatchError> {
+ let mut input = input.clone();
+ input.set_start(at);
+ init_fwd(dfa, &input)
+}