Diffstat (limited to 'third_party/rust/regex-automata/src/meta/strategy.rs')
-rw-r--r--  third_party/rust/regex-automata/src/meta/strategy.rs  1908
1 file changed, 1908 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/meta/strategy.rs b/third_party/rust/regex-automata/src/meta/strategy.rs
new file mode 100644
index 0000000000..ea6c6ab576
--- /dev/null
+++ b/third_party/rust/regex-automata/src/meta/strategy.rs
@@ -0,0 +1,1908 @@
+use core::{
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+use alloc::sync::Arc;
+
+use regex_syntax::hir::{literal, Hir};
+
+use crate::{
+ meta::{
+ error::{BuildError, RetryError, RetryFailError, RetryQuadraticError},
+ regex::{Cache, RegexInfo},
+ reverse_inner, wrappers,
+ },
+ nfa::thompson::{self, WhichCaptures, NFA},
+ util::{
+ captures::{Captures, GroupInfo},
+ look::LookMatcher,
+ prefilter::{self, Prefilter, PrefilterI},
+ primitives::{NonMaxUsize, PatternID},
+ search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet},
+ },
+};
+
+/// A trait that represents a single meta strategy. Its main utility is in
+/// providing a way to do dynamic dispatch over a few choices.
+///
+/// Why dynamic dispatch? I actually don't have a super compelling reason, and
+/// importantly, I have not benchmarked it against the main alternative: an enum.
+/// I went with dynamic dispatch initially because the regex engine search code
+/// really can't be inlined into caller code in most cases because it's just
+/// too big. In other words, it is already expected that every regex search
+/// will entail at least the cost of a function call.
+///
+/// I do wonder whether using enums would result in better codegen overall
+/// though. It's a worthwhile experiment to try. Probably the most interesting
+/// benchmark to run in such a case would be one with a high match count. That
+/// is, a benchmark to test the overall latency of a search call.
+pub(super) trait Strategy:
+ Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
+{
+ fn group_info(&self) -> &GroupInfo;
+
+ fn create_cache(&self) -> Cache;
+
+ fn reset_cache(&self, cache: &mut Cache);
+
+ fn is_accelerated(&self) -> bool;
+
+ fn memory_usage(&self) -> usize;
+
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>;
+
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch>;
+
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;
+
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID>;
+
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ );
+}
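+
+// An illustrative sketch (not part of the upstream file) of how the meta
+// regex engine might drive a `Strategy` once `new` below has picked one.
+// The `info` and `hirs` values are assumed to come from the meta regex
+// builder:
+//
+//     let strategy: Arc<dyn Strategy> = new(&info, &hirs)?;
+//     let mut cache = strategy.create_cache();
+//     let input = Input::new("haystack");
+//     if let Some(m) = strategy.search(&mut cache, &input) {
+//         // m.pattern() says which pattern matched, and
+//         // m.start()..m.end() is the span of the overall match.
+//     }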
+
+pub(super) fn new(
+ info: &RegexInfo,
+ hirs: &[&Hir],
+) -> Result<Arc<dyn Strategy>, BuildError> {
+ // At this point, we're committed to a regex engine of some kind. So pull
+ // out a prefilter if we can, which will feed to each of the constituent
+ // regex engines.
+ let pre = if info.is_always_anchored_start() {
+ // PERF: I'm not sure we necessarily want to do this... We may want to
+ // run a prefilter for quickly rejecting in some cases. The problem
+ // is that anchored searches overlap quite a bit with the use case
+ // of "run a regex on every line to extract data." In that case, the
+ // regex always matches, so running a prefilter doesn't really help us
+ // there. The main place where a prefilter helps in an anchored search
+ // is if the anchored search is not expected to match frequently. That
+ // is, the prefilter gives us a way to possibly reject a haystack very
+ // quickly.
+ //
+ // Maybe we should use a prefilter, but only for longer haystacks?
+ // Or maybe we should only use a prefilter when we think it's "fast"?
+ //
+ // Interestingly, I think we currently lack the infrastructure for
+ // disabling a prefilter based on haystack length. That would probably
+ // need to be a new 'Input' option. (Interestingly, an 'Input' used to
+ // carry a 'Prefilter' with it, but I moved away from that.)
+ debug!("skipping literal extraction since regex is anchored");
+ None
+ } else if let Some(pre) = info.config().get_prefilter() {
+ debug!(
+ "skipping literal extraction since the caller provided a prefilter"
+ );
+ Some(pre.clone())
+ } else if info.config().get_auto_prefilter() {
+ let kind = info.config().get_match_kind();
+ let prefixes = crate::util::prefilter::prefixes(kind, hirs);
+ // If we can build a full `Strategy` from just the extracted prefixes,
+ // then we can short-circuit and avoid building a regex engine at all.
+ if let Some(pre) = Pre::from_prefixes(info, &prefixes) {
+ debug!(
+ "found that the regex can be broken down to a literal \
+ search, avoiding the regex engine entirely",
+ );
+ return Ok(pre);
+ }
+ // This now attempts another short-circuit of the regex engine: if we
+ // have a huge alternation of just plain literals, then we can just use
+ // Aho-Corasick for that and avoid the regex engine entirely.
+ //
+ // You might think this case would just be handled by
+ // `Pre::from_prefixes`, but that technique relies on heuristic literal
+ // extraction from the corresponding `Hir`. That works, but part of those
+ // heuristics is a limit on the size and number of literals returned. This
+ // case will specifically handle patterns with very large alternations.
+ //
+ // One wonders if we should just roll this into our heuristic literal
+ // extraction, and then I think this case could disappear entirely.
+ if let Some(pre) = Pre::from_alternation_literals(info, hirs) {
+ debug!(
+ "found plain alternation of literals, \
+ avoiding regex engine entirely and using Aho-Corasick"
+ );
+ return Ok(pre);
+ }
+ prefixes.literals().and_then(|strings| {
+ debug!(
+ "creating prefilter from {} literals: {:?}",
+ strings.len(),
+ strings,
+ );
+ Prefilter::new(kind, strings)
+ })
+ } else {
+ debug!("skipping literal extraction since prefilters were disabled");
+ None
+ };
+ let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
+ // Now that we have our core regex engines built, there are a few cases
+ // where we can do a little bit better than just a normal "search forward
+ // and maybe use a prefilter when in a start state." However, these cases
+ // may not always work or otherwise build on top of the Core searcher.
+ // For example, the reverse anchored optimization seems like it might
+ // always work, but only the DFAs support reverse searching and the DFAs
+ // might give up or quit for reasons. If we had, e.g., a PikeVM that
+ // supported reverse searching, then we could avoid building a full Core
+ // engine for this case.
+ core = match ReverseAnchored::new(core) {
+ Err(core) => core,
+ Ok(ra) => {
+ debug!("using reverse anchored strategy");
+ return Ok(Arc::new(ra));
+ }
+ };
+ core = match ReverseSuffix::new(core, hirs) {
+ Err(core) => core,
+ Ok(rs) => {
+ debug!("using reverse suffix strategy");
+ return Ok(Arc::new(rs));
+ }
+ };
+ core = match ReverseInner::new(core, hirs) {
+ Err(core) => core,
+ Ok(ri) => {
+ debug!("using reverse inner strategy");
+ return Ok(Arc::new(ri));
+ }
+ };
+ debug!("using core strategy");
+ Ok(Arc::new(core))
+}
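+
+// To summarize the cascade above (a comment added for exposition, not in
+// the upstream file), each strategy is tried in order and the first that
+// applies wins:
+//
+// 1. Pre: the regex is equivalent to a finite set of exact literals, so a
+//    literal searcher (or Aho-Corasick) can answer searches on its own.
+// 2. ReverseAnchored: the regex is always anchored at the end, so search
+//    in reverse from the end of the haystack.
+// 3. ReverseSuffix: a "fast" suffix literal exists; scan for it and then
+//    confirm with a bounded reverse search.
+// 4. ReverseInner: a "fast" inner literal exists; scan for it and then
+//    confirm with bounded reverse and forward searches.
+// 5. Core: the general case, with an optional prefix prefilter.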
+
+#[derive(Clone, Debug)]
+struct Pre<P> {
+ pre: P,
+ group_info: GroupInfo,
+}
+
+impl<P: PrefilterI> Pre<P> {
+ fn new(pre: P) -> Arc<dyn Strategy> {
+ // The only thing we support when we use prefilters directly as a
+ // strategy is the start and end of the overall match for a single
+ // pattern. In other words, exactly one implicit capturing group. Which
+ // is exactly what we use here for a GroupInfo.
+ let group_info = GroupInfo::new([[None::<&str>]]).unwrap();
+ Arc::new(Pre { pre, group_info })
+ }
+}
+
+// This is a little weird, but we don't actually care about the type parameter
+// here because we're selecting which underlying prefilter to use. So we just
+// define it on an arbitrary type.
+impl Pre<()> {
+ /// Given a sequence of prefixes, attempt to return a full `Strategy` using
+ /// just the prefixes.
+ ///
+ /// Basically, this occurs when the prefixes given are not just prefixes,
+ /// but an enumeration of the entire language matched by the regular
+ /// expression.
+ ///
+ /// A number of other conditions need to be true too. For example, there
+ /// can be only one pattern, it must have no explicit capture groups, it
+ /// must have no look-around assertions, and so on.
+ ///
+ /// Note that this ignores `Config::get_auto_prefilter` because if this
+ /// returns something, then it isn't a prefilter but a matcher itself.
+ /// Therefore, it shouldn't suffer from the problems typical to prefilters
+ /// (such as a high false positive rate).
+ fn from_prefixes(
+ info: &RegexInfo,
+ prefixes: &literal::Seq,
+ ) -> Option<Arc<dyn Strategy>> {
+ let kind = info.config().get_match_kind();
+ // Check to see if our prefixes are exact, which means we might be
+ // able to bypass the regex engine entirely and just rely on literal
+ // searches.
+ if !prefixes.is_exact() {
+ return None;
+ }
+ // We also require that we have a single regex pattern. Namely,
+ // we reuse the prefilter infrastructure to implement search, and
+ // prefilters only report spans. Prefilters don't know about pattern
+ // IDs. The multi-regex case isn't a lost cause: we might still use
+ // Aho-Corasick and we might still just use a regular prefilter, but
+ // that's done below.
+ if info.pattern_len() != 1 {
+ return None;
+ }
+ // We can't have any capture groups either. The literal engines don't
+ // know how to deal with things like '(foo)(bar)'. In that case, a
+ // prefilter will just be used and then the regex engine will resolve
+ // the capture groups.
+ if info.props()[0].explicit_captures_len() != 0 {
+ return None;
+ }
+ // We also require that it has zero look-around assertions. Namely,
+ // literal extraction treats look-around assertions as if they match
+ // *every* empty string. But of course, that isn't true. So for
+ // example, 'foo\bquux' never matches anything, but 'fooquux' is
+ // extracted from that as an exact literal. Such cases should just run
+ // the regex engine. 'fooquux' will be used as a normal prefilter, and
+ // then the regex engine will try to look for an actual match.
+ if !info.props()[0].look_set().is_empty() {
+ return None;
+ }
+ // Finally, currently, our prefilters are all oriented around
+ // leftmost-first match semantics, so don't try to use them if the
+ // caller asked for anything else.
+ if kind != MatchKind::LeftmostFirst {
+ return None;
+ }
+ // The above seems like a lot of requirements to meet, but it applies
+ // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet
+ // the above criteria, for example.
+ //
+ // Note that this is effectively a latency optimization. If we didn't
+ // do this, then the extracted literals would still get bundled into
+ // a prefilter, and every regex engine capable of running unanchored
+ // searches supports prefilters. So this optimization merely sidesteps
+ // having to run the regex engine at all to confirm the match. Thus, it
+ // decreases the latency of a match.
+
+ // OK because we know the set is exact and thus finite.
+ let prefixes = prefixes.literals().unwrap();
+ debug!(
+ "trying to bypass regex engine by creating \
+ prefilter from {} literals: {:?}",
+ prefixes.len(),
+ prefixes,
+ );
+ let choice = match prefilter::Choice::new(kind, prefixes) {
+ Some(choice) => choice,
+ None => {
+ debug!(
+ "regex bypass failed because no prefilter could be built"
+ );
+ return None;
+ }
+ };
+ let strat: Arc<dyn Strategy> = match choice {
+ prefilter::Choice::Memchr(pre) => Pre::new(pre),
+ prefilter::Choice::Memchr2(pre) => Pre::new(pre),
+ prefilter::Choice::Memchr3(pre) => Pre::new(pre),
+ prefilter::Choice::Memmem(pre) => Pre::new(pre),
+ prefilter::Choice::Teddy(pre) => Pre::new(pre),
+ prefilter::Choice::ByteSet(pre) => Pre::new(pre),
+ prefilter::Choice::AhoCorasick(pre) => Pre::new(pre),
+ };
+ Some(strat)
+ }
+
+ /// Attempts to extract an alternation of literals, and if it's deemed
+ /// worth doing, returns an Aho-Corasick prefilter as a strategy.
+ ///
+ /// And currently, this only returns something when 'hirs.len() == 1'. This
+ /// could in theory do something if there are multiple HIRs where all of
+ /// them are alternation of literals, but I haven't had the time to go down
+ /// that path yet.
+ fn from_alternation_literals(
+ info: &RegexInfo,
+ hirs: &[&Hir],
+ ) -> Option<Arc<dyn Strategy>> {
+ use crate::util::prefilter::AhoCorasick;
+
+ let lits = crate::meta::literal::alternation_literals(info, hirs)?;
+ let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?;
+ Some(Pre::new(ac))
+ }
+}
+
+// This implements Strategy for anything that implements PrefilterI.
+//
+// Note that this must only be used for regexes with exactly one pattern.
+// Multi-pattern regexes don't work here. The prefilter interface only
+// provides the span of a match and not the pattern ID. (I did consider
+// making it more expressive, but I
+// couldn't figure out how to tie everything together elegantly.) Thus, so long
+// as the regex only contains one pattern, we can simply assume that a match
+// corresponds to PatternID::ZERO. And indeed, that's what we do here.
+//
+// In practice, since this impl is used to report matches directly and thus
+// completely bypasses the regex engine, we only wind up using this under the
+// following restrictions:
+//
+// * There must be only one pattern. As explained above.
+// * The literal sequence must be finite and only contain exact literals.
+// * There must not be any look-around assertions. If there are, the literals
+// extracted might be exact, but a match doesn't necessarily imply an overall
+// match. As a trivial example, 'foo\bbar' does not match 'foobar'.
+// * The pattern must not have any explicit capturing groups. If it does, the
+// caller might expect them to be resolved. e.g., 'foo(bar)'.
+//
+// So when all of those things are true, we use a prefilter directly as a
+// strategy.
+//
+// In the case where the number of patterns is more than 1, we don't use this
+// but do use a special Aho-Corasick strategy if all of the regexes are just
+// simple literals or alternations of literals. (We also use the Aho-Corasick
+// strategy when len(patterns)==1 if the number of literals is large. In that
+// case, literal extraction gives up and will return an infinite set.)
+impl<P: PrefilterI> Strategy for Pre<P> {
+ fn group_info(&self) -> &GroupInfo {
+ &self.group_info
+ }
+
+ fn create_cache(&self) -> Cache {
+ Cache {
+ capmatches: Captures::all(self.group_info().clone()),
+ pikevm: wrappers::PikeVMCache::none(),
+ backtrack: wrappers::BoundedBacktrackerCache::none(),
+ onepass: wrappers::OnePassCache::none(),
+ hybrid: wrappers::HybridCache::none(),
+ revhybrid: wrappers::ReverseHybridCache::none(),
+ }
+ }
+
+ fn reset_cache(&self, _cache: &mut Cache) {}
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.pre.memory_usage()
+ }
+
+ fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.is_done() {
+ return None;
+ }
+ if input.get_anchored().is_anchored() {
+ return self
+ .pre
+ .prefix(input.haystack(), input.get_span())
+ .map(|sp| Match::new(PatternID::ZERO, sp));
+ }
+ self.pre
+ .find(input.haystack(), input.get_span())
+ .map(|sp| Match::new(PatternID::ZERO, sp))
+ }
+
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
+ }
+
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ self.search(cache, input).is_some()
+ }
+
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ let m = self.search(cache, input)?;
+ if let Some(slot) = slots.get_mut(0) {
+ *slot = NonMaxUsize::new(m.start());
+ }
+ if let Some(slot) = slots.get_mut(1) {
+ *slot = NonMaxUsize::new(m.end());
+ }
+ Some(m.pattern())
+ }
+
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if self.search(cache, input).is_some() {
+ patset.insert(PatternID::ZERO);
+ }
+ }
+}
+
+#[derive(Debug)]
+struct Core {
+ info: RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: NFA,
+ nfarev: Option<NFA>,
+ pikevm: wrappers::PikeVM,
+ backtrack: wrappers::BoundedBacktracker,
+ onepass: wrappers::OnePass,
+ hybrid: wrappers::Hybrid,
+ dfa: wrappers::DFA,
+}
+
+impl Core {
+ fn new(
+ info: RegexInfo,
+ pre: Option<Prefilter>,
+ hirs: &[&Hir],
+ ) -> Result<Core, BuildError> {
+ let mut lookm = LookMatcher::new();
+ lookm.set_line_terminator(info.config().get_line_terminator());
+ let thompson_config = thompson::Config::new()
+ .utf8(info.config().get_utf8_empty())
+ .nfa_size_limit(info.config().get_nfa_size_limit())
+ .shrink(false)
+ .which_captures(info.config().get_which_captures())
+ .look_matcher(lookm);
+ let nfa = thompson::Compiler::new()
+ .configure(thompson_config.clone())
+ .build_many_from_hir(hirs)
+ .map_err(BuildError::nfa)?;
+ // It's possible for the PikeVM or the BB to fail to build, even though
+ // at this point, we already have a full NFA in hand. They can fail
+ // when a Unicode word boundary is used but where Unicode word boundary
+ // support is disabled at compile time, thus making it impossible to
+ // match. (Construction can also fail if the NFA was compiled without
+ // captures, but we always enable that above.)
+ let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
+ let backtrack =
+ wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
+ // The onepass engine can of course fail to build, but we expect it to
+ // fail in many cases because it is an optimization that doesn't apply
+ // to all regexes. The 'OnePass' wrapper encapsulates this failure (and
+ // logs a message if it occurs).
+ let onepass = wrappers::OnePass::new(&info, &nfa);
+ // We try to encapsulate whether a particular regex engine should be
+ // used within each respective wrapper, but the DFAs need a reverse NFA
+ // to build themselves, and we really do not want to build a reverse NFA
+ // if we know we aren't going to use either DFA. So we do a config check
+ // up front, which is in practice the only way we won't try to use the
+ // DFA.
+ let (nfarev, hybrid, dfa) =
+ if !info.config().get_hybrid() && !info.config().get_dfa() {
+ (None, wrappers::Hybrid::none(), wrappers::DFA::none())
+ } else {
+ // FIXME: Technically, we don't quite yet KNOW that we need
+ // a reverse NFA. It's possible for the DFAs below to both
+ // fail to build just based on the forward NFA. In which case,
+ // building the reverse NFA was totally wasted work. But...
+ // fixing this requires breaking DFA construction apart into
+ // two pieces: one for the forward part and another for the
+ // reverse part. Quite annoying. Making it worse, when building
+ // both DFAs fails, it's quite likely that the NFA is large and
+ // that it will take quite some time to build the reverse NFA
+ // too. So... it's really probably worth it to do this!
+ let nfarev = thompson::Compiler::new()
+ // Currently, reverse NFAs don't support capturing groups,
+ // so we MUST disable them. But even if we didn't have to,
+ // we would, because nothing in this crate does anything
+ // useful with capturing groups in reverse. And of course,
+ // the lazy DFA ignores capturing groups in all cases.
+ .configure(
+ thompson_config
+ .clone()
+ .which_captures(WhichCaptures::None)
+ .reverse(true),
+ )
+ .build_many_from_hir(hirs)
+ .map_err(BuildError::nfa)?;
+ let dfa = if !info.config().get_dfa() {
+ wrappers::DFA::none()
+ } else {
+ wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
+ };
+ let hybrid = if !info.config().get_hybrid() {
+ wrappers::Hybrid::none()
+ } else if dfa.is_some() {
+ debug!("skipping lazy DFA because we have a full DFA");
+ wrappers::Hybrid::none()
+ } else {
+ wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
+ };
+ (Some(nfarev), hybrid, dfa)
+ };
+ Ok(Core {
+ info,
+ pre,
+ nfa,
+ nfarev,
+ pikevm,
+ backtrack,
+ onepass,
+ hybrid,
+ dfa,
+ })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_mayfail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<Result<Option<Match>, RetryFailError>> {
+ if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for search at {:?}", input.get_span());
+ Some(e.try_search(input))
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for search at {:?}", input.get_span());
+ Some(e.try_search(&mut cache.hybrid, input))
+ } else {
+ None
+ }
+ }
+
+ fn search_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<Match> {
+ let caps = &mut cache.capmatches;
+ caps.set_pattern(None);
+ // We manually inline 'search_slots_nofail' here because we need to
+ // borrow from 'cache.capmatches' in this method, but if we do, then
+ // we can't pass 'cache' wholesale to 'search_slots_nofail'. It's a
+ // classic example of how the borrow checker inhibits decomposition.
+ // There are of course work-arounds (more types and/or interior
+ // mutability), but that's more annoying than this IMO.
+ let pid = if let Some(ref e) = self.onepass.get(input) {
+ trace!("using OnePass for search at {:?}", input.get_span());
+ e.search_slots(&mut cache.onepass, input, caps.slots_mut())
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
+ } else {
+ trace!("using PikeVM for search at {:?}", input.get_span());
+ let e = self.pikevm.get();
+ e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
+ };
+ caps.set_pattern(pid);
+ caps.get_match()
+ }
+
+ fn search_half_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ // Only the lazy/full DFA returns half-matches, since the DFA requires
+ // a reverse scan to find the start position. These fallback regex
+ // engines can find the start and end in a single pass, so we just do
+ // that and throw away the start offset to conform to the API.
+ let m = self.search_nofail(cache, input)?;
+ Some(HalfMatch::new(m.pattern(), m.end()))
+ }
+
+ fn search_slots_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if let Some(ref e) = self.onepass.get(input) {
+ trace!(
+ "using OnePass for capture search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.onepass, input, slots)
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for capture search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.backtrack, input, slots)
+ } else {
+ trace!(
+ "using PikeVM for capture search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.search_slots(&mut cache.pikevm, input, slots)
+ }
+ }
+
+ fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if let Some(ref e) = self.onepass.get(input) {
+ trace!(
+ "using OnePass for is-match search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.onepass, input, &mut []).is_some()
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for is-match search at {:?}",
+ input.get_span()
+ );
+ e.is_match(&mut cache.backtrack, input)
+ } else {
+ trace!(
+ "using PikeVM for is-match search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.is_match(&mut cache.pikevm, input)
+ }
+ }
+
+ fn is_capture_search_needed(&self, slots_len: usize) -> bool {
+ slots_len > self.nfa.group_info().implicit_slot_len()
+ }
+}
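+
+// Worked example (a comment added for exposition, not in the upstream
+// file): the implicit slot length is two slots (start and end) per
+// pattern, so a three-pattern regex has 'implicit_slot_len() == 6'. A
+// caller passing a slots slice of length 6 or less is only asking for
+// overall match bounds, so 'is_capture_search_needed' returns false and
+// 'search_slots' below can take the faster DFA-only path.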
+
+impl Strategy for Core {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.nfa.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ Cache {
+ capmatches: Captures::all(self.group_info().clone()),
+ pikevm: self.pikevm.create_cache(),
+ backtrack: self.backtrack.create_cache(),
+ onepass: self.onepass.create_cache(),
+ hybrid: self.hybrid.create_cache(),
+ revhybrid: wrappers::ReverseHybridCache::none(),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ cache.pikevm.reset(&self.pikevm);
+ cache.backtrack.reset(&self.backtrack);
+ cache.onepass.reset(&self.onepass);
+ cache.hybrid.reset(&self.hybrid);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.as_ref().map_or(false, |pre| pre.is_fast())
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.info.memory_usage()
+ + self.pre.as_ref().map_or(0, |pre| pre.memory_usage())
+ + self.nfa.memory_usage()
+ + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage())
+ + self.onepass.memory_usage()
+ + self.dfa.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ // We manually inline try_search_mayfail here because letting the
+ // compiler do it seems to produce pretty crappy codegen.
+ return if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for full search at {:?}", input.get_span());
+ match e.try_search(input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("full DFA search failed: {}", _err);
+ self.search_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for full search at {:?}", input.get_span());
+ match e.try_search(&mut cache.hybrid, input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("lazy DFA search failed: {}", _err);
+ self.search_nofail(cache, input)
+ }
+ }
+ } else {
+ self.search_nofail(cache, input)
+ };
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ // The main difference with 'search' is that if we're using a DFA, we
+ // can use a single forward scan without needing to run the reverse
+ // DFA.
+ if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for half search at {:?}", input.get_span());
+ match e.try_search_half_fwd(input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("full DFA half search failed: {}", _err);
+ self.search_half_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for half search at {:?}", input.get_span());
+ match e.try_search_half_fwd(&mut cache.hybrid, input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("lazy DFA half search failed: {}", _err);
+ self.search_half_nofail(cache, input)
+ }
+ }
+ } else {
+ self.search_half_nofail(cache, input)
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if let Some(e) = self.dfa.get(input) {
+ trace!(
+ "using full DFA for is-match search at {:?}",
+ input.get_span()
+ );
+ match e.try_search_half_fwd(input) {
+ Ok(x) => x.is_some(),
+ Err(_err) => {
+ trace!("full DFA half search failed: {}", _err);
+ self.is_match_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!(
+ "using lazy DFA for is-match search at {:?}",
+ input.get_span()
+ );
+ match e.try_search_half_fwd(&mut cache.hybrid, input) {
+ Ok(x) => x.is_some(),
+ Err(_err) => {
+ trace!("lazy DFA half search failed: {}", _err);
+ self.is_match_nofail(cache, input)
+ }
+ }
+ } else {
+ self.is_match_nofail(cache, input)
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ // Even if the regex has explicit capture groups, if the caller didn't
+ // provide any explicit slots, then it doesn't make sense to try and do
+ // extra work to get offsets for those slots. Ideally the caller should
+ // realize this and not call this routine in the first place, but alas,
+ // we try to save the caller from themselves if they do.
+ if !self.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ // If the onepass DFA is available for this search (which only happens
+ // when it's anchored), then skip running a fallible DFA. The onepass
+ // DFA isn't as fast as a full or lazy DFA, but it is typically quite
+ // a bit faster than the backtracker or the PikeVM. So it isn't as
+ // advantageous to try and do a full/lazy DFA scan first.
+ //
+ // We still theorize that it's better to do a full/lazy DFA scan, even
+ // when it's anchored, because it's usually much faster and permits us
+ // to say "no match" much more quickly. This does hurt the case of,
+ // say, parsing each line in a log file into capture groups, because
+ // in that case, the line always matches. So the lazy DFA scan is
+ // usually just wasted work. But, the lazy DFA is usually quite fast
+ // and doesn't cost too much here.
+ if self.onepass.get(&input).is_some() {
+ return self.search_slots_nofail(cache, &input, slots);
+ }
+ let m = match self.try_search_mayfail(cache, input) {
+ Some(Ok(Some(m))) => m,
+ Some(Ok(None)) => return None,
+ Some(Err(_err)) => {
+ trace!("fast capture search failed: {}", _err);
+ return self.search_slots_nofail(cache, input, slots);
+ }
+ None => {
+ return self.search_slots_nofail(cache, input, slots);
+ }
+ };
+ // At this point, now that we've found the bounds of the
+ // match, we need to re-run something that can resolve
+ // capturing groups. But we only need to run it on the
+ // match bounds and not the entire haystack.
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ m.start(),
+ m.end(),
+ );
+ let input = input
+ .clone()
+ .span(m.start()..m.end())
+ .anchored(Anchored::Pattern(m.pattern()));
+ Some(
+ self.search_slots_nofail(cache, &input, slots)
+ .expect("should find a match"),
+ )
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if let Some(e) = self.dfa.get(input) {
+ trace!(
+ "using full DFA for overlapping search at {:?}",
+ input.get_span()
+ );
+ let _err = match e.try_which_overlapping_matches(input, patset) {
+ Ok(()) => return,
+ Err(err) => err,
+ };
+ trace!("fast overlapping search failed: {}", _err);
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!(
+ "using lazy DFA for overlapping search at {:?}",
+ input.get_span()
+ );
+ let _err = match e.try_which_overlapping_matches(
+ &mut cache.hybrid,
+ input,
+ patset,
+ ) {
+ Ok(()) => {
+ return;
+ }
+ Err(err) => err,
+ };
+ trace!("fast overlapping search failed: {}", _err);
+ }
+ trace!(
+ "using PikeVM for overlapping search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.which_overlapping_matches(&mut cache.pikevm, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseAnchored {
+ core: Core,
+}
+
+impl ReverseAnchored {
+ fn new(core: Core) -> Result<ReverseAnchored, Core> {
+ if !core.info.is_always_anchored_end() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ the regex is not always anchored at the end"
+ );
+ return Err(core);
+ }
+ // Note that the caller can still request an anchored search even when
+ // the regex isn't anchored at the start. We detect that case in the
+ // search routines below and just fallback to the core engine. This
+ // is fine because both searches are anchored. It's just a matter of
+ // picking one. Falling back to the core engine is a little simpler,
+ // since if we used the reverse anchored approach, we'd have to add an
+ // extra check to ensure the match reported starts at the place where
+ // the caller requested the search to start.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ the regex is also anchored at the start"
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ Ok(ReverseAnchored { core })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_anchored_rev(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ // We of course always want an anchored search. In theory, the
+ // underlying regex engines should automatically enable anchored
+ // searches since the regex is itself anchored, but this more clearly
+ // expresses intent and is always correct.
+ let input = input.clone().anchored(Anchored::Yes);
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse anchored search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_rev(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse anchored search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_rev(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseAnchored always has a DFA")
+ }
+ }
+}
+
+// Note that in this impl, we don't check that 'input.end() ==
+// input.haystack().len()'. In particular, when that condition is false, a
+// match is always impossible because we know that the regex is always anchored
+// at the end (or else 'ReverseAnchored' won't be built). We don't check that
+// here because the 'Regex' wrapper actually does that for us in all cases.
+// Thus, in this impl, we can actually assume that the end position in 'input'
+// is equivalent to the length of the haystack.
+impl Strategy for ReverseAnchored {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ self.core.create_cache()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ // Since this is anchored at the end, a reverse anchored search is
+ // almost certain to result in a much faster search than a standard
+ // forward search.
+ true
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+ Some(Match::new(hm.pattern(), hm.offset()..input.end()))
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+ // Careful here! 'search_half' is a *forward* search that
+ // only cares about the *end* position of a match. But
+ // 'hm.offset()' is actually the start of the match. So we
+ // actually just throw that away here and, since we know we
+ // have a match, return the only possible position at which a
+ // match can occur: input.end().
+ Some(HalfMatch::new(hm.pattern(), input.end()))
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_slots_nofail(cache, input, slots)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, skipping captures");
+ let m = Match::new(hm.pattern(), hm.offset()..input.end());
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let start = hm.offset();
+ let input = input
+ .clone()
+ .span(start..input.end())
+ .anchored(Anchored::Pattern(hm.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ // It seems like this could probably benefit from a reverse anchored
+ // optimization, perhaps by doing an overlapping reverse search (which
+ // the DFAs do support). I haven't given it much thought though, and
+ // I'm currently focused more on the single pattern case.
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseSuffix {
+ core: Core,
+ pre: Prefilter,
+}
+
+impl ReverseSuffix {
+ fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
+ if !core.info.config().get_auto_prefilter() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ automatic prefilters are disabled"
+ );
+ return Err(core);
+ }
+ // Like the reverse inner optimization, we don't do this for regexes
+ // that are always anchored. It could lead to scanning too much, but
+ // could say "no match" much more quickly than running the regex
+ // engine if the initial literal scan doesn't match. With that said,
+ // the reverse suffix optimization has lower overhead, since it only
+ // requires a reverse scan after a literal match to confirm or reject
+ // the match. (Although, in the case of confirmation, it then needs to
+ // do another forward scan to find the end position.)
+ //
+ // Note that the caller can still request an anchored search even
+ // when the regex isn't anchored. We detect that case in the search
+ // routines below and just fallback to the core engine. Currently this
+ // optimization assumes all searches are unanchored, so if we do want
+ // to enable this optimization for anchored searches, it will need a
+ // little work to support it.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ the regex is always anchored at the start",
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
+ debug!(
+ "skipping reverse suffix optimization because \
+ we already have a prefilter that we think is fast"
+ );
+ return Err(core);
+ }
+ let kind = core.info.config().get_match_kind();
+ let suffixes = crate::util::prefilter::suffixes(kind, hirs);
+ let lcs = match suffixes.longest_common_suffix() {
+ None => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ a longest common suffix could not be found",
+ );
+ return Err(core);
+ }
+ Some(lcs) if lcs.is_empty() => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ the longest common suffix is the empty string",
+ );
+ return Err(core);
+ }
+ Some(lcs) => lcs,
+ };
+ let pre = match Prefilter::new(kind, &[lcs]) {
+ Some(pre) => pre,
+ None => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ a prefilter could not be constructed from the \
+ longest common suffix",
+ );
+ return Err(core);
+ }
+ };
+ if !pre.is_fast() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ while we have a suffix prefilter, it is not \
+ believed to be 'fast'"
+ );
+ return Err(core);
+ }
+ Ok(ReverseSuffix { core, pre })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_start(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ let mut span = input.get_span();
+ let mut min_start = 0;
+ loop {
+ let litmatch = match self.pre.find(input.haystack(), span) {
+ None => return Ok(None),
+ Some(span) => span,
+ };
+ trace!("reverse suffix scan found suffix match at {:?}", litmatch);
+ let revinput = input
+ .clone()
+ .anchored(Anchored::Yes)
+ .span(input.start()..litmatch.end);
+ match self
+ .try_search_half_rev_limited(cache, &revinput, min_start)?
+ {
+ None => {
+ if span.start >= span.end {
+ break;
+ }
+ span.start = litmatch.start.checked_add(1).unwrap();
+ }
+ Some(hm) => return Ok(Some(hm)),
+ }
+ min_start = litmatch.end;
+ }
+ Ok(None)
+ }
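+
+ // Illustrative walk-through (a comment added for exposition, not in the
+ // upstream file): for /[a-z]+ing/ against 'tingling', the suffix
+ // prefilter finds 'ing' at 1..4 and the anchored reverse scan over 0..4
+ // matches 'ting', yielding a start-of-match candidate at offset 0. The
+ // callers of 'try_search_half_start' then run a forward scan from 0 to
+ // find the true (greedy) end at 8. The 'min_start' bound stops each
+ // reverse scan where the previous literal match ended; if a scan would
+ // have to go further back, a retry error is returned and the core
+ // engine takes over, which is what avoids quadratic behavior.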
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_fwd(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for forward reverse suffix search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for forward reverse suffix search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseSuffix always has a DFA")
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_rev_limited(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse suffix search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&input, min_start)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start)
+ } else {
+ unreachable!("ReverseSuffix always has a DFA")
+ }
+ }
+}
+
+impl Strategy for ReverseSuffix {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ self.core.create_cache()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage() + self.pre.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix optimization failed: {}", _err);
+ self.core.search(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse suffix reverse fast search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm_start)) => {
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd(cache, &fwdinput) {
+ Err(_err) => {
+ trace!(
+ "reverse suffix forward fast search failed: {}",
+ _err
+ );
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => {
+ unreachable!(
+ "suffix match plus reverse match implies \
+ there must be a match",
+ )
+ }
+ Ok(Some(hm_end)) => Some(Match::new(
+ hm_start.pattern(),
+ hm_start.offset()..hm_end.offset(),
+ )),
+ }
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix half optimization failed: {}", _err);
+ self.core.search_half(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast half search failed: {}",
+ _err
+ );
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm_start)) => {
+ // This is a bit subtle. It is tempting to just stop searching
+ // at this point and return a half-match with an offset
+ // corresponding to where the suffix was found. But the suffix
+ // match does not necessarily correspond to the end of the
+ // proper leftmost-first match. Consider /[a-z]+ing/ against
+ // 'tingling'. The first suffix match is the first 'ing', and
+ // the /[a-z]+/ matches the 't'. So if we stopped here, then
+ // we'd report 'ting' as the match. But 'tingling' is the
+ // correct match because of greediness.
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd(cache, &fwdinput) {
+ Err(_err) => {
+ trace!(
+ "reverse suffix forward fast search failed: {}",
+ _err
+ );
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => {
+ unreachable!(
+ "suffix match plus reverse match implies \
+ there must be a match",
+ )
+ }
+ Ok(Some(hm_end)) => Some(hm_end),
+ }
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix half optimization failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast half search failed: {}",
+ _err
+ );
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let hm_start = match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!(
+ "reverse suffix captures optimization failed: {}",
+ _err
+ );
+ return self.core.search_slots(cache, input, slots);
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast captures search failed: {}",
+ _err
+ );
+ return self.core.search_slots_nofail(cache, input, slots);
+ }
+ Ok(None) => return None,
+ Ok(Some(hm_start)) => hm_start,
+ };
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ hm_start.offset(),
+ input.end(),
+ );
+ let start = hm_start.offset();
+ let input = input
+ .clone()
+ .span(start..input.end())
+ .anchored(Anchored::Pattern(hm_start.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseInner {
+ core: Core,
+ preinner: Prefilter,
+ nfarev: NFA,
+ hybrid: wrappers::ReverseHybrid,
+ dfa: wrappers::ReverseDFA,
+}
+
+impl ReverseInner {
+ fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> {
+ if !core.info.config().get_auto_prefilter() {
+ debug!(
+ "skipping reverse inner optimization because \
+ automatic prefilters are disabled"
+ );
+ return Err(core);
+ }
+ // Currently we hard-code the assumption of leftmost-first match
+ // semantics. This isn't a huge deal because 'all' semantics tend to
+ // only be used for forward overlapping searches with multiple regexes,
+ // and this optimization only supports a single pattern at the moment.
+ if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
+ debug!(
+ "skipping reverse inner optimization because \
+ match kind is {:?} but this only supports leftmost-first",
+ core.info.config().get_match_kind(),
+ );
+ return Err(core);
+ }
+ // It's likely that a reverse inner scan has too much overhead for it
+ // to be worth it when the regex is anchored at the start. It is
+ // possible for it to be quite a bit faster if the initial literal
+ // scan fails to detect a match, in which case, we can say "no match"
+ // very quickly. But this could be undesirable, e.g., scanning too far
+ // or when the literal scan matches. If it matches, then confirming the
+ // match requires a reverse scan followed by a forward scan to confirm
+ // or reject, which is a fair bit of work.
+ //
+ // Note that the caller can still request an anchored search even
+ // when the regex isn't anchored. We detect that case in the search
+ // routines below and just fallback to the core engine. Currently this
+ // optimization assumes all searches are unanchored, so if we do want
+ // to enable this optimization for anchored searches, it will need a
+ // little work to support it.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse inner optimization because \
+ the regex is always anchored at the start",
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse inner optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
+ debug!(
+ "skipping reverse inner optimization because \
+ we already have a prefilter that we think is fast"
+ );
+ return Err(core);
+ } else if core.pre.is_some() {
+ debug!(
+ "core engine has a prefix prefilter, but it is \
+ probably not fast, so continuing with attempt to \
+ use reverse inner prefilter"
+ );
+ }
+ let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
+ Some(x) => x,
+ // N.B. the 'extract' function emits debug messages explaining
+ // why we bailed out here.
+ None => return Err(core),
+ };
+ debug!("building reverse NFA for prefix before inner literal");
+ let mut lookm = LookMatcher::new();
+ lookm.set_line_terminator(core.info.config().get_line_terminator());
+ let thompson_config = thompson::Config::new()
+ .reverse(true)
+ .utf8(core.info.config().get_utf8_empty())
+ .nfa_size_limit(core.info.config().get_nfa_size_limit())
+ .shrink(false)
+ .which_captures(WhichCaptures::None)
+ .look_matcher(lookm);
+ let result = thompson::Compiler::new()
+ .configure(thompson_config)
+ .build_from_hir(&concat_prefix);
+ let nfarev = match result {
+ Ok(nfarev) => nfarev,
+ Err(_err) => {
+ debug!(
+ "skipping reverse inner optimization because the \
+ reverse NFA failed to build: {}",
+ _err,
+ );
+ return Err(core);
+ }
+ };
+ debug!("building reverse DFA for prefix before inner literal");
+ let dfa = if !core.info.config().get_dfa() {
+ wrappers::ReverseDFA::none()
+ } else {
+ wrappers::ReverseDFA::new(&core.info, &nfarev)
+ };
+ let hybrid = if !core.info.config().get_hybrid() {
+ wrappers::ReverseHybrid::none()
+ } else if dfa.is_some() {
+ debug!(
+ "skipping lazy DFA for reverse inner optimization \
+ because we have a full DFA"
+ );
+ wrappers::ReverseHybrid::none()
+ } else {
+ wrappers::ReverseHybrid::new(&core.info, &nfarev)
+ };
+ Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_full(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, RetryError> {
+ let mut span = input.get_span();
+ let mut min_match_start = 0;
+ let mut min_pre_start = 0;
+ loop {
+ let litmatch = match self.preinner.find(input.haystack(), span) {
+ None => return Ok(None),
+ Some(span) => span,
+ };
+ if litmatch.start < min_pre_start {
+ trace!(
+ "found inner prefilter match at {:?}, which starts \
+ before the end of the last forward scan at {}, \
+ quitting to avoid quadratic behavior",
+ litmatch,
+ min_pre_start,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ trace!("reverse inner scan found inner match at {:?}", litmatch);
+ let revinput = input
+ .clone()
+ .anchored(Anchored::Yes)
+ .span(input.start()..litmatch.start);
+ // Note that in addition to the literal search above scanning past
+ // our minimum start point, this routine can also return an error
+ // as a result of detecting possible quadratic behavior if the
+ // reverse scan goes past the minimum start point. That is, the
+ // literal search might not, but the reverse regex search for the
+ // prefix might!
+ match self.try_search_half_rev_limited(
+ cache,
+ &revinput,
+ min_match_start,
+ )? {
+ None => {
+ if span.start >= span.end {
+ break;
+ }
+ span.start = litmatch.start.checked_add(1).unwrap();
+ }
+ Some(hm_start) => {
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
+ Err(stopat) => {
+ min_pre_start = stopat;
+ span.start =
+ litmatch.start.checked_add(1).unwrap();
+ }
+ Ok(hm_end) => {
+ return Ok(Some(Match::new(
+ hm_start.pattern(),
+ hm_start.offset()..hm_end.offset(),
+ )))
+ }
+ }
+ }
+ }
+ min_match_start = litmatch.end;
+ }
+ Ok(None)
+ }
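+
+ // Illustrative walk-through (a comment added for exposition, not in the
+ // upstream file) of the loop above for a pattern like '[a-z]+foo[0-9]+':
+ // the inner prefilter finds 'foo', a bounded reverse scan matches the
+ // '[a-z]+' prefix to find the start, and a forward scan from that start
+ // finds the end. When a candidate fails, the literal search resumes just
+ // past it, and 'min_match_start'/'min_pre_start' detect any attempt to
+ // rescan ground already covered; in that case the routine gives up with
+ // a "quadratic" retry error and the caller falls back to the core
+ // engine.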
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_fwd_stopat(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for forward reverse inner search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd_stopat(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for forward reverse inner search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd_stopat(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseInner always has a DFA")
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_rev_limited(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ if let Some(e) = self.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&input, min_start)
+ } else if let Some(e) = self.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(
+ &mut cache.revhybrid,
+ &input,
+ min_start,
+ )
+ } else {
+ unreachable!("ReverseInner always has a DFA")
+ }
+ }
+}
+
+impl Strategy for ReverseInner {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ let mut cache = self.core.create_cache();
+ cache.revhybrid = self.hybrid.create_cache();
+ cache
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ cache.revhybrid.reset(&self.hybrid);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.preinner.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage()
+ + self.preinner.memory_usage()
+ + self.nfarev.memory_usage()
+ + self.dfa.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner optimization failed: {}", _err);
+ self.core.search(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(matornot) => matornot,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner half optimization failed: {}", _err);
+ self.core.search_half(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast half search failed: {}", _err);
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner half optimization failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast half search failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let m = match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner captures optimization failed: {}", _err);
+ return self.core.search_slots(cache, input, slots);
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast captures search failed: {}", _err);
+ return self.core.search_slots_nofail(cache, input, slots);
+ }
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ m.start(),
+ m.end(),
+ );
+ let input = input
+ .clone()
+ .span(m.start()..m.end())
+ .anchored(Anchored::Pattern(m.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+/// Copies the offsets in the given match to the corresponding positions in
+/// `slots`.
+///
+/// In effect, this sets the slots corresponding to the implicit group for the
+/// pattern in the given match. If the indices for the corresponding slots do
+/// not exist, then no slots are set.
+///
+/// This is useful when the caller provides slots (or captures), but you use a
+/// regex engine that doesn't operate on slots (like a lazy DFA). This function
+/// lets you map the match you get back to the slots provided by the caller.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
+ let slot_start = m.pattern().as_usize() * 2;
+ let slot_end = slot_start + 1;
+ if let Some(slot) = slots.get_mut(slot_start) {
+ *slot = NonMaxUsize::new(m.start());
+ }
+ if let Some(slot) = slots.get_mut(slot_end) {
+ *slot = NonMaxUsize::new(m.end());
+ }
+}
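+
+// Worked example (a comment added for exposition, not in the upstream
+// file): for a match of pattern 2 spanning offsets 3..7, the implicit
+// group's slots live at indices 4 and 5:
+//
+//     let mut slots = vec![None; 6];
+//     let m = Match::new(PatternID::must(2), 3..7);
+//     copy_match_to_slots(m, &mut slots);
+//     assert_eq!(slots[4], NonMaxUsize::new(3));
+//     assert_eq!(slots[5], NonMaxUsize::new(7));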