use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

use alloc::sync::Arc;

use regex_syntax::hir::{literal, Hir};

use crate::{
    meta::{
        error::{BuildError, RetryError, RetryFailError, RetryQuadraticError},
        regex::{Cache, RegexInfo},
        reverse_inner, wrappers,
    },
    nfa::thompson::{self, WhichCaptures, NFA},
    util::{
        captures::{Captures, GroupInfo},
        look::LookMatcher,
        prefilter::{self, Prefilter, PrefilterI},
        primitives::{NonMaxUsize, PatternID},
        search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet},
    },
};

/// A trait that represents a single meta strategy. Its main utility is in
/// providing a way to do dynamic dispatch over a few choices.
///
/// Why dynamic dispatch? I actually don't have a super compelling reason,
/// and importantly, I have not benchmarked it against the main alternative:
/// an enum. I went with dynamic dispatch initially because the regex engine
/// search code really can't be inlined into caller code in most cases
/// because it's just too big. In other words, it is already expected that
/// every regex search will entail at least the cost of a function call.
///
/// I do wonder whether using enums would result in better codegen overall
/// though. It's a worthwhile experiment to try. Probably the most
/// interesting benchmark to run in such a case would be one with a high
/// match count. That is, a benchmark to test the overall latency of a
/// search call.
pub(super) trait Strategy:
    Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
{
    fn group_info(&self) -> &GroupInfo;

    fn create_cache(&self) -> Cache;

    fn reset_cache(&self, cache: &mut Cache);

    fn is_accelerated(&self) -> bool;

    fn memory_usage(&self) -> usize;

    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>;

    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch>;

    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;

    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID>;

    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    );
}
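// A rough sketch of how the meta regex engine is expected to drive a
// `Strategy` trait object (illustrative only; the real call sites live in
// `meta::regex`):
//
//     let strat: Arc<dyn Strategy> = new(&info, &hirs)?;
//     let mut cache = strat.create_cache();
//     let input = Input::new("haystack");
//     if let Some(m) = strat.search(&mut cache, &input) {
//         // `m` is the leftmost match, if any.
//     }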
pub(super) fn new(
    info: &RegexInfo,
    hirs: &[&Hir],
) -> Result<Arc<dyn Strategy>, BuildError> {
    // At this point, we're committed to a regex engine of some kind. So pull
    // out a prefilter if we can, which will feed to each of the constituent
    // regex engines.
    let pre = if info.is_always_anchored_start() {
        // PERF: I'm not sure we necessarily want to do this... We may want
        // to run a prefilter for quickly rejecting in some cases. The
        // problem is that anchored searches overlap quite a bit with the use
        // case of "run a regex on every line to extract data." In that case,
        // the regex always matches, so running a prefilter doesn't really
        // help us there. The main place where a prefilter helps in an
        // anchored search is if the anchored search is not expected to match
        // frequently. That is, the prefilter gives us a way to possibly
        // reject a haystack very quickly.
        //
        // Maybe we should use a prefilter, but only for longer haystacks?
        // Or maybe we should only use a prefilter when we think it's "fast"?
        //
        // Interestingly, I think we currently lack the infrastructure for
        // disabling a prefilter based on haystack length. That would
        // probably need to be a new 'Input' option. (Interestingly, an
        // 'Input' used to carry a 'Prefilter' with it, but I moved away from
        // that.)
        debug!("skipping literal extraction since regex is anchored");
        None
    } else if let Some(pre) = info.config().get_prefilter() {
        debug!(
            "skipping literal extraction since the caller provided a prefilter"
        );
        Some(pre.clone())
    } else if info.config().get_auto_prefilter() {
        let kind = info.config().get_match_kind();
        let prefixes = crate::util::prefilter::prefixes(kind, hirs);
        // If we can build a full `Strategy` from just the extracted
        // prefixes, then we can short-circuit and avoid building a regex
        // engine at all.
        if let Some(pre) = Pre::from_prefixes(info, &prefixes) {
            debug!(
                "found that the regex can be broken down to a literal \
                 search, avoiding the regex engine entirely",
            );
            return Ok(pre);
        }
        // This now attempts another short-circuit of the regex engine: if
        // we have a huge alternation of just plain literals, then we can
        // just use Aho-Corasick for that and avoid the regex engine
        // entirely.
        //
        // You might think this case would just be handled by
        // `Pre::from_prefixes`, but that technique relies on heuristic
        // literal extraction from the corresponding `Hir`. That works, but
        // part of those heuristics limits the size and number of literals
        // returned. This case will specifically handle patterns with very
        // large alternations.
        //
        // One wonders if we should just roll this into our heuristic literal
        // extraction, and then I think this case could disappear entirely.
        if let Some(pre) = Pre::from_alternation_literals(info, hirs) {
            debug!(
                "found plain alternation of literals, \
                 avoiding regex engine entirely and using Aho-Corasick"
            );
            return Ok(pre);
        }
        prefixes.literals().and_then(|strings| {
            debug!(
                "creating prefilter from {} literals: {:?}",
                strings.len(),
                strings,
            );
            Prefilter::new(kind, strings)
        })
    } else {
        debug!("skipping literal extraction since prefilters were disabled");
        None
    };
    let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
    // Now that we have our core regex engines built, there are a few cases
    // where we can do a little bit better than just a normal "search forward
    // and maybe use a prefilter when in a start state." However, these cases
    // may not always work or otherwise build on top of the Core searcher.
    // For example, the reverse anchored optimization seems like it might
    // always work, but only the DFAs support reverse searching and the DFAs
    // might give up or quit for various reasons. If we had, e.g., a PikeVM
    // that supported reverse searching, then we could avoid building a full
    // Core engine for this case.
    core = match ReverseAnchored::new(core) {
        Err(core) => core,
        Ok(ra) => {
            debug!("using reverse anchored strategy");
            return Ok(Arc::new(ra));
        }
    };
    core = match ReverseSuffix::new(core, hirs) {
        Err(core) => core,
        Ok(rs) => {
            debug!("using reverse suffix strategy");
            return Ok(Arc::new(rs));
        }
    };
    core = match ReverseInner::new(core, hirs) {
        Err(core) => core,
        Ok(ri) => {
            debug!("using reverse inner strategy");
            return Ok(Arc::new(ri));
        }
    };
    debug!("using core strategy");
    Ok(Arc::new(core))
}
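// For context, `Prefilter` above is this crate's public prefilter type. A
// minimal sketch of building and using one directly, assuming the relevant
// `perf-literal` features are enabled (illustrative, not what this module
// literally does):
//
//     use regex_automata::{util::prefilter::Prefilter, MatchKind, Span};
//
//     let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"])
//         .expect("a prefilter should be buildable from these literals");
//     let hay = "xxx bar xxx".as_bytes();
//     let got = pre.find(hay, Span::from(0..hay.len()));
//     assert_eq!(got, Some(Span::from(4..7)));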
#[derive(Clone, Debug)]
struct Pre<P> {
    pre: P,
    group_info: GroupInfo,
}

impl<P: PrefilterI> Pre<P> {
    fn new(pre: P) -> Arc<Pre<P>> {
        // The only thing we support when we use prefilters directly as a
        // strategy is the start and end of the overall match for a single
        // pattern. In other words, exactly one implicit capturing group.
        // Which is exactly what we use here for a GroupInfo.
        let group_info = GroupInfo::new([[None::<&str>]]).unwrap();
        Arc::new(Pre { pre, group_info })
    }
}

// This is a little weird, but we don't actually care about the type
// parameter here because we're selecting which underlying prefilter to use.
// So we just define it on an arbitrary type.
impl Pre<()> {
    /// Given a sequence of prefixes, attempt to return a full `Strategy`
    /// using just the prefixes.
    ///
    /// Basically, this occurs when the prefixes given are not just prefixes,
    /// but an enumeration of the entire language matched by the regular
    /// expression.
    ///
    /// A number of other conditions need to be true too. For example, there
    /// can be only one pattern, the number of explicit capture groups is 0,
    /// no look-around assertions and so on.
    ///
    /// Note that this ignores `Config::get_auto_prefilter` because if this
    /// returns something, then it isn't a prefilter but a matcher itself.
    /// Therefore, it shouldn't suffer from the problems typical to
    /// prefilters (such as a high false positive rate).
    fn from_prefixes(
        info: &RegexInfo,
        prefixes: &literal::Seq,
    ) -> Option<Arc<dyn Strategy>> {
        let kind = info.config().get_match_kind();
        // Check to see if our prefixes are exact, which means we might be
        // able to bypass the regex engine entirely and just rely on literal
        // searches.
        if !prefixes.is_exact() {
            return None;
        }
        // We also require that we have a single regex pattern. Namely,
        // we reuse the prefilter infrastructure to implement search and
        // prefilters only report spans. Prefilters don't know about pattern
        // IDs. The multi-regex case isn't a lost cause, we might still use
        // Aho-Corasick and we might still just use a regular prefilter, but
        // that's done below.
        if info.pattern_len() != 1 {
            return None;
        }
        // We can't have any capture groups either. The literal engines don't
        // know how to deal with things like '(foo)(bar)'. In that case, a
        // prefilter will just be used and then the regex engine will resolve
        // the capture groups.
        if info.props()[0].explicit_captures_len() != 0 {
            return None;
        }
        // We also require that it has zero look-around assertions. Namely,
        // literal extraction treats look-around assertions as if they match
        // *every* empty string. But of course, that isn't true. So for
        // example, 'foo\bquux' never matches anything, but 'fooquux' is
        // extracted from that as an exact literal. Such cases should just
        // run the regex engine. 'fooquux' will be used as a normal
        // prefilter, and then the regex engine will try to look for an
        // actual match.
        if !info.props()[0].look_set().is_empty() {
            return None;
        }
        // Finally, currently, our prefilters are all oriented around
        // leftmost-first match semantics, so don't try to use them if the
        // caller asked for anything else.
        if kind != MatchKind::LeftmostFirst {
            return None;
        }
        // The above seems like a lot of requirements to meet, but it applies
        // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet
        // the above criteria, for example.
        //
        // Note that this is effectively a latency optimization. If we didn't
        // do this, then the extracted literals would still get bundled into
        // a prefilter, and every regex engine capable of running unanchored
        // searches supports prefilters. So this optimization merely
        // sidesteps having to run the regex engine at all to confirm the
        // match. Thus, it decreases the latency of a match.

        // OK because we know the set is exact and thus finite.
        let prefixes = prefixes.literals().unwrap();
        debug!(
            "trying to bypass regex engine by creating \
             prefilter from {} literals: {:?}",
            prefixes.len(),
            prefixes,
        );
        let choice = match prefilter::Choice::new(kind, prefixes) {
            Some(choice) => choice,
            None => {
                debug!(
                    "regex bypass failed because no prefilter could be built"
                );
                return None;
            }
        };
        let strat: Arc<dyn Strategy> = match choice {
            prefilter::Choice::Memchr(pre) => Pre::new(pre),
            prefilter::Choice::Memchr2(pre) => Pre::new(pre),
            prefilter::Choice::Memchr3(pre) => Pre::new(pre),
            prefilter::Choice::Memmem(pre) => Pre::new(pre),
            prefilter::Choice::Teddy(pre) => Pre::new(pre),
            prefilter::Choice::ByteSet(pre) => Pre::new(pre),
            prefilter::Choice::AhoCorasick(pre) => Pre::new(pre),
        };
        Some(strat)
    }

    /// Attempts to extract an alternation of literals, and if it's deemed
    /// worth doing, returns an Aho-Corasick prefilter as a strategy.
    ///
    /// And currently, this only returns something when 'hirs.len() == 1'.
    /// This could in theory do something if there are multiple HIRs where
    /// all of them are alternations of literals, but I haven't had the time
    /// to go down that path yet.
    fn from_alternation_literals(
        info: &RegexInfo,
        hirs: &[&Hir],
    ) -> Option<Arc<dyn Strategy>> {
        use crate::util::prefilter::AhoCorasick;

        let lits = crate::meta::literal::alternation_literals(info, hirs)?;
        let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?;
        Some(Pre::new(ac))
    }
}

// This implements Strategy for anything that implements PrefilterI.
//
// Note that this must only be used for regexes of length 1. Multi-regexes
// don't work here. The prefilter interface only provides the span of a match
// and not the pattern ID. (I did consider making it more expressive, but I
// couldn't figure out how to tie everything together elegantly.) Thus, so
// long as the regex only contains one pattern, we can simply assume that a
// match corresponds to PatternID::ZERO. And indeed, that's what we do here.
//
// In practice, since this impl is used to report matches directly and thus
// completely bypasses the regex engine, we only wind up using this under the
// following restrictions:
//
// * There must be only one pattern. As explained above.
// * The literal sequence must be finite and only contain exact literals.
// * There must not be any look-around assertions. If there are, the literals
//   extracted might be exact, but a match doesn't necessarily imply an
//   overall match. As a trivial example, 'foo\bbar' does not match 'foobar'.
// * The pattern must not have any explicit capturing groups. If it does, the
//   caller might expect them to be resolved. e.g., 'foo(bar)'.
//
// So when all of those things are true, we use a prefilter directly as a
// strategy.
//
// In the case where the number of patterns is more than 1, we don't use this
// but do use a special Aho-Corasick strategy if all of the regexes are just
// simple literals or alternations of literals. (We also use the Aho-Corasick
// strategy when len(patterns)==1 if the number of literals is large. In that
// case, literal extraction gives up and will return an infinite set.)
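// As a point of reference, the internal AhoCorasick prefilter wraps the
// `aho-corasick` crate. A rough sketch of the analogous public API, under
// the same leftmost-first semantics used here:
//
//     use aho_corasick::{AhoCorasick, MatchKind};
//
//     let ac = AhoCorasick::builder()
//         .match_kind(MatchKind::LeftmostFirst)
//         .build(["samwise", "sam"])
//         .unwrap();
//     let m = ac.find("samwise").unwrap();
//     // Leftmost-first prefers the pattern listed first: "samwise".
//     assert_eq!(m.pattern().as_usize(), 0);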
impl<P: PrefilterI> Strategy for Pre<P> {
    fn group_info(&self) -> &GroupInfo {
        &self.group_info
    }

    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: wrappers::PikeVMCache::none(),
            backtrack: wrappers::BoundedBacktrackerCache::none(),
            onepass: wrappers::OnePassCache::none(),
            hybrid: wrappers::HybridCache::none(),
            revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    fn reset_cache(&self, _cache: &mut Cache) {}

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.pre.memory_usage()
    }

    fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.is_done() {
            return None;
        }
        if input.get_anchored().is_anchored() {
            return self
                .pre
                .prefix(input.haystack(), input.get_span())
                .map(|sp| Match::new(PatternID::ZERO, sp));
        }
        self.pre
            .find(input.haystack(), input.get_span())
            .map(|sp| Match::new(PatternID::ZERO, sp))
    }

    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
    }

    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        self.search(cache, input).is_some()
    }

    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        let m = self.search(cache, input)?;
        if let Some(slot) = slots.get_mut(0) {
            *slot = NonMaxUsize::new(m.start());
        }
        if let Some(slot) = slots.get_mut(1) {
            *slot = NonMaxUsize::new(m.end());
        }
        Some(m.pattern())
    }

    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if self.search(cache, input).is_some() {
            patset.insert(PatternID::ZERO);
        }
    }
}
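// A small worked example of the slot convention used by `search_slots`
// above (just for orientation): for the single pattern supported here, the
// implicit group occupies slots 0 and 1. So a match at 4..7 is reported as:
//
//     slots[0] = NonMaxUsize::new(4);  // start of the overall match
//     slots[1] = NonMaxUsize::new(7);  // end of the overall match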
#[derive(Debug)]
struct Core {
    info: RegexInfo,
    pre: Option<Prefilter>,
    nfa: NFA,
    nfarev: Option<NFA>,
    pikevm: wrappers::PikeVM,
    backtrack: wrappers::BoundedBacktracker,
    onepass: wrappers::OnePass,
    hybrid: wrappers::Hybrid,
    dfa: wrappers::DFA,
}

impl Core {
    fn new(
        info: RegexInfo,
        pre: Option<Prefilter>,
        hirs: &[&Hir],
    ) -> Result<Core, BuildError> {
        let mut lookm = LookMatcher::new();
        lookm.set_line_terminator(info.config().get_line_terminator());
        let thompson_config = thompson::Config::new()
            .utf8(info.config().get_utf8_empty())
            .nfa_size_limit(info.config().get_nfa_size_limit())
            .shrink(false)
            .which_captures(info.config().get_which_captures())
            .look_matcher(lookm);
        let nfa = thompson::Compiler::new()
            .configure(thompson_config.clone())
            .build_many_from_hir(hirs)
            .map_err(BuildError::nfa)?;
        // It's possible for the PikeVM or the BB to fail to build, even
        // though at this point, we already have a full NFA in hand. They can
        // fail when a Unicode word boundary is used but where Unicode word
        // boundary support is disabled at compile time, thus making it
        // impossible to match. (Construction can also fail if the NFA was
        // compiled without captures, but we always enable that above.)
        let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
        let backtrack =
            wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
        // The onepass engine can of course fail to build, but we expect it
        // to fail in many cases because it is an optimization that doesn't
        // apply to all regexes. The 'OnePass' wrapper encapsulates this
        // failure (and logs a message if it occurs).
        let onepass = wrappers::OnePass::new(&info, &nfa);
        // We try to encapsulate whether a particular regex engine should be
        // used within each respective wrapper, but the DFAs need a reverse
        // NFA to build themselves, and we really do not want to build a
        // reverse NFA if we know we aren't going to use the lazy DFA. So we
        // do a config check up front, which is in practice the only way we
        // won't try to use the DFA.
        let (nfarev, hybrid, dfa) =
            if !info.config().get_hybrid() && !info.config().get_dfa() {
                (None, wrappers::Hybrid::none(), wrappers::DFA::none())
            } else {
                // FIXME: Technically, we don't quite yet KNOW that we need
                // a reverse NFA. It's possible for the DFAs below to both
                // fail to build just based on the forward NFA. In which
                // case, building the reverse NFA was totally wasted work.
                // But... fixing this requires breaking DFA construction
                // apart into two pieces: one for the forward part and
                // another for the reverse part. Quite annoying. Making it
                // worse, when building both DFAs fails, it's quite likely
                // that the NFA is large and that it will take quite some
                // time to build the reverse NFA too. So... it's really
                // probably worth it to do this!
                let nfarev = thompson::Compiler::new()
                    // Currently, reverse NFAs don't support capturing
                    // groups, so we MUST disable them. But even if we didn't
                    // have to, we would, because nothing in this crate does
                    // anything useful with capturing groups in reverse. And
                    // of course, the lazy DFA ignores capturing groups in
                    // all cases.
                    .configure(
                        thompson_config
                            .clone()
                            .which_captures(WhichCaptures::None)
                            .reverse(true),
                    )
                    .build_many_from_hir(hirs)
                    .map_err(BuildError::nfa)?;
                let dfa = if !info.config().get_dfa() {
                    wrappers::DFA::none()
                } else {
                    wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
                };
                let hybrid = if !info.config().get_hybrid() {
                    wrappers::Hybrid::none()
                } else if dfa.is_some() {
                    debug!("skipping lazy DFA because we have a full DFA");
                    wrappers::Hybrid::none()
                } else {
                    wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
                };
                (Some(nfarev), hybrid, dfa)
            };
        Ok(Core {
            info,
            pre,
            nfa,
            nfarev,
            pikevm,
            backtrack,
            onepass,
            hybrid,
            dfa,
        })
    }
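    // For reference, the reverse NFA built above corresponds to the public
    // Thompson compiler API, roughly like so (a sketch, not what this code
    // literally calls):
    //
    //     use regex_automata::nfa::thompson::{Compiler, Config, WhichCaptures};
    //
    //     let nfarev = Compiler::new()
    //         .configure(
    //             Config::new()
    //                 .which_captures(WhichCaptures::None)
    //                 .reverse(true),
    //         )
    //         .build(r"foo\w+")?;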
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_mayfail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Result<Option<Match>, RetryFailError>> {
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for search at {:?}", input.get_span());
            Some(e.try_search(input))
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for search at {:?}", input.get_span());
            Some(e.try_search(&mut cache.hybrid, input))
        } else {
            None
        }
    }

    fn search_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Match> {
        let caps = &mut cache.capmatches;
        caps.set_pattern(None);
        // We manually inline 'try_search_slots_nofail' here because we need
        // to borrow from 'cache.capmatches' in this method, but if we do,
        // then we can't pass 'cache' wholesale to 'search_slots_nofail'.
        // It's a classic example of how the borrow checker inhibits
        // decomposition. There are of course work-arounds (more types and/or
        // interior mutability), but that's more annoying than this IMO.
        let pid = if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, caps.slots_mut())
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
        } else {
            trace!("using PikeVM for search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
        };
        caps.set_pattern(pid);
        caps.get_match()
    }

    fn search_half_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // Only the lazy/full DFA returns half-matches, since the DFA
        // requires a reverse scan to find the start position. These fallback
        // regex engines can find the start and end in a single pass, so we
        // just do that and throw away the start offset to conform to the
        // API.
        let m = self.search_nofail(cache, input)?;
        Some(HalfMatch::new(m.pattern(), m.end()))
    }

    fn search_slots_nofail(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.onepass, input, slots)
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for capture search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.backtrack, input, slots)
        } else {
            trace!(
                "using PikeVM for capture search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, slots)
        }
    }

    fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(ref e) = self.onepass.get(input) {
            trace!(
                "using OnePass for is-match search at {:?}",
                input.get_span()
            );
            e.search_slots(&mut cache.onepass, input, &mut []).is_some()
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!(
                "using BoundedBacktracker for is-match search at {:?}",
                input.get_span()
            );
            e.is_match(&mut cache.backtrack, input)
        } else {
            trace!(
                "using PikeVM for is-match search at {:?}",
                input.get_span()
            );
            let e = self.pikevm.get();
            e.is_match(&mut cache.pikevm, input)
        }
    }

    fn is_capture_search_needed(&self, slots_len: usize) -> bool {
        slots_len > self.nfa.group_info().implicit_slot_len()
    }
}
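// A quick worked example of `is_capture_search_needed`: for a single
// pattern like '(a)(b)', the group info has one implicit group plus two
// explicit ones, so there are 6 slots total while `implicit_slot_len()` is
// 2. A caller passing all 6 slots needs a real capture search; a caller
// passing only 2 (or 0) can be served by a plain match search.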
impl Strategy for Core {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.nfa.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: self.pikevm.create_cache(),
            backtrack: self.backtrack.create_cache(),
            onepass: self.onepass.create_cache(),
            hybrid: self.hybrid.create_cache(),
            revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        cache.pikevm.reset(&self.pikevm);
        cache.backtrack.reset(&self.backtrack);
        cache.onepass.reset(&self.onepass);
        cache.hybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.as_ref().map_or(false, |pre| pre.is_fast())
    }

    fn memory_usage(&self) -> usize {
        self.info.memory_usage()
            + self.pre.as_ref().map_or(0, |pre| pre.memory_usage())
            + self.nfa.memory_usage()
            + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage())
            + self.onepass.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        // We manually inline 'try_search_mayfail' here because letting the
        // compiler do it seems to produce pretty crappy codegen.
        return if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for full search at {:?}", input.get_span());
            match e.try_search(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for full search at {:?}", input.get_span());
            match e.try_search(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else {
            self.search_nofail(cache, input)
        };
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        // The main difference with 'search' is that if we're using a DFA, we
        // can use a single forward scan without needing to run the reverse
        // DFA.
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else {
            self.search_half_nofail(cache, input)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for is-match search at {:?}",
                input.get_span()
            );
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else {
            self.is_match_nofail(cache, input)
        }
    }
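    // To make the search/search_half distinction above concrete: for the
    // regex 'abc' against "xxabcxx", 'search' yields Match { pattern: 0,
    // span: 2..5 } (which requires a reverse DFA scan to find the start),
    // while 'search_half' yields HalfMatch { pattern: 0, offset: 5 } from a
    // single forward scan.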
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        // Even if the regex has explicit capture groups, if the caller
        // didn't provide any explicit slots, then it doesn't make sense to
        // try and do extra work to get offsets for those slots. Ideally the
        // caller should realize this and not call this routine in the first
        // place, but alas, we try to save the caller from themselves if
        // they do.
        if !self.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        // If the onepass DFA is available for this search (which only
        // happens when it's anchored), then skip running a fallible DFA. The
        // onepass DFA isn't as fast as a full or lazy DFA, but it is
        // typically quite a bit faster than the backtracker or the PikeVM.
        // So it isn't as advantageous to try and do a full/lazy DFA scan
        // first.
        //
        // We still theorize that it's better to do a full/lazy DFA scan,
        // even when it's anchored, because it's usually much faster and
        // permits us to say "no match" much more quickly. This does hurt
        // the case of, say, parsing each line in a log file into capture
        // groups, because in that case, the line always matches. So the
        // lazy DFA scan is usually just wasted work. But, the lazy DFA is
        // usually quite fast and doesn't cost too much here.
        if self.onepass.get(&input).is_some() {
            return self.search_slots_nofail(cache, &input, slots);
        }
        let m = match self.try_search_mayfail(cache, input) {
            Some(Ok(Some(m))) => m,
            Some(Ok(None)) => return None,
            Some(Err(_err)) => {
                trace!("fast capture search failed: {}", _err);
                return self.search_slots_nofail(cache, input, slots);
            }
            None => {
                return self.search_slots_nofail(cache, input, slots);
            }
        };
        // At this point, now that we've found the bounds of the
        // match, we need to re-run something that can resolve
        // capturing groups. But we only need to run it on the
        // match bounds and not the entire haystack.
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            m.start(),
            m.end(),
        );
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        Some(
            self.search_slots_nofail(cache, &input, slots)
                .expect("should find a match"),
        )
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if let Some(e) = self.dfa.get(input) {
            trace!(
                "using full DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(input, patset) {
                Ok(()) => return,
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        } else if let Some(e) = self.hybrid.get(input) {
            trace!(
                "using lazy DFA for overlapping search at {:?}",
                input.get_span()
            );
            let _err = match e.try_which_overlapping_matches(
                &mut cache.hybrid,
                input,
                patset,
            ) {
                Ok(()) => {
                    return;
                }
                Err(err) => err,
            };
            trace!("fast overlapping search failed: {}", _err);
        }
        trace!(
            "using PikeVM for overlapping search at {:?}",
            input.get_span()
        );
        let e = self.pikevm.get();
        e.which_overlapping_matches(&mut cache.pikevm, input, patset)
    }
}
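// The two-phase capture strategy above, in miniature: for
// '([a-z]+)@([a-z]+)' against "xx foo@bar xx", phase one (a DFA scan) finds
// the overall bounds 3..10, and phase two re-runs a slots-capable engine
// anchored to the span 3..10 only, so group 1 resolves to 3..6 and group 2
// to 7..10.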
#[derive(Debug)]
struct ReverseAnchored {
    core: Core,
}

impl ReverseAnchored {
    fn new(core: Core) -> Result<ReverseAnchored, Core> {
        if !core.info.is_always_anchored_end() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is not always anchored at the end"
            );
            return Err(core);
        }
        // Note that the caller can still request an anchored search even
        // when the regex isn't anchored at the start. We detect that case
        // in the search routines below and just fall back to the core
        // engine. This is fine because both searches are anchored. It's
        // just a matter of picking one. Falling back to the core engine is
        // a little simpler, since if we used the reverse anchored approach,
        // we'd have to add an extra check to ensure the match reported
        // starts at the place where the caller requested the search to
        // start.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse anchored optimization because \
                 the regex is also anchored at the start"
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse anchored optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        Ok(ReverseAnchored { core })
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_anchored_rev(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        // We of course always want an anchored search. In theory, the
        // underlying regex engines should automatically enable anchored
        // searches since the regex is itself anchored, but this more
        // clearly expresses intent and is always correct.
        let input = input.clone().anchored(Anchored::Yes);
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse anchored search at {:?}",
                input.get_span()
            );
            e.try_search_half_rev(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseAnchored always has a DFA")
        }
    }
}

// Note that in this impl, we don't check that 'input.end() ==
// input.haystack().len()'. In particular, when that condition is false, a
// match is always impossible because we know that the regex is always
// anchored at the end (or else 'ReverseAnchored' won't be built). We don't
// check that here because the 'Regex' wrapper actually does that for us in
// all cases. Thus, in this impl, we can actually assume that the end
// position in 'input' is equivalent to the length of the haystack.
impl Strategy for ReverseAnchored {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        // Since this is anchored at the end, a reverse anchored search is
        // almost certainly guaranteed to result in a much faster search
        // than a standard forward search.
        true
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                Some(Match::new(hm.pattern(), hm.offset()..input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                // Careful here! The 'search_half' API behaves like a
                // *forward* search that only cares about the *end* position
                // of a match. But 'hm.offset()' came from a *reverse* scan,
                // so it is actually the start of the match. So we just
                // throw that away here and, since we know we have a match,
                // return the only possible position at which a match can
                // end: input.end().
                Some(HalfMatch::new(hm.pattern(), input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_slots_nofail(cache, input, slots)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                if !self.core.is_capture_search_needed(slots.len()) {
                    trace!("asked for slots unnecessarily, skipping captures");
                    let m = Match::new(hm.pattern(), hm.offset()..input.end());
                    copy_match_to_slots(m, slots);
                    return Some(m.pattern());
                }
                let start = hm.offset();
                let input = input
                    .clone()
                    .span(start..input.end())
                    .anchored(Anchored::Pattern(hm.pattern()));
                self.core.search_slots_nofail(cache, &input, slots)
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        // It seems like this could probably benefit from a reverse anchored
        // optimization, perhaps by doing an overlapping reverse search
        // (which the DFAs do support). I haven't given it much thought
        // though, and I'm currently focused more on the single pattern
        // case.
        self.core.which_overlapping_matches(cache, input, patset)
    }
}
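// A concrete trace of the reverse anchored strategy: for '[a-z]+$' against
// "xyz abc", the regex is always anchored at the end, so we start a reverse
// anchored scan at offset 7 (the end of the haystack). The reverse scan
// yields a start offset of 4, which 'search' combines with 'input.end()' to
// report the match 4..7.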
#[derive(Debug)]
struct ReverseSuffix {
    core: Core,
    pre: Prefilter,
}

impl ReverseSuffix {
    fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
        if !core.info.config().get_auto_prefilter() {
            debug!(
                "skipping reverse suffix optimization because \
                 automatic prefilters are disabled"
            );
            return Err(core);
        }
        // Like the reverse inner optimization, we don't do this for regexes
        // that are always anchored. It could lead to scanning too much, but
        // it could also say "no match" much more quickly than running the
        // regex engine if the initial literal scan doesn't match. With that
        // said, the reverse suffix optimization has lower overhead, since
        // it only requires a reverse scan after a literal match to confirm
        // or reject the match. (Although, in the case of confirmation, it
        // then needs to do another forward scan to find the end position.)
        //
        // Note that the caller can still request an anchored search even
        // when the regex isn't anchored. We detect that case in the search
        // routines below and just fall back to the core engine. Currently
        // this optimization assumes all searches are unanchored, so if we
        // do want to enable this optimization for anchored searches, it
        // will need a little work to support it.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse suffix optimization because \
                 the regex is always anchored at the start",
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse suffix optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
            debug!(
                "skipping reverse suffix optimization because \
                 we already have a prefilter that we think is fast"
            );
            return Err(core);
        }
        let kind = core.info.config().get_match_kind();
        let suffixes = crate::util::prefilter::suffixes(kind, hirs);
        let lcs = match suffixes.longest_common_suffix() {
            None => {
                debug!(
                    "skipping reverse suffix optimization because \
                     a longest common suffix could not be found",
                );
                return Err(core);
            }
            Some(lcs) if lcs.is_empty() => {
                debug!(
                    "skipping reverse suffix optimization because \
                     the longest common suffix is the empty string",
                );
                return Err(core);
            }
            Some(lcs) => lcs,
        };
        let pre = match Prefilter::new(kind, &[lcs]) {
            Some(pre) => pre,
            None => {
                debug!(
                    "skipping reverse suffix optimization because \
                     a prefilter could not be constructed from the \
                     longest common suffix",
                );
                return Err(core);
            }
        };
        if !pre.is_fast() {
            debug!(
                "skipping reverse suffix optimization because \
                 while we have a suffix prefilter, it is not \
                 believed to be 'fast'"
            );
            return Err(core);
        }
        Ok(ReverseSuffix { core, pre })
    }
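    // A rough sketch of the suffix extraction this relies on, in terms of
    // regex-syntax's public literal extractor ('crate::util::prefilter::
    // suffixes' above is, roughly speaking, a thin layer over the same
    // machinery):
    //
    //     use regex_syntax::{hir::literal::{ExtractKind, Extractor}, parse};
    //
    //     let hir = parse(r"\w+(ing|ed)").unwrap();
    //     let seq = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
    //     // The longest common suffix of "ing" and "ed" is empty, so the
    //     // optimization would be skipped here. For r"\w+ing", the longest
    //     // common suffix would be "ing".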
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_start(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryError> {
        let mut span = input.get_span();
        let mut min_start = 0;
        loop {
            let litmatch = match self.pre.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            trace!("reverse suffix scan found suffix match at {:?}", litmatch);
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.end);
            match self
                .try_search_half_rev_limited(cache, &revinput, min_start)?
            {
                None => {
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm) => return Ok(Some(hm)),
            }
            min_start = litmatch.end;
        }
        Ok(None)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_fwd(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for forward reverse suffix search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseSuffix always has a DFA")
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_rev_limited(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        min_start: usize,
    ) -> Result<Option<HalfMatch>, RetryError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&input, min_start)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse suffix search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start)
        } else {
            unreachable!("ReverseSuffix always has a DFA")
        }
    }
}
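// Why the 'min_start' bound above matters: consider the pattern '[a-z]+z'
// (suffix "z") against a haystack like "zzzz...". Without the bound, each
// suffix hit at position i could kick off a reverse scan back over 0..i,
// giving O(n^2) total work. Bounding each reverse scan at the end of the
// previous literal hit keeps the total work linear, at the cost of
// occasionally giving up (RetryError::Quadratic) and falling back to the
// core engine.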
impl Strategy for ReverseSuffix {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage() + self.pre.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix optimization failed: {}", _err);
                self.core.search(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse suffix reverse fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(Match::new(
                        hm_start.pattern(),
                        hm_start.offset()..hm_end.offset(),
                    )),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm_start)) => {
                // This is a bit subtle. It is tempting to just stop
                // searching at this point and return a half-match with an
                // offset corresponding to where the suffix was found. But
                // the suffix match does not necessarily correspond to the
                // end of the proper leftmost-first match. Consider
                // /[a-z]+ing/ against 'tingling'. The first suffix match is
                // the first 'ing', and the /[a-z]+/ matches the 't'. So if
                // we stopped here, then we'd report 'ting' as the match.
                // But 'tingling' is the correct match because of
                // greediness.
                let fwdinput = input
                    .clone()
                    .anchored(Anchored::Pattern(hm_start.pattern()))
                    .span(hm_start.offset()..input.end());
                match self.try_search_half_fwd(cache, &fwdinput) {
                    Err(_err) => {
                        trace!(
                            "reverse suffix forward fast search failed: {}",
                            _err
                        );
                        self.core.search_half_nofail(cache, input)
                    }
                    Ok(None) => {
                        unreachable!(
                            "suffix match plus reverse match implies \
                             there must be a match",
                        )
                    }
                    Ok(Some(hm_end)) => Some(hm_end),
                }
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse suffix half optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast half search failed: {}",
                    _err
                );
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let hm_start = match self.try_search_half_start(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!(
                    "reverse suffix captures optimization failed: {}",
                    _err
                );
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!(
                    "reverse suffix reverse fast captures search failed: {}",
                    _err
                );
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(hm_start)) => hm_start,
        };
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            hm_start.offset(),
            input.end(),
        );
        let start = hm_start.offset();
        let input = input
            .clone()
            .span(start..input.end())
            .anchored(Anchored::Pattern(hm_start.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        self.core.which_overlapping_matches(cache, input, patset)
    }
}
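// The reverse inner strategy below generalizes the suffix trick to literals
// in the *middle* of a pattern. A concrete trace for r"\w+@\w+" (inner
// literal "@") against "xx foo@bar xx":
//
//   1. the inner prefilter finds "@" at offset 6;
//   2. a reverse scan of the prefix r"\w+", ending at 6, finds the match
//      start 3;
//   3. a forward scan of the whole pattern from 3 finds the overall end 10,
//      giving the match 3..10.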
#[derive(Debug)]
struct ReverseInner {
    core: Core,
    preinner: Prefilter,
    nfarev: NFA,
    hybrid: wrappers::ReverseHybrid,
    dfa: wrappers::ReverseDFA,
}

impl ReverseInner {
    fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> {
        if !core.info.config().get_auto_prefilter() {
            debug!(
                "skipping reverse inner optimization because \
                 automatic prefilters are disabled"
            );
            return Err(core);
        }
        // Currently we hard-code the assumption of leftmost-first match
        // semantics. This isn't a huge deal because 'all' semantics tend to
        // only be used for forward overlapping searches with multiple
        // regexes, and this optimization only supports a single pattern at
        // the moment.
        if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
            debug!(
                "skipping reverse inner optimization because \
                 match kind is {:?} but this only supports leftmost-first",
                core.info.config().get_match_kind(),
            );
            return Err(core);
        }
        // It's likely that a reverse inner scan has too much overhead for
        // it to be worth it when the regex is anchored at the start. It is
        // possible for it to be quite a bit faster if the initial literal
        // scan fails to detect a match, in which case, we can say "no
        // match" very quickly. But this could be undesirable, e.g., if the
        // scan goes too far or when the literal scan matches. If it
        // matches, then confirming the match requires a reverse scan
        // followed by a forward scan to confirm or reject, which is a fair
        // bit of work.
        //
        // Note that the caller can still request an anchored search even
        // when the regex isn't anchored. We detect that case in the search
        // routines below and just fall back to the core engine. Currently
        // this optimization assumes all searches are unanchored, so if we
        // do want to enable this optimization for anchored searches, it
        // will need a little work to support it.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse inner optimization because \
                 the regex is always anchored at the start",
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse inner optimization because \
                 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
            debug!(
                "skipping reverse inner optimization because \
                 we already have a prefilter that we think is fast"
            );
            return Err(core);
        } else if core.pre.is_some() {
            debug!(
                "core engine has a prefix prefilter, but it is \
                 probably not fast, so continuing with attempt to \
                 use reverse inner prefilter"
            );
        }
        let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
            Some(x) => x,
            // N.B. the 'extract' function emits debug messages explaining
            // why we bailed out here.
            None => return Err(core),
        };
        debug!("building reverse NFA for prefix before inner literal");
        let mut lookm = LookMatcher::new();
        lookm.set_line_terminator(core.info.config().get_line_terminator());
        let thompson_config = thompson::Config::new()
            .reverse(true)
            .utf8(core.info.config().get_utf8_empty())
            .nfa_size_limit(core.info.config().get_nfa_size_limit())
            .shrink(false)
            .which_captures(WhichCaptures::None)
            .look_matcher(lookm);
        let result = thompson::Compiler::new()
            .configure(thompson_config)
            .build_from_hir(&concat_prefix);
        let nfarev = match result {
            Ok(nfarev) => nfarev,
            Err(_err) => {
                debug!(
                    "skipping reverse inner optimization because the \
                     reverse NFA failed to build: {}",
                    _err,
                );
                return Err(core);
            }
        };
        debug!("building reverse DFA for prefix before inner literal");
        let dfa = if !core.info.config().get_dfa() {
            wrappers::ReverseDFA::none()
        } else {
            wrappers::ReverseDFA::new(&core.info, &nfarev)
        };
        let hybrid = if !core.info.config().get_hybrid() {
            wrappers::ReverseHybrid::none()
        } else if dfa.is_some() {
            debug!(
                "skipping lazy DFA for reverse inner optimization \
                 because we have a full DFA"
            );
            wrappers::ReverseHybrid::none()
        } else {
            wrappers::ReverseHybrid::new(&core.info, &nfarev)
        };
        Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa })
    }
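    // The search loop below guards against quadratic behavior from two
    // directions: 'min_match_start' bounds each *reverse* scan at the end
    // of the previous inner literal hit, and 'min_pre_start' rejects any
    // inner literal hit that starts before the end of the last *forward*
    // scan (tracked via 'try_search_half_fwd_stopat'). Tripping either
    // guard reports RetryError::Quadratic, and the caller falls back to the
    // core engine.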
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_full(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<Match>, RetryError> {
        let mut span = input.get_span();
        let mut min_match_start = 0;
        let mut min_pre_start = 0;
        loop {
            let litmatch = match self.preinner.find(input.haystack(), span) {
                None => return Ok(None),
                Some(span) => span,
            };
            if litmatch.start < min_pre_start {
                trace!(
                    "found inner prefilter match at {:?}, which starts \
                     before the end of the last forward scan at {}, \
                     quitting to avoid quadratic behavior",
                    litmatch,
                    min_pre_start,
                );
                return Err(RetryError::Quadratic(RetryQuadraticError::new()));
            }
            trace!("reverse inner scan found inner match at {:?}", litmatch);
            let revinput = input
                .clone()
                .anchored(Anchored::Yes)
                .span(input.start()..litmatch.start);
            // Note that in addition to the literal search above scanning
            // past our minimum start point, this routine can also return an
            // error as a result of detecting possible quadratic behavior if
            // the reverse scan goes past the minimum start point. That is,
            // the literal search might not, but the reverse regex search
            // for the prefix might!
            match self.try_search_half_rev_limited(
                cache,
                &revinput,
                min_match_start,
            )? {
                None => {
                    if span.start >= span.end {
                        break;
                    }
                    span.start = litmatch.start.checked_add(1).unwrap();
                }
                Some(hm_start) => {
                    let fwdinput = input
                        .clone()
                        .anchored(Anchored::Pattern(hm_start.pattern()))
                        .span(hm_start.offset()..input.end());
                    match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
                        Err(stopat) => {
                            min_pre_start = stopat;
                            span.start =
                                litmatch.start.checked_add(1).unwrap();
                        }
                        Ok(hm_end) => {
                            return Ok(Some(Match::new(
                                hm_start.pattern(),
                                hm_start.offset()..hm_end.offset(),
                            )))
                        }
                    }
                }
            }
            min_match_start = litmatch.end;
        }
        Ok(None)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_fwd_stopat(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
        if let Some(e) = self.core.dfa.get(&input) {
            trace!(
                "using full DFA for forward reverse inner search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd_stopat(&input)
        } else if let Some(e) = self.core.hybrid.get(&input) {
            trace!(
                "using lazy DFA for forward reverse inner search at {:?}",
                input.get_span()
            );
            e.try_search_half_fwd_stopat(&mut cache.hybrid, &input)
        } else {
            unreachable!("ReverseInner always has a DFA")
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_rev_limited(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        min_start: usize,
    ) -> Result<Option<HalfMatch>, RetryError> {
        if let Some(e) = self.dfa.get(&input) {
            trace!(
                "using full DFA for reverse inner search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(&input, min_start)
        } else if let Some(e) = self.hybrid.get(&input) {
            trace!(
                "using lazy DFA for reverse inner search at {:?}, \
                 but will be stopped at {} to avoid quadratic behavior",
                input.get_span(),
                min_start,
            );
            e.try_search_half_rev_limited(
                &mut cache.revhybrid,
                &input,
                min_start,
            )
        } else {
            unreachable!("ReverseInner always has a DFA")
        }
    }
}

impl Strategy for ReverseInner {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        let mut cache = self.core.create_cache();
        cache.revhybrid = self.hybrid.create_cache();
        cache
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
        cache.revhybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        self.preinner.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
            + self.preinner.memory_usage()
            + self.nfarev.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner optimization failed: {}", _err);
                self.core.search(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(matornot) => matornot,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner half optimization failed: {}", _err);
                self.core.search_half(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast half search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
        }
    }
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner half optimization failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast half search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        if !self.core.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        let m = match self.try_search_full(cache, input) {
            Err(RetryError::Quadratic(_err)) => {
                trace!("reverse inner captures optimization failed: {}", _err);
                return self.core.search_slots(cache, input, slots);
            }
            Err(RetryError::Fail(_err)) => {
                trace!("reverse inner fast captures search failed: {}", _err);
                return self.core.search_slots_nofail(cache, input, slots);
            }
            Ok(None) => return None,
            Ok(Some(m)) => m,
        };
        trace!(
            "match found at {}..{} in capture search, \
             using another engine to find captures",
            m.start(),
            m.end(),
        );
        let input = input
            .clone()
            .span(m.start()..m.end())
            .anchored(Anchored::Pattern(m.pattern()));
        self.core.search_slots_nofail(cache, &input, slots)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        self.core.which_overlapping_matches(cache, input, patset)
    }
}

/// Copies the offsets in the given match to the corresponding positions in
/// `slots`.
///
/// In effect, this sets the slots corresponding to the implicit group for
/// the pattern in the given match. If the indices for the corresponding
/// slots do not exist, then no slots are set.
///
/// This is useful when the caller provides slots (or captures), but you use
/// a regex engine that doesn't operate on slots (like a lazy DFA). This
/// function lets you map the match you get back to the slots provided by
/// the caller.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
    let slot_start = m.pattern().as_usize() * 2;
    let slot_end = slot_start + 1;
    if let Some(slot) = slots.get_mut(slot_start) {
        *slot = NonMaxUsize::new(m.start());
    }
    if let Some(slot) = slots.get_mut(slot_end) {
        *slot = NonMaxUsize::new(m.end());
    }
}
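// A worked example of the implicit-slot layout handled by
// `copy_match_to_slots`: implicit slots are laid out as
// [start0, end0, start1, end1, ...], so a match of pattern 2 at 5..9
// writes:
//
//     slots[4] = NonMaxUsize::new(5);
//     slots[5] = NonMaxUsize::new(9);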