diff options
Diffstat (limited to 'third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs')
-rw-r--r-- | third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs new file mode 100644 index 0000000000..50cce827ee --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs @@ -0,0 +1,149 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct AhoCorasick { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + #[cfg(feature = "perf-literal-multisubstring")] + ac: aho_corasick::AhoCorasick, +} + +impl AhoCorasick { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<AhoCorasick> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. + let ac_match_kind = match kind { + MatchKind::LeftmostFirst | MatchKind::All => { + aho_corasick::MatchKind::LeftmostFirst + } + }; + // This is kind of just an arbitrary number, but basically, if we + // have a small enough set of literals, then we try to use the VERY + // memory hungry DFA. Otherwise, we whimp out and use an NFA. The + // upshot is that the NFA is quite lean and decently fast. Faster + // than a naive Aho-Corasick NFA anyway. + let ac_kind = if needles.len() <= 500 { + aho_corasick::AhoCorasickKind::DFA + } else { + aho_corasick::AhoCorasickKind::ContiguousNFA + }; + let result = aho_corasick::AhoCorasick::builder() + .kind(Some(ac_kind)) + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Both) + // We try to handle all of the prefilter cases in the super + // module, and only use Aho-Corasick for the actual automaton. + // The aho-corasick crate does have some extra prefilters, + // namely, looking for rare bytes to feed to memchr{,2,3} + // instead of just the first byte. If we end up wanting + // those---and they are somewhat tricky to implement---then + // we could port them to this crate. + // + // The main reason for doing things this way is so we have a + // complete and easy to understand picture of which prefilters + // are available and how they work. Otherwise it seems too + // easy to get into a situation where we have a prefilter + // layered on top of prefilter, and that might have unintended + // consequences. + .prefilter(false) + .build(needles); + let ac = match result { + Ok(ac) => ac, + Err(_err) => { + debug!("aho-corasick prefilter failed to build: {}", _err); + return None; + } + }; + Some(AhoCorasick { ac }) + } + } +} + +impl PrefilterI for AhoCorasick { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = + aho_corasick::Input::new(haystack).span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + self.ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Aho-Corasick is never considered "fast" because it's never + // going to be even close to an order of magnitude faster than the + // regex engine itself (assuming a DFA is used). In fact, it is + // usually slower. The magic of Aho-Corasick is that it can search + // a *large* number of literals with a relatively small amount of + // memory. The regex engines are far more wasteful. + // + // Aho-Corasick may be "fast" when the regex engine corresponds + // to, say, the PikeVM. That happens when the lazy DFA couldn't be + // built or used for some reason. But in these cases, the regex + // itself is likely quite big and we're probably hosed no matter + // what we do. (In this case, the best bet is for the caller to + // increase some of the memory limits on the hybrid cache capacity + // and hope that's enough.) + false + } + } +} |