summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs')
-rw-r--r--third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs149
1 files changed, 149 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
new file mode 100644
index 0000000000..50cce827ee
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
@@ -0,0 +1,149 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct AhoCorasick {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ _unused: (),
+ #[cfg(feature = "perf-literal-multisubstring")]
+ ac: aho_corasick::AhoCorasick,
+}
+
+impl AhoCorasick {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<AhoCorasick> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // We used to use `aho_corasick::MatchKind::Standard` here when
+ // `kind` was `MatchKind::All`, but this is not correct. The
+ // "standard" Aho-Corasick match semantics are to report a match
+ // immediately as soon as it is seen, but `All` isn't like that.
+ // In particular, with "standard" semantics, given the needles
+ // "abc" and "b" and the haystack "abc," it would report a match
+ // at offset 1 before a match at offset 0. This is never what we
+ // want in the context of the regex engine, regardless of whether
+ // we have leftmost-first or 'all' semantics. Namely, we always
+ // want the leftmost match.
+ let ac_match_kind = match kind {
+ MatchKind::LeftmostFirst | MatchKind::All => {
+ aho_corasick::MatchKind::LeftmostFirst
+ }
+ };
+ // This is kind of just an arbitrary number, but basically, if we
+ // have a small enough set of literals, then we try to use the VERY
+ // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
+ // upshot is that the NFA is quite lean and decently fast. Faster
+ // than a naive Aho-Corasick NFA anyway.
+ let ac_kind = if needles.len() <= 500 {
+ aho_corasick::AhoCorasickKind::DFA
+ } else {
+ aho_corasick::AhoCorasickKind::ContiguousNFA
+ };
+ let result = aho_corasick::AhoCorasick::builder()
+ .kind(Some(ac_kind))
+ .match_kind(ac_match_kind)
+ .start_kind(aho_corasick::StartKind::Both)
+ // We try to handle all of the prefilter cases in the super
+ // module, and only use Aho-Corasick for the actual automaton.
+ // The aho-corasick crate does have some extra prefilters,
+ // namely, looking for rare bytes to feed to memchr{,2,3}
+ // instead of just the first byte. If we end up wanting
+ // those---and they are somewhat tricky to implement---then
+ // we could port them to this crate.
+ //
+ // The main reason for doing things this way is so we have a
+ // complete and easy to understand picture of which prefilters
+ // are available and how they work. Otherwise it seems too
+ // easy to get into a situation where we have a prefilter
+ // layered on top of prefilter, and that might have unintended
+ // consequences.
+ .prefilter(false)
+ .build(needles);
+ let ac = match result {
+ Ok(ac) => ac,
+ Err(_err) => {
+ debug!("aho-corasick prefilter failed to build: {}", _err);
+ return None;
+ }
+ };
+ Some(AhoCorasick { ac })
+ }
+ }
+}
+
+impl PrefilterI for AhoCorasick {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let input =
+ aho_corasick::Input::new(haystack).span(span.start..span.end);
+ self.ac
+ .find(input)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let input = aho_corasick::Input::new(haystack)
+ .anchored(aho_corasick::Anchored::Yes)
+ .span(span.start..span.end);
+ self.ac
+ .find(input)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ self.ac.memory_usage()
+ }
+ }
+
+ fn is_fast(&self) -> bool {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // Aho-Corasick is never considered "fast" because it's never
+ // going to be even close to an order of magnitude faster than the
+ // regex engine itself (assuming a DFA is used). In fact, it is
+ // usually slower. The magic of Aho-Corasick is that it can search
+ // a *large* number of literals with a relatively small amount of
+ // memory. The regex engines are far more wasteful.
+ //
+ // Aho-Corasick may be "fast" when the regex engine corresponds
+ // to, say, the PikeVM. That happens when the lazy DFA couldn't be
+ // built or used for some reason. But in these cases, the regex
+ // itself is likely quite big and we're probably hosed no matter
+ // what we do. (In this case, the best bet is for the caller to
+ // increase some of the memory limits on the hybrid cache capacity
+ // and hope that's enough.)
+ false
+ }
+ }
+}