1 files changed, 149 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
new file mode 100644
index 0000000000..50cce827ee
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
@@ -0,0 +1,149 @@
+use crate::util::{
+    prefilter::PrefilterI,
+    search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct AhoCorasick {
+    #[cfg(not(feature = "perf-literal-multisubstring"))]
+    _unused: (),
+    #[cfg(feature = "perf-literal-multisubstring")]
+    ac: aho_corasick::AhoCorasick,
+}
+
+impl AhoCorasick {
+    pub(crate) fn new<B: AsRef<[u8]>>(
+        kind: MatchKind,
+        needles: &[B],
+    ) -> Option<AhoCorasick> {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            None
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            // We used to use `aho_corasick::MatchKind::Standard` here when
+            // `kind` was `MatchKind::All`, but this is not correct. The
+            // "standard" Aho-Corasick match semantics are to report a match
+            // immediately as soon as it is seen, but `All` isn't like that.
+            // In particular, with "standard" semantics, given the needles
+            // "abc" and "b" and the haystack "abc," it would report a match
+            // at offset 1 before a match at offset 0. This is never what we
+            // want in the context of the regex engine, regardless of whether
+            // we have leftmost-first or 'all' semantics. Namely, we always
+            // want the leftmost match.
+            let ac_match_kind = match kind {
+                MatchKind::LeftmostFirst | MatchKind::All => {
+                    aho_corasick::MatchKind::LeftmostFirst
+                }
+            };
+            // This is kind of just an arbitrary number, but basically, if we
+            // have a small enough set of literals, then we try to use the VERY
+            // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
+            // upshot is that the NFA is quite lean and decently fast. Faster
+            // than a naive Aho-Corasick NFA anyway.
+            let ac_kind = if needles.len() <= 500 {
+                aho_corasick::AhoCorasickKind::DFA
+            } else {
+                aho_corasick::AhoCorasickKind::ContiguousNFA
+            };
+            let result = aho_corasick::AhoCorasick::builder()
+                .kind(Some(ac_kind))
+                .match_kind(ac_match_kind)
+                .start_kind(aho_corasick::StartKind::Both)
+                // We try to handle all of the prefilter cases in the super
+                // module, and only use Aho-Corasick for the actual automaton.
+                // The aho-corasick crate does have some extra prefilters,
+                // namely, looking for rare bytes to feed to memchr{,2,3}
+                // instead of just the first byte. If we end up wanting
+                // those---and they are somewhat tricky to implement---then
+                // we could port them to this crate.
+                //
+                // The main reason for doing things this way is so we have a
+                // complete and easy to understand picture of which prefilters
+                // are available and how they work. Otherwise it seems too
+                // easy to get into a situation where we have a prefilter
+                // layered on top of prefilter, and that might have unintended
+                // consequences.
+                .prefilter(false)
+                .build(needles);
+            let ac = match result {
+                Ok(ac) => ac,
+                Err(_err) => {
+                    debug!("aho-corasick prefilter failed to build: {}", _err);
+                    return None;
+                }
+            };
+            Some(AhoCorasick { ac })
+        }
+    }
+}
+
+impl PrefilterI for AhoCorasick {
+    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            let input =
+                aho_corasick::Input::new(haystack).span(span.start..span.end);
+            self.ac
+                .find(input)
+                .map(|m| Span { start: m.start(), end: m.end() })
+        }
+    }
+
+    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            let input = aho_corasick::Input::new(haystack)
+                .anchored(aho_corasick::Anchored::Yes)
+                .span(span.start..span.end);
+            self.ac
+                .find(input)
+                .map(|m| Span { start: m.start(), end: m.end() })
+        }
+    }
+
+    fn memory_usage(&self) -> usize {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            self.ac.memory_usage()
+        }
+    }
+
+    fn is_fast(&self) -> bool {
+        #[cfg(not(feature = "perf-literal-multisubstring"))]
+        {
+            unreachable!()
+        }
+        #[cfg(feature = "perf-literal-multisubstring")]
+        {
+            // Aho-Corasick is never considered "fast" because it's never
+            // going to be even close to an order of magnitude faster than the
+            // regex engine itself (assuming a DFA is used). In fact, it is
+            // usually slower. The magic of Aho-Corasick is that it can search
+            // a *large* number of literals with a relatively small amount of
+            // memory. The regex engines are far more wasteful.
+            //
+            // Aho-Corasick may be "fast" when the regex engine corresponds
+            // to, say, the PikeVM. That happens when the lazy DFA couldn't be
+            // built or used for some reason. But in these cases, the regex
+            // itself is likely quite big and we're probably hosed no matter
+            // what we do. (In this case, the best bet is for the caller to
+            // increase some of the memory limits on the hybrid cache capacity
+            // and hope that's enough.)
+            false
+        }
+    }
+}