summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/util/prefilter/teddy.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/util/prefilter/teddy.rs')
-rw-r--r--third_party/rust/regex-automata/src/util/prefilter/teddy.rs160
1 files changed, 160 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/prefilter/teddy.rs b/third_party/rust/regex-automata/src/util/prefilter/teddy.rs
new file mode 100644
index 0000000000..fc79f2b2f3
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/prefilter/teddy.rs
@@ -0,0 +1,160 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Teddy {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ _unused: (),
+ /// The actual Teddy searcher.
+ ///
+ /// Technically, it's possible that Teddy doesn't actually get used, since
+ /// Teddy does require its haystack to at least be of a certain size
+ /// (usually around the size of whatever vector is being used, so ~16
+ /// or ~32 bytes). For haystacks shorter than that, the implementation
+ /// currently uses Rabin-Karp.
+ #[cfg(feature = "perf-literal-multisubstring")]
+ searcher: aho_corasick::packed::Searcher,
+ /// When running an anchored search, the packed searcher can't handle it so
+ /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
+ /// searchers to support anchored search would be difficult at worst and
+ /// annoying at best. Since packed searchers only apply to small numbers of
+ /// literals, we content ourselves that this is not much of an added cost.
+ /// (That packed searchers only work with a small number of literals is
+ /// also why we use a DFA here. Otherwise, the memory usage of a DFA would
+ /// likely be unacceptable.)
+ #[cfg(feature = "perf-literal-multisubstring")]
+ anchored_ac: aho_corasick::dfa::DFA,
+ /// The length of the smallest literal we look for.
+ ///
+ /// We use this as a heuristic to figure out whether this will be "fast" or
+ /// not. Generally, the longer the better, because longer needles are more
+ /// discriminating and thus reduce false positive rate.
+ #[cfg(feature = "perf-literal-multisubstring")]
+ minimum_len: usize,
+}
+
+impl Teddy {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Teddy> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // We only really support leftmost-first semantics. In
+ // theory we could at least support leftmost-longest, as the
+ // aho-corasick crate does, but regex-automata doesn't know about
+ // leftmost-longest currently.
+ //
+ // And like the aho-corasick prefilter, if we're using `All`
+ // semantics, then we can still use leftmost semantics for a
+ // prefilter. (This might be a suspicious choice for the literal
+ // engine, which uses a prefilter as a regex engine directly, but
+ // that only happens when using leftmost-first semantics.)
+ let (packed_match_kind, ac_match_kind) = match kind {
+ MatchKind::LeftmostFirst | MatchKind::All => (
+ aho_corasick::packed::MatchKind::LeftmostFirst,
+ aho_corasick::MatchKind::LeftmostFirst,
+ ),
+ };
+ let minimum_len =
+ needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
+ let packed = aho_corasick::packed::Config::new()
+ .match_kind(packed_match_kind)
+ .builder()
+ .extend(needles)
+ .build()?;
+ let anchored_ac = aho_corasick::dfa::DFA::builder()
+ .match_kind(ac_match_kind)
+ .start_kind(aho_corasick::StartKind::Anchored)
+ .prefilter(false)
+ .build(needles)
+ .ok()?;
+ Some(Teddy { searcher: packed, anchored_ac, minimum_len })
+ }
+ }
+}
+
+impl PrefilterI for Teddy {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let ac_span =
+ aho_corasick::Span { start: span.start, end: span.end };
+ self.searcher
+ .find_in(haystack, ac_span)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ use aho_corasick::automaton::Automaton;
+ let input = aho_corasick::Input::new(haystack)
+ .anchored(aho_corasick::Anchored::Yes)
+ .span(span.start..span.end);
+ self.anchored_ac
+ .try_find(&input)
+ // OK because we build the DFA with anchored support.
+ .expect("aho-corasick DFA should never fail")
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ use aho_corasick::automaton::Automaton;
+ self.searcher.memory_usage() + self.anchored_ac.memory_usage()
+ }
+ }
+
+ fn is_fast(&self) -> bool {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // Teddy is usually quite fast, but I have seen some cases where
+ // a large number of literals can overwhelm it and make it not so
+ // fast. We make an educated but conservative guess at a limit, at
+ // which point, we're not so comfortable thinking Teddy is "fast."
+ //
+ // Well... this used to incorporate a "limit" on the *number*
+ // of literals, but I have since changed it to a minimum on the
+ // *smallest* literal. Namely, when there is a very small literal
+ // (1 or 2 bytes), it is far more likely that it leads to a higher
+ // false positive rate. (Although, of course, not always. For
+ // example, 'zq' is likely to have a very low false positive rate.)
+ // But when we have 3 bytes, we have a really good chance of being
+ // quite discriminatory and thus fast.
+ //
+ // We may still want to add some kind of limit on the number of
+ // literals here, but keep in mind that Teddy already has its own
+ // somewhat small limit (64 at time of writing). The main issue
+ // here is that if 'is_fast' is false, it opens the door for the
+ // reverse inner optimization to kick in. We really only want to
+ // resort to the reverse inner optimization if we absolutely must.
+ self.minimum_len >= 3
+ }
+ }
+}