third_party/rust/regex-automata/src/util/prefilter/teddy.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

use crate::util::{
    prefilter::PrefilterI,
    search::{MatchKind, Span},
};

#[derive(Clone, Debug)]
pub(crate) struct Teddy {
    #[cfg(not(feature = "perf-literal-multisubstring"))]
    _unused: (),
    /// The actual Teddy searcher.
    ///
    /// Technically, it's possible that Teddy doesn't actually get used, since
    /// Teddy does require its haystack to at least be of a certain size
    /// (usually around the size of whatever vector is being used, so ~16
    /// or ~32 bytes). For haystacks shorter than that, the implementation
    /// currently uses Rabin-Karp.
    #[cfg(feature = "perf-literal-multisubstring")]
    searcher: aho_corasick::packed::Searcher,
    /// When running an anchored search, the packed searcher can't handle it so
    /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
    /// searchers to support anchored search would be difficult at worst and
    /// annoying at best. Since packed searchers only apply to small numbers of
    /// literals, we content ourselves that this is not much of an added cost.
    /// (That packed searchers only work with a small number of literals is
    /// also why we use a DFA here. Otherwise, the memory usage of a DFA would
    /// likely be unacceptable.)
    #[cfg(feature = "perf-literal-multisubstring")]
    anchored_ac: aho_corasick::dfa::DFA,
    /// The length of the smallest literal we look for.
    ///
    /// We use this as a heuristic to figure out whether this will be "fast" or
    /// not. Generally, the longer the better, because longer needles are more
    /// discriminating and thus reduce false positive rate.
    #[cfg(feature = "perf-literal-multisubstring")]
    minimum_len: usize,
}

impl Teddy {
    pub(crate) fn new<B: AsRef<[u8]>>(
        kind: MatchKind,
        needles: &[B],
    ) -> Option<Teddy> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            None
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // We only really support leftmost-first semantics. In
            // theory we could at least support leftmost-longest, as the
            // aho-corasick crate does, but regex-automata doesn't know about
            // leftmost-longest currently.
            //
            // And like the aho-corasick prefilter, if we're using `All`
            // semantics, then we can still use leftmost semantics for a
            // prefilter. (This might be a suspicious choice for the literal
            // engine, which uses a prefilter as a regex engine directly, but
            // that only happens when using leftmost-first semantics.)
            let (packed_match_kind, ac_match_kind) = match kind {
                MatchKind::LeftmostFirst | MatchKind::All => (
                    aho_corasick::packed::MatchKind::LeftmostFirst,
                    aho_corasick::MatchKind::LeftmostFirst,
                ),
            };
            let minimum_len =
                needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
            let packed = aho_corasick::packed::Config::new()
                .match_kind(packed_match_kind)
                .builder()
                .extend(needles)
                .build()?;
            let anchored_ac = aho_corasick::dfa::DFA::builder()
                .match_kind(ac_match_kind)
                .start_kind(aho_corasick::StartKind::Anchored)
                .prefilter(false)
                .build(needles)
                .ok()?;
            Some(Teddy { searcher: packed, anchored_ac, minimum_len })
        }
    }
}

impl PrefilterI for Teddy {
    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let ac_span =
                aho_corasick::Span { start: span.start, end: span.end };
            self.searcher
                .find_in(haystack, ac_span)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            use aho_corasick::automaton::Automaton;
            let input = aho_corasick::Input::new(haystack)
                .anchored(aho_corasick::Anchored::Yes)
                .span(span.start..span.end);
            self.anchored_ac
                .try_find(&input)
                // OK because we build the DFA with anchored support.
                .expect("aho-corasick DFA should never fail")
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn memory_usage(&self) -> usize {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            use aho_corasick::automaton::Automaton;
            self.searcher.memory_usage() + self.anchored_ac.memory_usage()
        }
    }

    fn is_fast(&self) -> bool {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // Teddy is usually quite fast, but I have seen some cases where
            // a large number of literals can overwhelm it and make it not so
            // fast. We make an educated but conservative guess at a limit, at
            // which point, we're not so comfortable thinking Teddy is "fast."
            //
            // Well... this used to incorporate a "limit" on the *number*
            // of literals, but I have since changed it to a minimum on the
            // *smallest* literal. Namely, when there is a very small literal
            // (1 or 2 bytes), it is far more likely that it leads to a higher
            // false positive rate. (Although, of course, not always. For
            // example, 'zq' is likely to have a very low false positive rate.)
            // But when we have 3 bytes, we have a really good chance of being
            // quite discriminatory and thus fast.
            //
            // We may still want to add some kind of limit on the number of
            // literals here, but keep in mind that Teddy already has its own
            // somewhat small limit (64 at time of writing). The main issue
            // here is that if 'is_fast' is false, it opens the door for the
            // reverse inner optimization to kick in. We really only want to
            // resort to the reverse inner optimization if we absolutely must.
            self.minimum_len >= 3
        }
    }
}