summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs
blob: 50cce827ee2822a140306c5f6b6f79948f7b7d67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
use crate::util::{
    prefilter::PrefilterI,
    search::{MatchKind, Span},
};

#[derive(Clone, Debug)]
pub(crate) struct AhoCorasick {
    #[cfg(not(feature = "perf-literal-multisubstring"))]
    _unused: (),
    #[cfg(feature = "perf-literal-multisubstring")]
    ac: aho_corasick::AhoCorasick,
}

impl AhoCorasick {
    pub(crate) fn new<B: AsRef<[u8]>>(
        kind: MatchKind,
        needles: &[B],
    ) -> Option<AhoCorasick> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            None
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // We used to use `aho_corasick::MatchKind::Standard` here when
            // `kind` was `MatchKind::All`, but this is not correct. The
            // "standard" Aho-Corasick match semantics are to report a match
            // immediately as soon as it is seen, but `All` isn't like that.
            // In particular, with "standard" semantics, given the needles
            // "abc" and "b" and the haystack "abc," it would report a match
            // at offset 1 before a match at offset 0. This is never what we
            // want in the context of the regex engine, regardless of whether
            // we have leftmost-first or 'all' semantics. Namely, we always
            // want the leftmost match.
            let ac_match_kind = match kind {
                MatchKind::LeftmostFirst | MatchKind::All => {
                    aho_corasick::MatchKind::LeftmostFirst
                }
            };
            // This is kind of just an arbitrary number, but basically, if we
            // have a small enough set of literals, then we try to use the VERY
            // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
            // upshot is that the NFA is quite lean and decently fast. Faster
            // than a naive Aho-Corasick NFA anyway.
            let ac_kind = if needles.len() <= 500 {
                aho_corasick::AhoCorasickKind::DFA
            } else {
                aho_corasick::AhoCorasickKind::ContiguousNFA
            };
            let result = aho_corasick::AhoCorasick::builder()
                .kind(Some(ac_kind))
                .match_kind(ac_match_kind)
                .start_kind(aho_corasick::StartKind::Both)
                // We try to handle all of the prefilter cases in the super
                // module, and only use Aho-Corasick for the actual automaton.
                // The aho-corasick crate does have some extra prefilters,
                // namely, looking for rare bytes to feed to memchr{,2,3}
                // instead of just the first byte. If we end up wanting
                // those---and they are somewhat tricky to implement---then
                // we could port them to this crate.
                //
                // The main reason for doing things this way is so we have a
                // complete and easy to understand picture of which prefilters
                // are available and how they work. Otherwise it seems too
                // easy to get into a situation where we have a prefilter
                // layered on top of prefilter, and that might have unintended
                // consequences.
                .prefilter(false)
                .build(needles);
            let ac = match result {
                Ok(ac) => ac,
                Err(_err) => {
                    debug!("aho-corasick prefilter failed to build: {}", _err);
                    return None;
                }
            };
            Some(AhoCorasick { ac })
        }
    }
}

impl PrefilterI for AhoCorasick {
    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let input =
                aho_corasick::Input::new(haystack).span(span.start..span.end);
            self.ac
                .find(input)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let input = aho_corasick::Input::new(haystack)
                .anchored(aho_corasick::Anchored::Yes)
                .span(span.start..span.end);
            self.ac
                .find(input)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn memory_usage(&self) -> usize {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            self.ac.memory_usage()
        }
    }

    fn is_fast(&self) -> bool {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // Aho-Corasick is never considered "fast" because it's never
            // going to be even close to an order of magnitude faster than the
            // regex engine itself (assuming a DFA is used). In fact, it is
            // usually slower. The magic of Aho-Corasick is that it can search
            // a *large* number of literals with a relatively small amount of
            // memory. The regex engines are far more wasteful.
            //
            // Aho-Corasick may be "fast" when the regex engine corresponds
            // to, say, the PikeVM. That happens when the lazy DFA couldn't be
            // built or used for some reason. But in these cases, the regex
            // itself is likely quite big and we're probably hosed no matter
            // what we do. (In this case, the best bet is for the caller to
            // increase some of the memory limits on the hybrid cache capacity
            // and hope that's enough.)
            false
        }
    }
}