vendor/regex-automata/src/util/prefilter.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281

use crate::Match;

/// A candidate is the result of running a prefilter on a haystack at a
/// particular position. The result is one of no match, a confirmed match or
/// a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
    /// The prefilter reports that no match is possible. Prefilter
    /// implementations will never report false negatives.
    None,
    /// The prefilter reports that a match has been confirmed at the provided
    /// byte offsets. When this variant is reported, the prefilter is
    /// guaranteeing a match. No false positives are permitted.
    Match(Match),
    /// The prefilter reports that a match *may* start at the given position.
    /// When this variant is reported, it may correspond to a false positive.
    PossibleStartOfMatch(usize),
}

impl Candidate {
    /// Convert this candidate into an option. This is useful when callers do
    /// not distinguish between true positives and false positives (i.e., the
    /// caller must always confirm the match in order to update some other
    /// state).
    ///
    /// The byte offset in the option returned corresponds to the starting
    /// position of the possible match.
    pub fn into_option(self) -> Option<usize> {
        match self {
            Candidate::None => None,
            Candidate::Match(ref m) => Some(m.start()),
            Candidate::PossibleStartOfMatch(start) => Some(start),
        }
    }
}

/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
pub trait Prefilter: core::fmt::Debug {
    /// Returns the next possible match candidate. This may yield false
    /// positives, so callers must confirm a match starting at the position
    /// returned. This, however, must never produce false negatives. That is,
    /// this must, at minimum, return the starting position of the next match
    /// in the given haystack after or at the given position.
    fn next_candidate(
        &self,
        state: &mut State,
        haystack: &[u8],
        at: usize,
    ) -> Candidate;

    /// Returns the approximate total amount of heap used by this prefilter, in
    /// units of bytes.
    fn heap_bytes(&self) -> usize;

    /// Returns true if and only if this prefilter may return false positives
    /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
    /// when false positives are not posssible (in which case, implementations
    /// should return false), which may allow completely avoiding heavier regex
    /// machinery when the prefilter can quickly confirm its own matches.
    ///
    /// By default, this returns true, which is conservative; it is always
    /// correct to return `true`. Returning `false` here and reporting a false
    /// positive will result in incorrect searches.
    fn reports_false_positives(&self) -> bool {
        true
    }
}

impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
    #[inline]
    fn next_candidate(
        &self,
        state: &mut State,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        (**self).next_candidate(state, haystack, at)
    }

    fn heap_bytes(&self) -> usize {
        (**self).heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        (**self).reports_false_positives()
    }
}

#[derive(Clone)]
pub struct Scanner<'p> {
    prefilter: &'p dyn Prefilter,
    state: State,
}

impl<'p> Scanner<'p> {
    pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
        Scanner { prefilter, state: State::new() }
    }

    pub(crate) fn is_effective(&mut self, at: usize) -> bool {
        self.state.is_effective(at)
    }

    pub(crate) fn reports_false_positives(&self) -> bool {
        self.prefilter.reports_false_positives()
    }

    pub(crate) fn next_candidate(
        &mut self,
        bytes: &[u8],
        at: usize,
    ) -> Candidate {
        let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
        match cand {
            Candidate::None => {
                self.state.update_skipped_bytes(bytes.len() - at);
            }
            Candidate::Match(ref m) => {
                self.state.update_skipped_bytes(m.start() - at);
            }
            Candidate::PossibleStartOfMatch(i) => {
                self.state.update_skipped_bytes(i - at);
            }
        }
        cand
    }
}

impl<'p> core::fmt::Debug for Scanner<'p> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_struct("Scanner").field("state", &self.state).finish()
    }
}

/// State tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator via, e.g., `find_iter`, is treated as a single search.)
#[derive(Clone, Debug)]
pub struct State {
    /// The number of skips that has been executed.
    skips: usize,
    /// The total number of bytes that have been skipped.
    skipped: usize,
    /// Once this heuristic has been deemed permanently ineffective, it will be
    /// inert throughout the rest of its lifetime. This serves as a cheap way
    /// to check inertness.
    inert: bool,
    /// The last (absolute) position at which a prefilter scanned to.
    /// Prefilters can use this position to determine whether to re-scan or
    /// not.
    ///
    /// Unlike other things that impact effectiveness, this is a fleeting
    /// condition. That is, a prefilter can be considered ineffective if it is
    /// at a position before `last_scan_at`, but can become effective again
    /// once the search moves past `last_scan_at`.
    ///
    /// The utility of this is to both avoid additional overhead from calling
    /// the prefilter and to avoid quadratic behavior. This ensures that a
    /// prefilter will scan any particular byte at most once. (Note that some
    /// prefilters, like the start-byte prefilter, do not need to use this
    /// field at all, since it only looks for starting bytes.)
    last_scan_at: usize,
}

impl State {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: usize = 40;

    /// The minimum amount of bytes that skipping must average.
    ///
    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
    /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
    /// rendered inert.
    const MIN_AVG_SKIP: usize = 16;

    /// Create a fresh prefilter state.
    pub fn new() -> State {
        State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
    }

    /// Updates the position at which the last scan stopped. This may be
    /// greater than the position of the last candidate reported. For example,
    /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
    /// report a candidate at position `0`, but the end of its last scan will
    /// be at position `3`.
    ///
    /// This position factors into the effectiveness of this prefilter. If the
    /// current position is less than the last position at which a scan ended,
    /// then the prefilter should not be re-run until the search moves past
    /// that position.
    ///
    /// It is always correct to never update the last scan position. In fact,
    /// it is also always correct to set the last scan position to an arbitrary
    /// value. The key is setting it to a position in the future at which it
    /// makes sense to restart the prefilter.
    pub fn update_last_scan(&mut self, at: usize) {
        if at > self.last_scan_at {
            self.last_scan_at = at;
        }
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective. If the prefilter is not effective, then this state
    /// is rendered "inert." At which point, all subsequent calls to
    /// `is_effective` on this state will return `false`.
    ///
    /// `at` should correspond to the current starting position of the search.
    ///
    /// Callers typically do not need to use this, as it represents the
    /// default implementation of
    /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective).
    fn is_effective(&mut self, at: usize) -> bool {
        if self.inert {
            return false;
        }
        if at < self.last_scan_at {
            return false;
        }
        if self.skips < State::MIN_SKIPS {
            return true;
        }

        if self.skipped >= State::MIN_AVG_SKIP * self.skips {
            return true;
        }

        // We're inert.
        self.inert = true;
        false
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    fn update_skipped_bytes(&mut self, skipped: usize) {
        self.skips += 1;
        self.skipped += skipped;
    }
}

/// A `Prefilter` implementation that reports a possible match at every
/// position.
///
/// This should generally not be used as an actual prefilter. It is only
/// useful when one needs to represent the absence of a prefilter in a generic
/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
/// uses this prefilter by default to indicate that no prefilter should be
/// used.
///
/// A `None` prefilter value cannot be constructed.
#[derive(Clone, Debug)]
pub struct None {
    _priv: (),
}

impl Prefilter for None {
    fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
        Candidate::PossibleStartOfMatch(at)
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}